From fd3e454366603c8e35b31c52195b1ea8798a8fff Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Wed, 15 Nov 2017 22:59:02 +0530 Subject: ACPI / NUMA: ia64: Parse all entries of SRAT memory affinity table In current implementation, SRAT Memory Affinity Structure table parsing is restricted to number of maximum memblocks allowed (NR_NODE_MEMBLKS). However NR_NODE_MEMBLKS is defined individually as per architecture requirements. Hence removing the restriction of SRAT Memory Affinity Structure parsing in ACPI driver code and let architecture code check for allowed memblocks count. This check is already there in the x86 code, so do the same on ia64. Signed-off-by: Ganapatrao Kulkarni Acked-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- arch/ia64/kernel/acpi.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 1d29b2f8726b..1dacbf5e9e09 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -504,6 +504,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return -1; + if (num_node_memblks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + /* record this node in proximity bitmap */ pxm_bit_set(pxm); -- cgit v1.2.3 From 57044031b0cb11325e1034394a4721484f9dc9fe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Nov 2017 02:16:55 +0100 Subject: ACPI / PM: Make it possible to ignore the system sleep blacklist The ACPI code supporting system transitions to sleep states uses an internal blacklist to apply special handling to some machines reported to behave incorrectly in some ways. However, some entries of that blacklist cover problematic as well as non-problematic systems, so give the users of the latter a chance to ignore the blacklist and run their systems in the default way by adding acpi_sleep=nobl to the kernel command line. For example, that allows the users of Dell XPS13 9360 systems not affected by the issue that caused the blacklist entry for this machine to be added by commit 71630b7a832f (ACPI / PM: Blacklist Low Power S0 Idle _DSM for Dell XPS13 9360) to use suspend-to-idle with the Low Power S0 Idle _DSM interface which in principle should be more energy-efficient than S3 on them. Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 5 ++++- arch/x86/kernel/acpi/sleep.c | 2 ++ drivers/acpi/sleep.c | 10 ++++++++++ include/linux/acpi.h | 1 + 4 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..b125690d5dbc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -223,7 +223,7 @@ acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, nonvs, sci_force_enable } + old_ordering, nonvs, sci_force_enable, nobl } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep @@ -239,6 +239,9 @@ sci_force_enable causes the kernel to set SCI_EN directly on resume from S1/S3 (which is against the ACPI spec, but some broken systems don't work without it). + nobl causes the internal blacklist of systems known to + behave incorrectly in some ways with respect to system + suspend and resume to be ignored (use wisely). acpi_use_timer_override [HW,ACPI] Use timer override. For some broken Nvidia NF5 boards diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7188aea91549..f1915b744052 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str) acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "nobl", 4) == 0) + acpi_sleep_no_blacklist(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 8082871b409a..15cd862a87c2 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -367,10 +367,20 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = { {}, }; +static bool ignore_blacklist; + +void __init acpi_sleep_no_blacklist(void) +{ + ignore_blacklist = true; +} + static void __init acpi_sleep_dmi_check(void) { int year; + if (ignore_blacklist) + return; + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year >= 2012) acpi_nvs_nosave_s3(); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dc1ebfeeb5ec..699655a9618b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -451,6 +451,7 @@ void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); void __init acpi_nvs_nosave_s3(void); +void __init acpi_sleep_no_blacklist(void); #endif /* CONFIG_PM_SLEEP */ struct acpi_osc_context { -- cgit v1.2.3 From bdfe4cebea11476d278b1b98dd0f7cdac8269d62 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Fri, 10 Nov 2017 17:26:54 +0800 Subject: arm64: allwinner: a64: add Ethernet PHY regulator for several boards On several A64 boards the Ethernet PHY is powered by the DC1SW regulator on the AXP803 PMIC. Add phy-handle property to these boards' emac node. Signed-off-by: Icenowy Zheng Acked-by: Corentin LABBE Tested-by: Corentin LABBE Signed-off-by: Maxime Ripard --- arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts | 1 + arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts | 1 + arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 1 + 3 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts index 45bdbfb96126..4a8d3f83a36e 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-bananapi-m64.dts @@ -75,6 +75,7 @@ pinctrl-0 = <&rgmii_pins>; phy-mode = "rgmii"; phy-handle = <&ext_rgmii_phy>; + phy-supply = <®_dc1sw>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts index 806442d3e846..604cdaedac38 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64.dts @@ -77,6 +77,7 @@ pinctrl-0 = <&rmii_pins>; phy-mode = "rmii"; phy-handle = <&ext_rmii_phy1>; + phy-supply = <®_dc1sw>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index 0eb2acedf8c3..a053a6ac5267 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -82,6 +82,7 @@ pinctrl-0 = <&rgmii_pins>; phy-mode = "rgmii"; phy-handle = <&ext_rgmii_phy>; + phy-supply = <®_dc1sw>; status = "okay"; }; -- cgit v1.2.3 From 329b4130bc5eb2a1b123a652b985dbdb08d6b9a8 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Thu, 23 Nov 2017 13:21:55 +0300 Subject: ARC: Fix detection of dual-issue enabled As per PRM bit #0 ("D") in EXEC_CTRL enables dual-issue if set to 0, otherwise if set to 1 all instructions are executed one at a time, i.e. dual-issue is disabled. Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 7ef7d9a8ff89..9d27331fe69a 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -199,7 +199,7 @@ static void read_arc_build_cfg_regs(void) unsigned int exec_ctrl; READ_BCR(AUX_EXEC_CTRL, exec_ctrl); - cpu->extn.dual_enb = exec_ctrl & 1; + cpu->extn.dual_enb = !(exec_ctrl & 1); /* dual issue always present for this core */ cpu->extn.dual = 1; -- cgit v1.2.3 From ecaaab5649781c5a0effdaf298a925063020500e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 28 Nov 2017 20:56:59 -0800 Subject: crypto: salsa20 - fix blkcipher_walk API usage When asked to encrypt or decrypt 0 bytes, both the generic and x86 implementations of Salsa20 crash in blkcipher_walk_done(), either when doing 'kfree(walk->buffer)' or 'free_page((unsigned long)walk->page)', because walk->buffer and walk->page have not been initialized. The bug is that Salsa20 is calling blkcipher_walk_done() even when nothing is in 'walk.nbytes'. But blkcipher_walk_done() is only meant to be called when a nonzero number of bytes have been provided. The broken code is part of an optimization that tries to make only one call to salsa20_encrypt_bytes() to process inputs that are not evenly divisible by 64 bytes. To fix the bug, just remove this "optimization" and use the blkcipher_walk API the same way all the other users do. Reproducer: #include #include #include int main() { int algfd, reqfd; struct sockaddr_alg addr = { .salg_type = "skcipher", .salg_name = "salsa20", }; char key[16] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (void *)&addr, sizeof(addr)); reqfd = accept(algfd, 0, 0); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)); read(reqfd, key, sizeof(key)); } Reported-by: syzbot Fixes: eb6f13eb9f81 ("[CRYPTO] salsa20_generic: Fix multi-page processing") Cc: # v2.6.25+ Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/x86/crypto/salsa20_glue.c | 7 ------- crypto/salsa20_generic.c | 7 ------- 2 files changed, 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 399a29d067d6..cb91a64a99e7 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.src.virt.addr, walk.dst.virt.addr, diff --git a/crypto/salsa20_generic.c b/crypto/salsa20_generic.c index f550b5d94630..d7da0eea5622 100644 --- a/crypto/salsa20_generic.c +++ b/crypto/salsa20_generic.c @@ -188,13 +188,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.dst.virt.addr, - walk.src.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.dst.virt.addr, walk.src.virt.addr, -- cgit v1.2.3 From 741f5afbba70ff3cddcc5bba2595d9a44fa722e5 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sat, 2 Dec 2017 17:36:45 +0100 Subject: ARM: dts: rockchip: add cpu0-regulator on rk3066a-marsboard The rk3066 also has operating points now, but without adjusting the cpu-regulator will break once higher voltages are needed for a specific frequency, so add the needed cpu0-regulator. Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk3066a-marsboard.dts | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/rk3066a-marsboard.dts b/arch/arm/boot/dts/rk3066a-marsboard.dts index c6d92c25df42..d23ee6d911ac 100644 --- a/arch/arm/boot/dts/rk3066a-marsboard.dts +++ b/arch/arm/boot/dts/rk3066a-marsboard.dts @@ -83,6 +83,10 @@ }; }; +&cpu0 { + cpu0-supply = <&vdd_arm>; +}; + &i2c1 { status = "okay"; clock-frequency = <400000>; -- cgit v1.2.3 From 912d7985f3cef1b901a4fd9fede549b919fe7ac3 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 9 Nov 2017 16:35:35 -0600 Subject: ARM: dts: rockchip: fix rk3288 iep-IOMMU interrupts property cells The interrupts property in the iep-IOMMU node for the rk3288 dts file has a spurious extra cell causing a dtc warning: Warning (interrupts_property): interrupts size is (16), expected multiple of 12 in /iommu@ff900800 Remove the extra cell. Signed-off-by: Rob Herring Signed-off-by: Heiko Stuebner --- arch/arm/boot/dts/rk3288.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index cd24894ee5c6..6102e4e7f35c 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -956,7 +956,7 @@ iep_mmu: iommu@ff900800 { compatible = "rockchip,iommu"; reg = <0x0 0xff900800 0x0 0x40>; - interrupts = ; + interrupts = ; interrupt-names = "iep_mmu"; #iommu-cells = <0>; status = "disabled"; -- cgit v1.2.3 From 3fa8c49f27c15df259b7b8f94eb126ae491893fd Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Dec 2017 18:36:10 +0100 Subject: arm64: dts: rockchip: fix trailing 0 in rk3328 tsadc interrupts Probably due to some copy-paste mistake, the tsadc of rk3328 ended up with a 0 as 4th element that shouldn't be there, as interrupts on the rk3328 only have multiples of 3, making dtc complain. So remove it. Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 41d61840fb99..2426da631938 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -514,7 +514,7 @@ tsadc: tsadc@ff250000 { compatible = "rockchip,rk3328-tsadc"; reg = <0x0 0xff250000 0x0 0x100>; - interrupts = ; + interrupts = ; assigned-clocks = <&cru SCLK_TSADC>; assigned-clock-rates = <50000>; clocks = <&cru SCLK_TSADC>, <&cru PCLK_TSADC>; -- cgit v1.2.3 From a4bd78ed215873a68869e41fd59543be8ca38e7f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Oct 2017 09:17:49 -0700 Subject: mn10300: READ_ONCE() now implies smp_read_barrier_depends() Given that READ_ONCE() now implies smp_read_barrier_depends(), there is no need for the open-coded smp_read_barrier_depends() in mn10300_serial_receive_interrupt() and mn10300_serial_poll_get_char(). This commit therefore removes them, but replaces them with comments calling out that carrying dependencies through non-pointers is quite dangerous. Compilers simply know too much about integers. Signed-off-by: Paul E. McKenney Cc: David Howells Cc: Mark Rutland Cc: --- arch/mn10300/kernel/mn10300-serial.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c index d7ef1232a82a..4994b570dfd9 100644 --- a/arch/mn10300/kernel/mn10300-serial.c +++ b/arch/mn10300/kernel/mn10300-serial.c @@ -550,7 +550,7 @@ try_again: return; } - smp_read_barrier_depends(); + /* READ_ONCE() enforces dependency, but dangerous through integer!!! */ ch = port->rx_buffer[ix++]; st = port->rx_buffer[ix++]; smp_mb(); @@ -1728,7 +1728,10 @@ static int mn10300_serial_poll_get_char(struct uart_port *_port) if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) return NO_POLL_CHAR; - smp_read_barrier_depends(); + /* + * READ_ONCE() enforces dependency, but dangerous + * through integer!!! + */ ch = port->rx_buffer[ix++]; st = port->rx_buffer[ix++]; smp_mb(); -- cgit v1.2.3 From bc53e3aa88e8240823c1c440e6bab3c3a5ba5f59 Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Mon, 27 Nov 2017 17:31:01 +0100 Subject: ARM: dts: at91: disable the nxp,se97b SMBUS timeout on the TSE-850 The I2C adapter driver is sometimes slow, causing the SCL line to be stuck low for more than the stipulated SMBUS timeout of 25-35 ms. This causes the client device to give up which in turn causes silent corruption of data. So, disable the SMBUS timeout in the client device. Signed-off-by: Peter Rosin Acked-by: Guenter Roeck Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/at91-tse850-3.dts | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/at91-tse850-3.dts b/arch/arm/boot/dts/at91-tse850-3.dts index 5f29010cdbd8..9b82cc8843e1 100644 --- a/arch/arm/boot/dts/at91-tse850-3.dts +++ b/arch/arm/boot/dts/at91-tse850-3.dts @@ -221,6 +221,7 @@ jc42@18 { compatible = "nxp,se97b", "jedec,jc-42.4-temp"; reg = <0x18>; + smbus-timeout-disable; }; dpot: mcp4651-104@28 { -- cgit v1.2.3 From e085ac7a6ddbd746966083c5e13aa290c3e9a253 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 4 Dec 2017 17:54:37 +0100 Subject: x86/MCE: Extend table to report action optional errors through CMCI too According to the Intel SDM Volume 3B (253669-063US, July 2017), action optional (SRAO) errors can be reported either via MCE or CMC: In cases when SRAO is signaled via CMCI the error signature is indicated via UC=1, PCC=0, S=0. Type(*1) UC EN PCC S AR Signaling --------------------------------------------------------------- UC 1 1 1 x x MCE SRAR 1 1 0 1 1 MCE SRAO 1 x(*2) 0 x(*2) 0 MCE/CMC UCNA 1 x 0 0 0 CMC CE 0 x x x x CMC NOTES: 1. SRAR, SRAO and UCNA errors are supported by the processor only when IA32_MCG_CAP[24] (MCG_SER_P) is set. 2. EN=1, S=1 when signaled via MCE. EN=x, S=0 when signaled via CMC. And there is a description in 15.6.2 UCR Error Reporting and Logging, for bit S: S (Signaling) flag, bit 56 - Indicates (when set) that a machine check exception was generated for the UCR error reported in this MC bank... When the S flag in the IA32_MCi_STATUS register is clear, this UCR error was not signaled via a machine check exception and instead was reported as a corrected machine check (CMC). So merge the two cases and just remove the S=0 check for SRAO in mce_severity(). [ Borislav: Massage commit message.] Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Tested-by: Chen Wei Cc: linux-edac Link: http://lkml.kernel.org/r/1511575548-41992-1-git-send-email-xiexiuqi@huawei.com --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a06e0b..5bbd06f38ff6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ static struct severity { #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ static struct severity { NOSER, BITCLR(MCI_STATUS_UC) ), + /* + * known AO MCACODs reported via MCE or CMC: + * + * SRAO could be signaled either via a machine check exception or + * CMCI with the corresponding bit S 1 or 0. So we don't need to + * check bit S for SRAO. + */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) -- cgit v1.2.3 From c8a4364c33ac7ed63278267b8f6d8c15810d5fd1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 4 Dec 2017 17:54:38 +0100 Subject: x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems The McaIntrCfg register (MSRC000_0410), previously known as CU_DEFER_ERR, is used on SMCA systems to set the LVT offset for the Threshold and Deferred error interrupts. This register was used on non-SMCA systems to also set the Deferred interrupt type in bits 2:1. However, these bits are reserved on SMCA systems. Only set MSRC000_0410[2:1] on non-SMCA systems. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20171120162646.5210-1-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 486f640b02ef..a38ab1fa53a2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -407,7 +407,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) (deferred_error_int_vector != amd_deferred_error_interrupt)) deferred_error_int_vector = amd_deferred_error_interrupt; - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + if (!mce_flags.smca) + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); } -- cgit v1.2.3 From e17e237cd69f9f6ecaa0e875f889ad401a625148 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 4 Dec 2017 16:44:01 +0800 Subject: ARM: dts: sunxi: Convert to CCU index macros for HDMI controller When the HDMI controller device node was added, the needed PLL clock macros were not exported. A separate patch addresses that, but it is merged through a different tree. Now that both patches are in mainline proper, we can convert the raw numbers to proper macros. Signed-off-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard --- arch/arm/boot/dts/sun4i-a10.dtsi | 4 ++-- arch/arm/boot/dts/sun5i-a10s.dtsi | 4 ++-- arch/arm/boot/dts/sun6i-a31.dtsi | 4 ++-- arch/arm/boot/dts/sun7i-a20.dtsi | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index b91300d49a31..5840f5c75c3b 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -502,8 +502,8 @@ reg = <0x01c16000 0x1000>; interrupts = <58>; clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>, - <&ccu 9>, - <&ccu 18>; + <&ccu CLK_PLL_VIDEO0_2X>, + <&ccu CLK_PLL_VIDEO1_2X>; clock-names = "ahb", "mod", "pll-0", "pll-1"; dmas = <&dma SUN4I_DMA_NORMAL 16>, <&dma SUN4I_DMA_NORMAL 16>, diff --git a/arch/arm/boot/dts/sun5i-a10s.dtsi b/arch/arm/boot/dts/sun5i-a10s.dtsi index 6ae4d95e230e..316cb8b2945b 100644 --- a/arch/arm/boot/dts/sun5i-a10s.dtsi +++ b/arch/arm/boot/dts/sun5i-a10s.dtsi @@ -82,8 +82,8 @@ reg = <0x01c16000 0x1000>; interrupts = <58>; clocks = <&ccu CLK_AHB_HDMI>, <&ccu CLK_HDMI>, - <&ccu 9>, - <&ccu 16>; + <&ccu CLK_PLL_VIDEO0_2X>, + <&ccu CLK_PLL_VIDEO1_2X>; clock-names = "ahb", "mod", "pll-0", "pll-1"; dmas = <&dma SUN4I_DMA_NORMAL 16>, <&dma SUN4I_DMA_NORMAL 16>, diff --git a/arch/arm/boot/dts/sun6i-a31.dtsi b/arch/arm/boot/dts/sun6i-a31.dtsi index 8bfa12b548e0..72d3fe44ecaf 100644 --- a/arch/arm/boot/dts/sun6i-a31.dtsi +++ b/arch/arm/boot/dts/sun6i-a31.dtsi @@ -429,8 +429,8 @@ interrupts = ; clocks = <&ccu CLK_AHB1_HDMI>, <&ccu CLK_HDMI>, <&ccu CLK_HDMI_DDC>, - <&ccu 7>, - <&ccu 13>; + <&ccu CLK_PLL_VIDEO0_2X>, + <&ccu CLK_PLL_VIDEO1_2X>; clock-names = "ahb", "mod", "ddc", "pll-0", "pll-1"; resets = <&ccu RST_AHB1_HDMI>; reset-names = "ahb"; diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 68dfa82544fc..59655e42e4b0 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -581,8 +581,8 @@ reg = <0x01c16000 0x1000>; interrupts = ; clocks = <&ccu CLK_AHB_HDMI0>, <&ccu CLK_HDMI>, - <&ccu 9>, - <&ccu 18>; + <&ccu CLK_PLL_VIDEO0_2X>, + <&ccu CLK_PLL_VIDEO1_2X>; clock-names = "ahb", "mod", "pll-0", "pll-1"; dmas = <&dma SUN4I_DMA_NORMAL 16>, <&dma SUN4I_DMA_NORMAL 16>, -- cgit v1.2.3 From 7d556bfc49adddf2beb0d16c91945c3b8b783282 Mon Sep 17 00:00:00 2001 From: Jagan Teki Date: Mon, 4 Dec 2017 10:23:07 +0530 Subject: arm64: allwinner: a64-sopine: Fix to use dcdc1 regulator instead of vcc3v3 Since current tree support AXP803 regulators, replace fixed regulator vcc3v3 with AXP803 dcdc1 regulator where ever it need to replace. Tested mmc0 on sopine baseboard. Signed-off-by: Jagan Teki Signed-off-by: Maxime Ripard --- arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts | 2 +- arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts index a053a6ac5267..abe179de35d7 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine-baseboard.dts @@ -96,7 +96,7 @@ &mmc2 { pinctrl-names = "default"; pinctrl-0 = <&mmc2_pins>; - vmmc-supply = <®_vcc3v3>; + vmmc-supply = <®_dcdc1>; vqmmc-supply = <®_vcc1v8>; bus-width = <8>; non-removable; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi index a5da18a6f286..43418bd881d8 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-sopine.dtsi @@ -45,19 +45,10 @@ #include "sun50i-a64.dtsi" -/ { - reg_vcc3v3: vcc3v3 { - compatible = "regulator-fixed"; - regulator-name = "vcc3v3"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - }; -}; - &mmc0 { pinctrl-names = "default"; pinctrl-0 = <&mmc0_pins>; - vmmc-supply = <®_vcc3v3>; + vmmc-supply = <®_dcdc1>; non-removable; disable-wp; bus-width = <4>; -- cgit v1.2.3 From f88e9301948173dd35afad4a6939092c7f269aed Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Fri, 3 Nov 2017 22:58:54 +0300 Subject: arm64: dts: orange-pi-zero-plus2: fix sdcard detect The sdcard detect pin on orange-pi-zero-plus2 is pulled up. Fix cd-gpio description to enable sdcard detect. Signed-off-by: Sergey Matyukevich Signed-off-by: Maxime Ripard --- arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts index b6b7a561df8c..a42fd79a62a3 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5-orangepi-zero-plus2.dts @@ -71,7 +71,7 @@ pinctrl-0 = <&mmc0_pins_a>, <&mmc0_cd_pin>; vmmc-supply = <®_vcc3v3>; bus-width = <4>; - cd-gpios = <&pio 5 6 GPIO_ACTIVE_HIGH>; + cd-gpios = <&pio 5 6 GPIO_ACTIVE_LOW>; status = "okay"; }; -- cgit v1.2.3 From c343bade301dfe608e86b034cbabed3c0d5a50f5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 4 Dec 2017 13:08:47 -0300 Subject: x86/asm: Allow again using asm.h when building for the 'bpf' clang target Up to f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") we were able to use x86 headers to build to the 'bpf' clang target, as done by the BPF code in tools/perf/. With that commit, we ended up with following failure for 'perf test LLVM', this is because "clang ... -target bpf ..." fails since 4.0 does not have bpf inline asm support and 6.0 does not recognize the register 'esp', fix it by guarding that part with an #ifndef __BPF__, that is defined by clang when building to the "bpf" target. # perf test -v LLVM 37: LLVM search and compile : 37.1: Basic BPF llvm compile : --- start --- test child forked, pid 25526 Kernel build dir is set to /lib/modules/4.14.0+/build set env: KBUILD_DIR=/lib/modules/4.14.0+/build unset env: KBUILD_OPTS include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: NR_CPUS=4 set env: LINUX_VERSION_CODE=0x40e00 set env: CLANG_EXEC=/usr/local/bin/clang set env: CLANG_OPTIONS=-xc set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: WORKING_DIR=/lib/modules/4.14.0+/build set env: CLANG_SOURCE=- llvm compiling command template: echo '/* * bpf-script-example.c * Test basic LLVM building */ #ifndef LINUX_VERSION_CODE # error Need LINUX_VERSION_CODE # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' #endif #define BPF_ANY 0 #define BPF_MAP_TYPE_ARRAY 2 #define BPF_FUNC_map_lookup_elem 1 #define BPF_FUNC_map_update_elem 2 static void *(*bpf_map_lookup_elem)(void *map, void *key) = (void *) BPF_FUNC_map_lookup_elem; static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) = (void *) BPF_FUNC_map_update_elem; struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def SEC("maps") flip_table = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; SEC("func=SyS_epoll_wait") int bpf_func__SyS_epoll_wait(void *ctx) { int ind =0; int *flag = bpf_map_lookup_elem(&flip_table, &ind); int new_flag; if (!flag) return 0; /* flip flag and store back */ new_flag = !*flag; bpf_map_update_elem(&flip_table, &ind, &new_flag, BPF_ANY); return new_flag; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o - test child finished with 0 ---- end ---- LLVM search and compile subtest 0: Ok 37.2: kbuild searching : --- start --- test child forked, pid 25950 Kernel build dir is set to /lib/modules/4.14.0+/build set env: KBUILD_DIR=/lib/modules/4.14.0+/build unset env: KBUILD_OPTS include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: NR_CPUS=4 set env: LINUX_VERSION_CODE=0x40e00 set env: CLANG_EXEC=/usr/local/bin/clang set env: CLANG_OPTIONS=-xc set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: WORKING_DIR=/lib/modules/4.14.0+/build set env: CLANG_SOURCE=- llvm compiling command template: echo '/* * bpf-script-test-kbuild.c * Test include from kernel header */ #ifndef LINUX_VERSION_CODE # error Need LINUX_VERSION_CODE # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' #endif #define SEC(NAME) __attribute__((section(NAME), used)) #include #include SEC("func=vfs_llseek") int bpf_func__vfs_llseek(void *ctx) { return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o - In file included from :12: In file included from /home/acme/git/linux/arch/x86/include/uapi/asm/ptrace.h:5: In file included from /home/acme/git/linux/include/linux/compiler.h:242: In file included from /home/acme/git/linux/arch/x86/include/asm/barrier.h:5: In file included from /home/acme/git/linux/arch/x86/include/asm/alternative.h:10: /home/acme/git/linux/arch/x86/include/asm/asm.h:145:50: error: unknown register name 'esp' in asm register unsigned long current_stack_pointer asm(_ASM_SP); ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:44:18: note: expanded from macro '_ASM_SP' #define _ASM_SP __ASM_REG(sp) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:27:32: note: expanded from macro '__ASM_REG' #define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:18:29: note: expanded from macro '__ASM_SEL_RAW' # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:11:32: note: expanded from macro '__ASM_FORM_RAW' # define __ASM_FORM_RAW(x) #x ^ :4:1: note: expanded from here "esp" ^ 1 error generated. ERROR: unable to compile - Hint: Check error message shown above. Hint: You can also pre-compile it into .o using: clang -target bpf -O2 -c - with proper -I and -D options. Failed to compile test case: 'kbuild searching' test child finished with -1 ---- end ---- LLVM search and compile subtest 1: FAILED! Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Daniel Borkmann Cc: David Ahern Cc: Dmitriy Vyukov Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wang Nan Cc: Yonghong Song Link: https://lkml.kernel.org/r/20171128175948.GL3298@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/asm/asm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..386a6900e206 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,6 +136,7 @@ #endif #ifndef __ASSEMBLY__ +#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -145,5 +146,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#endif #endif /* _ASM_X86_ASM_H */ -- cgit v1.2.3 From 87eba0716011e528f7841026f2cc65683219d0ad Mon Sep 17 00:00:00 2001 From: Klaus Goger Date: Tue, 5 Dec 2017 08:11:58 +0100 Subject: arm64: dts: rockchip: remove vdd_log from rk3399-puma vdd_log has no consumer and therefore will not be set to a specific voltage. Still the PWM output pin gets configured and thence the vdd_log output voltage will changed from it's default. Depending on the idle state of the PWM this will slightly over or undervoltage the logic supply of the RK3399 and cause instability with GbE (undervoltage) and PCIe (overvoltage). Since the default value set by a voltage divider is the correct supply voltage and we don't need to change it during runtime we remove the rail from the devicetree completely so the PWM pin will not be configured. Signed-off-by: Klaus Goger Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi index 910628d18add..1fc5060d7027 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma.dtsi @@ -155,17 +155,6 @@ regulator-min-microvolt = <5000000>; regulator-max-microvolt = <5000000>; }; - - vdd_log: vdd-log { - compatible = "pwm-regulator"; - pwms = <&pwm2 0 25000 0>; - regulator-name = "vdd_log"; - regulator-min-microvolt = <800000>; - regulator-max-microvolt = <1400000>; - regulator-always-on; - regulator-boot-on; - status = "okay"; - }; }; &cpu_b0 { -- cgit v1.2.3 From bc631943faba6fc3f755748091ada31798fb7d50 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Wed, 6 Dec 2017 01:10:05 +0100 Subject: arm64: dts: rockchip: limit rk3328-rock64 gmac speed to 100MBit for now It looks like either the current kernel or the hardware has reliability issues when the gmac is actually running at 1GBit. In my test-case it is not able to boot on a nfsroot at this speed, as the system will always lose the connection to the nfs-server during boot, before reaching any login prompt and not recover from this. So until this is solved, limit the speed to 100MBit as with this the nfsroot survives stress tests like an apt-get upgrade without problems. Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328-rock64.dts | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts index d4f80786e7c2..3890468678ce 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts @@ -132,6 +132,8 @@ assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>; assigned-clock-parents = <&gmac_clkin>, <&gmac_clkin>; clock_in_out = "input"; + /* shows instability at 1GBit right now */ + max-speed = <100>; phy-supply = <&vcc_io>; phy-mode = "rgmii"; pinctrl-names = "default"; -- cgit v1.2.3 From 3073774e638ef18d222465fe92bfc8fccb90d288 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:41 -0500 Subject: KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt Currently the kvm_resize_hpt structure has two fields relevant to the state of an ongoing resize: 'prepare_done', which indicates whether the worker thread has completed or not, and 'error' which indicates whether it was successful or not. Since the success/failure isn't known until completion, this is confusingly redundant. This patch consolidates the information into just the 'error' value: -EBUSY indicates the worked is still in progress, other negative values indicate (completed) failure, 0 indicates successful completion. As a bonus this reduces size of struct kvm_resize_hpt by __alignof__(struct kvm_hpt_info) and saves few bytes of code. While there correct comment in struct kvm_resize_hpt which references a non-existent semaphore (leftover from an early draft). Assert with WARN_ON() in case of HPT allocation thread work runs more than once for resize request or resize_hpt_allocate() returns -EBUSY that is treated specially. Change comparison against zero to make checkpatch.pl happy. Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Serhii Popovych [dwg: Changed BUG_ON()s to WARN_ON()s and altered commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 44 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 966097232d21..f5f2c6bf5856 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -65,11 +65,17 @@ struct kvm_resize_hpt { u32 order; /* These fields protected by kvm->lock */ + + /* Possible values and their usage: + * <0 an error occurred during allocation, + * -EBUSY allocation is in the progress, + * 0 allocation made successfuly. + */ int error; - bool prepare_done; - /* Private to the work thread, until prepare_done is true, - * then protected by kvm->resize_hpt_sem */ + /* Private to the work thread, until error != -EBUSY, + * then protected by kvm->lock. + */ struct kvm_hpt_info hpt; }; @@ -1433,15 +1439,23 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm *kvm = resize->kvm; int err; + if (WARN_ON(resize->error != -EBUSY)) + return; + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", resize->order); err = resize_hpt_allocate(resize); + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + mutex_lock(&kvm->lock); resize->error = err; - resize->prepare_done = true; mutex_unlock(&kvm->lock); } @@ -1466,14 +1480,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, if (resize) { if (resize->order == shift) { - /* Suitable resize in progress */ - if (resize->prepare_done) { - ret = resize->error; - if (ret != 0) - resize_hpt_release(kvm, resize); - } else { + /* Suitable resize in progress? */ + ret = resize->error; + if (ret == -EBUSY) ret = 100; /* estimated time in ms */ - } + else if (ret) + resize_hpt_release(kvm, resize); goto out; } @@ -1493,6 +1505,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, ret = -ENOMEM; goto out; } + + resize->error = -EBUSY; resize->order = shift; resize->kvm = kvm; INIT_WORK(&resize->work, resize_hpt_prepare_work); @@ -1547,16 +1561,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, if (!resize || (resize->order != shift)) goto out; - ret = -EBUSY; - if (!resize->prepare_done) - goto out; - ret = resize->error; - if (ret != 0) + if (ret) goto out; ret = resize_hpt_rehash(resize); - if (ret != 0) + if (ret) goto out; resize_hpt_pivot(resize); -- cgit v1.2.3 From 4ed11aeefda439c76ddae3ceebcfa4fad111f149 Mon Sep 17 00:00:00 2001 From: Serhii Popovych Date: Mon, 4 Dec 2017 09:36:42 -0500 Subject: KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests When serving multiple resize requests following could happen: CPU0 CPU1 ---- ---- kvm_vm_ioctl_resize_hpt_prepare(1); -> schedule_work() /* system_rq might be busy: delay */ kvm_vm_ioctl_resize_hpt_prepare(2); mutex_lock(); if (resize) { ... release_hpt_resize(); } ... resize_hpt_prepare_work() -> schedule_work() { mutex_unlock() /* resize->kvm could be wrong */ struct kvm *kvm = resize->kvm; mutex_lock(&kvm->lock); <<<< UAF ... } i.e. a second resize request with different order could be started by kvm_vm_ioctl_resize_hpt_prepare(), causing the previous request to be free()d when there's still an active worker thread which will try to access it. This leads to a use after free in point marked with UAF on the diagram above. To prevent this from happening, instead of unconditionally releasing a pre-existing resize structure from the prepare ioctl(), we check if the existing structure has an in-progress worker. We do that by checking if the resize->error == -EBUSY, which is safe because the resize->error field is protected by the kvm->lock. If there is an active worker, instead of releasing, we mark the structure as stale by unlinking it from kvm_struct. In the worker thread we check for a stale structure (with kvm->lock held), and in that case abort, releasing the stale structure ourself. We make the check both before and the actual allocation. Strictly, only the check afterwards is needed, the check before is an optimization: if the structure happens to become stale before the worker thread is dispatched, rather than during the allocation, it means we can avoid allocating then immediately freeing a potentially substantial amount of memory. This fixes following or similar host kernel crash message: [ 635.277361] Unable to handle kernel paging request for data at address 0x00000000 [ 635.277438] Faulting instruction address: 0xc00000000052f568 [ 635.277446] Oops: Kernel access of bad area, sig: 11 [#1] [ 635.277451] SMP NR_CPUS=2048 NUMA PowerNV [ 635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T) ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T) tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod [ 635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G ------------ T 3.10.0.bz1510771+ #1 [ 635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv] [ 635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000 [ 635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0 [ 635.278988] REGS: c0000007e91837f0 TRAP: 0300 Tainted: G ------------ T (3.10.0.bz1510771+) [ 635.279077] MSR: 9000000100009033 CR: 24002022 XER: 00000000 [ 635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1 GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10 GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000 GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0 GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40 GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000 GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7 GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000 GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10 [ 635.280149] NIP [c00000000052f568] __list_add+0x38/0x110 [ 635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280253] Call Trace: [ 635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 [ 635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70 [ 635.280426] [c0000007e9183ba0] [d00000000d24da04] resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv] [ 635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680 [ 635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520 [ 635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100 [ 635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4 [ 635.280814] Instruction dump: [ 635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78 f8010010 [ 635.281099] f821ff81 e8a50008 7fa52040 40de00b8 7fbd2840 40de008c 7fbff040 [ 635.281324] ---[ end trace b628b73449719b9d ]--- Cc: stable@vger.kernel.org # v4.10+ Fixes: b5baa6877315 ("KVM: PPC: Book3S HV: KVM-HV HPT resizing implementation") Signed-off-by: Serhii Popovych [dwg: Replaced BUG_ON()s with WARN_ONs() and reworded commit message for clarity] Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 50 ++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f5f2c6bf5856..8355398f0bb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1419,16 +1419,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) { - BUG_ON(kvm->arch.resize_hpt != resize); + if (WARN_ON(!mutex_is_locked(&kvm->lock))) + return; if (!resize) return; - if (resize->hpt.virt) - kvmppc_free_hpt(&resize->hpt); + if (resize->error != -EBUSY) { + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + kfree(resize); + } - kvm->arch.resize_hpt = NULL; - kfree(resize); + if (kvm->arch.resize_hpt == resize) + kvm->arch.resize_hpt = NULL; } static void resize_hpt_prepare_work(struct work_struct *work) @@ -1437,26 +1441,42 @@ static void resize_hpt_prepare_work(struct work_struct *work) struct kvm_resize_hpt, work); struct kvm *kvm = resize->kvm; - int err; + int err = 0; if (WARN_ON(resize->error != -EBUSY)) return; - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", - resize->order); + mutex_lock(&kvm->lock); - err = resize_hpt_allocate(resize); + /* Request is still current? */ + if (kvm->arch.resize_hpt == resize) { + /* We may request large allocations here: + * do not sleep with kvm->lock held for a while. + */ + mutex_unlock(&kvm->lock); - /* We have strict assumption about -EBUSY - * when preparing for HPT resize. - */ - if (WARN_ON(err == -EBUSY)) - err = -EINPROGRESS; + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); - mutex_lock(&kvm->lock); + err = resize_hpt_allocate(resize); + + /* We have strict assumption about -EBUSY + * when preparing for HPT resize. + */ + if (WARN_ON(err == -EBUSY)) + err = -EINPROGRESS; + + mutex_lock(&kvm->lock); + /* It is possible that kvm->arch.resize_hpt != resize + * after we grab kvm->lock again. + */ + } resize->error = err; + if (kvm->arch.resize_hpt != resize) + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); } -- cgit v1.2.3 From 470195f82e4ea550b7c37736a12bf3fa565295ea Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 29 Nov 2017 15:12:27 +0100 Subject: x86/PCI: Fix infinite loop in search for 64bit BAR placement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Break the loop if we can't find some address space for a 64bit BAR. Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 1e996df687a3..5328e86f73eb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -696,8 +696,13 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->end = 0xfd00000000ull - 1; /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) + while ((conflict = request_resource_conflict(&iomem_resource, res))) { + if (conflict->end >= res->end) { + kfree(res); + return; + } res->start = conflict->end + 1; + } dev_info(&dev->dev, "adding root bus resource %pR\n", res); -- cgit v1.2.3 From a19e2696135efb471981c1ae1ec3cb2b70c41a2e Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 29 Nov 2017 15:12:28 +0100 Subject: x86/PCI: Only enable a 64bit BAR on single-socket AMD Family 15h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we have a multi-socket system, each CPU core needs the same setup. Since this is tricky to do in the fixup code, don't enable a 64bit BAR on multi-socket systems for now. Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 5328e86f73eb..e663d6bf1328 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -665,6 +665,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) unsigned i; u32 base, limit, high; struct resource *res, *conflict; + struct pci_dev *other; + + /* Check that we are the only device of that type */ + other = pci_get_device(dev->vendor, dev->device, NULL); + if (other != dev || + (other = pci_get_device(dev->vendor, dev->device, other))) { + /* This is a multi-socket system, don't touch it for now */ + pci_dev_put(other); + return; + } for (i = 0; i < 8; i++) { pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base); @@ -719,10 +729,10 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) pci_bus_add_resource(dev->bus, res, 0); } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); #endif -- cgit v1.2.3 From b638823a7bbd251d442042b0e9522100bdaa5b66 Mon Sep 17 00:00:00 2001 From: Alejandro Mery Date: Tue, 5 Dec 2017 12:34:56 +0000 Subject: ARM: davinci: Use platform_device_register_full() to create pdev for dm365's eDMA Convert the DM365 EDMA platform device creation to use struct platform_device_info XXXXXX __initconst and platform_device_register_full() This will allow us to specify the dma_mask for the device in an upcoming patch. Without this, EDMA on DM365 refuses to probe. Fixes: 7ab388e85faa ("ARM: davinci: Use platform_device_register_full() to create pdev for eDMA") Reviewed-by: Peter Ujfalusi Signed-off-by: Alejandro Mery Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/dm365.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 8be04ec95adf..9bd17bc77b5c 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -925,12 +925,13 @@ static struct resource edma_resources[] = { /* not using TC*_ERR */ }; -static struct platform_device dm365_edma_device = { - .name = "edma", - .id = 0, - .dev.platform_data = &dm365_edma_pdata, - .num_resources = ARRAY_SIZE(edma_resources), - .resource = edma_resources, +static const struct platform_device_info dm365_edma_device __initconst = { + .name = "edma", + .id = 0, + .res = edma_resources, + .num_res = ARRAY_SIZE(edma_resources), + .data = &dm365_edma_pdata, + .size_data = sizeof(dm365_edma_pdata), }; static struct resource dm365_asp_resources[] = { @@ -1428,13 +1429,18 @@ int __init dm365_init_video(struct vpfe_config *vpfe_cfg, static int __init dm365_init_devices(void) { + struct platform_device *edma_pdev; int ret = 0; if (!cpu_is_davinci_dm365()) return 0; davinci_cfg_reg(DM365_INT_EDMA_CC); - platform_device_register(&dm365_edma_device); + edma_pdev = platform_device_register_full(&dm365_edma_device); + if (IS_ERR(edma_pdev)) { + pr_warn("%s: Failed to register eDMA\n", __func__); + return PTR_ERR(edma_pdev); + } platform_device_register(&dm365_mdio_device); platform_device_register(&dm365_emac_device); -- cgit v1.2.3 From 621f96bcb49412010876a1e6e006f748b91d9e75 Mon Sep 17 00:00:00 2001 From: Alejandro Mery Date: Tue, 5 Dec 2017 12:34:57 +0000 Subject: ARM: davinci: Add dma_mask to dm365's eDMA device Add dma_mask to dm365's EDMA device. Without a valid dma_mask, EDMA on DM365 refuses to probe. Fixes: cef5b0da4019 ("ARM: davinci: Add dma_mask to eDMA devices") Reviewed-by: Peter Ujfalusi Signed-off-by: Alejandro Mery Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/dm365.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 9bd17bc77b5c..103316f01a22 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -928,6 +928,7 @@ static struct resource edma_resources[] = { static const struct platform_device_info dm365_edma_device __initconst = { .name = "edma", .id = 0, + .dma_mask = DMA_BIT_MASK(32), .res = edma_resources, .num_res = ARRAY_SIZE(edma_resources), .data = &dm365_edma_pdata, -- cgit v1.2.3 From c5a88cd2e1c508868922bafa0a5c3365986b98e5 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Sun, 3 Dec 2017 16:04:53 -0600 Subject: ARM: dts: da850-lego-ev3: Fix battery voltage gpio This fixes the battery voltage monitoring gpio-hog settings. When the gpio is low, it turns off the battery voltage to the ADC chip. However, this needs to be on all of the time so that we can monitor battery voltage. Also, there was a typo that prevented pinmuxing from working correctly. Signed-off-by: David Lechner Signed-off-by: Sekhar Nori --- arch/arm/boot/dts/da850-lego-ev3.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/da850-lego-ev3.dts b/arch/arm/boot/dts/da850-lego-ev3.dts index 413dbd5d9f64..81942ae83e1f 100644 --- a/arch/arm/boot/dts/da850-lego-ev3.dts +++ b/arch/arm/boot/dts/da850-lego-ev3.dts @@ -178,7 +178,7 @@ */ battery { pinctrl-names = "default"; - pintctrl-0 = <&battery_pins>; + pinctrl-0 = <&battery_pins>; compatible = "lego,ev3-battery"; io-channels = <&adc 4>, <&adc 3>; io-channel-names = "voltage", "current"; @@ -392,7 +392,7 @@ batt_volt_en { gpio-hog; gpios = <6 GPIO_ACTIVE_HIGH>; - output-low; + output-high; }; }; -- cgit v1.2.3 From 947134d9b00f342415af7eddd42a5fce7262a1b9 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 4 Dec 2017 11:45:21 -0500 Subject: x86/smpboot: Do not use smp_num_siblings in __max_logical_packages calculation Documentation/x86/topology.txt defines smp_num_siblings as "The number of threads in a core". Since commit bbb65d2d365e ("x86: use cpuid vector 0xb when available for detecting cpu topology") smp_num_siblings is the maximum number of threads in a core. If Simultaneous MultiThreading (SMT) is disabled on a system, smp_num_siblings is 2 and not 1 as expected. Use topology_max_smt_threads(), which contains the active numer of threads, in the __max_logical_packages calculation. On a single socket, single core, single thread system __max_smt_threads has not been updated when the __max_logical_packages calculation happens, so its zero which makes the package estimate fail. Initialize it to one, which is the minimum number of threads on a core. [ tglx: Folded the __max_smt_threads fix in ] Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate") Reported-by: Jakub Kicinski Signed-off-by: Prarit Bhargava Tested-by: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: "netdev@vger.kernel.org" Cc: Clark Williams Link: https://lkml.kernel.org/r/20171204164521.17870-1-prarit@redhat.com --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05a97d5fe298..35cb20994e32 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -106,7 +106,7 @@ EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1; /* Flag to indicate if a complete sched domain rebuild is required */ bool x86_topology_update; @@ -1304,7 +1304,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) * Today neither Intel nor AMD support heterogenous systems so * extrapolate the boot cpu's data to all packages. */ - ncpus = cpu_data(0).booted_cores * smp_num_siblings; + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); -- cgit v1.2.3 From 08529078d8d9adf689bf39cc38d53979a0869970 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2017 15:40:55 +0300 Subject: x86/boot/compressed/64: Detect and handle 5-level paging at boot-time Prerequisite for fixing the current problem of instantaneous reboots when a 5-level paging kernel is booted on 4-level paging hardware. At the same time this change prepares the decompression code to boot-time switching between 4- and 5-level paging. [ tglx: Folded the GCC < 5 fix. ] Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: linux-mm@kvack.org Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20171204124059.63515-2-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/head_64.S | 16 ++++++++++++---- arch/x86/boot/compressed/pgtable_64.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 arch/x86/boot/compressed/pgtable_64.c (limited to 'arch') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 1e9c322e973a..f25e1530e064 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-y += $(obj)/pgtable_64.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 20919b4f3133..fc313e29fe2c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -305,10 +305,18 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp #ifdef CONFIG_X86_5LEVEL - /* Check if 5-level paging has already enabled */ - movq %cr4, %rax - testl $X86_CR4_LA57, %eax - jnz lvl5 + /* + * Check if we need to enable 5-level paging. + * RSI holds real mode data and need to be preserved across + * a function call. + */ + pushq %rsi + call l5_paging_required + popq %rsi + + /* If l5_paging_required() returned zero, we're done here. */ + cmpq $0, %rax + je lvl5 /* * At this point we are in long mode with 4-level paging enabled, diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..b4469a37e9a1 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,28 @@ +#include + +/* + * __force_order is used by special_insns.h asm code to force instruction + * serialization. + * + * It is not referenced from the code, but GCC < 5 with -fPIE would fail + * due to an undefined symbol. Define it to make these ancient GCCs work. + */ +unsigned long __force_order; + +int l5_paging_required(void) +{ + /* Check if leaf 7 is supported. */ + + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return 0; + + /* Check if 5-level paging has already been enabled. */ + if (native_read_cr4() & X86_CR4_LA57) + return 0; + + return 1; +} -- cgit v1.2.3 From 6d7e0ba2d2be9e50cccba213baf07e0e183c1b24 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 4 Dec 2017 15:40:56 +0300 Subject: x86/boot/compressed/64: Print error if 5-level paging is not supported If the machine does not support the paging mode for which the kernel was compiled, the boot process cannot continue. It's not possible to let the kernel detect the mismatch as it does not even reach the point where cpu features can be evaluted due to a triple fault in the KASLR setup. Instead of instantaneous silent reboot, emit an error message which gives the user the information why the boot fails. Fixes: 77ef56e4f0fb ("x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y") Reported-by: Borislav Petkov Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Cc: Andi Kleen Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: linux-mm@kvack.org Cc: Cyrill Gorcunov Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20171204124059.63515-3-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/misc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..98761a1576ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,6 +169,16 @@ void __puthex(unsigned long value) } } +static bool l5_supported(void) +{ + /* Check if leaf 7 is supported. */ + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); +} + #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); + if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { + error("This linux kernel as configured requires 5-level paging\n" + "This CPU does not support the required 'cr4.la57' feature\n" + "Unable to boot - please use a kernel appropriate for your CPU\n"); + } + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; -- cgit v1.2.3 From 451df7d110b82998c04a80d0de0f1e79aaa7792a Mon Sep 17 00:00:00 2001 From: Alejandro Mery Date: Fri, 8 Dec 2017 10:35:58 +0000 Subject: ARM: davinci: fix mmc entries in dm365's dma_slave_map fix mmc entries in dm365's dma_slave_map to match the actual device names Fixes: 0c750e1fe481 ("ARM: davinci: dm365: Add dma_slave_map to edma") Signed-off-by: Alejandro Mery Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/dm365.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 103316f01a22..5ace9380626a 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -868,10 +868,10 @@ static const struct dma_slave_map dm365_edma_map[] = { { "spi_davinci.0", "rx", EDMA_FILTER_PARAM(0, 17) }, { "spi_davinci.3", "tx", EDMA_FILTER_PARAM(0, 18) }, { "spi_davinci.3", "rx", EDMA_FILTER_PARAM(0, 19) }, - { "dm6441-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) }, - { "dm6441-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) }, - { "dm6441-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) }, - { "dm6441-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) }, + { "da830-mmc.0", "rx", EDMA_FILTER_PARAM(0, 26) }, + { "da830-mmc.0", "tx", EDMA_FILTER_PARAM(0, 27) }, + { "da830-mmc.1", "rx", EDMA_FILTER_PARAM(0, 30) }, + { "da830-mmc.1", "tx", EDMA_FILTER_PARAM(0, 31) }, }; static struct edma_soc_info dm365_edma_pdata = { -- cgit v1.2.3 From 7bf5234db7cce45fa9ff237ce0f45da2bd277cad Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 28 Apr 2017 09:40:00 -0700 Subject: xtensa: add -mno-serialize-volatile to CFLAGS By default xtensa gcc inserts memw for all volatile object accesses. This is too pessimistic for the kernel: there should be no "normal" volatile objects, and all special objects, like MMIO or objects shared between CPUs should have explicit barriers. Signed-off-by: Max Filippov --- arch/xtensa/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index 7ee02fe4a63d..a206598b5d95 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -46,6 +46,7 @@ KBUILD_CFLAGS += -ffreestanding -D__linux__ KBUILD_CFLAGS += -pipe -mlongcalls KBUILD_CFLAGS += $(call cc-option,-mforce-no-pic,) +KBUILD_CFLAGS += $(call cc-option,-mno-serialize-volatile,) ifneq ($(CONFIG_LD_NO_RELAX),) LDFLAGS := --no-relax -- cgit v1.2.3 From f8f02ca73cd8d1e2ac61ea1e5f0574a8c1f472fa Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 3 Dec 2017 20:55:35 -0800 Subject: xtensa: build kernel with text-section-literals vmlinux.lds.S doesn't do anything special with literals, so instead of keeping them separate put them into the corresponding text sections. Drop explicit .literal sections from the vmlinux.lds.S, use standard section macros. Mark literal pool locations in the assembly sources. Unfortunately assembler doesn't put literals into .init sections and external libgcc may still have .literal sections, so sed transformation to the linker script is still needed. Signed-off-by: Max Filippov --- arch/xtensa/Makefile | 6 +-- arch/xtensa/boot/boot-redboot/bootstrap.S | 1 + arch/xtensa/kernel/Makefile | 3 -- arch/xtensa/kernel/align.S | 2 +- arch/xtensa/kernel/entry.S | 8 +++ arch/xtensa/kernel/setup.c | 16 +++--- arch/xtensa/kernel/vectors.S | 14 ++--- arch/xtensa/kernel/vmlinux.lds.S | 90 +++++++++---------------------- 8 files changed, 52 insertions(+), 88 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index a206598b5d95..3a934b72a272 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -42,12 +42,12 @@ export PLATFORM # temporarily until string.h is fixed KBUILD_CFLAGS += -ffreestanding -D__linux__ - -KBUILD_CFLAGS += -pipe -mlongcalls - +KBUILD_CFLAGS += -pipe -mlongcalls -mtext-section-literals KBUILD_CFLAGS += $(call cc-option,-mforce-no-pic,) KBUILD_CFLAGS += $(call cc-option,-mno-serialize-volatile,) +KBUILD_AFLAGS += -mlongcalls -mtext-section-literals + ifneq ($(CONFIG_LD_NO_RELAX),) LDFLAGS := --no-relax endif diff --git a/arch/xtensa/boot/boot-redboot/bootstrap.S b/arch/xtensa/boot/boot-redboot/bootstrap.S index bf7fabe6310d..bbf3b4b080cd 100644 --- a/arch/xtensa/boot/boot-redboot/bootstrap.S +++ b/arch/xtensa/boot/boot-redboot/bootstrap.S @@ -42,6 +42,7 @@ __start_a0: .align 4 .section .text, "ax" + .literal_position .begin literal_prefix .text /* put literals in here! */ diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index bb8d55775a97..91907590d183 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -17,9 +17,6 @@ obj-$(CONFIG_XTENSA_VARIANT_HAVE_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_S32C1I_SELFTEST) += s32c1i_selftest.o -AFLAGS_head.o += -mtext-section-literals -AFLAGS_mxhead.o += -mtext-section-literals - # In the Xtensa architecture, assembly generates literals which must always # precede the L32R instruction with a relative offset less than 256 kB. # Therefore, the .text and .literal section must be combined in parenthesis diff --git a/arch/xtensa/kernel/align.S b/arch/xtensa/kernel/align.S index 890004af03a9..24b3189d7841 100644 --- a/arch/xtensa/kernel/align.S +++ b/arch/xtensa/kernel/align.S @@ -155,7 +155,7 @@ * < VALID_DOUBLE_EXCEPTION_ADDRESS: regular exception */ - + .literal_position ENTRY(fast_unaligned) /* Note: We don't expect the address to be aligned on a word diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 37a239556889..5d5707831626 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -125,6 +125,7 @@ * * Note: _user_exception might be at an odd address. Don't use call0..call12 */ + .literal_position ENTRY(user_exception) @@ -777,6 +778,8 @@ ENDPROC(kernel_exception) * When we get here, a0 is trashed and saved to excsave[debuglevel] */ + .literal_position + ENTRY(debug_exception) rsr a0, SREG_EPS + XCHAL_DEBUGLEVEL @@ -916,6 +919,8 @@ ENDPROC(debug_exception) unrecoverable_text: .ascii "Unrecoverable error in exception handler\0" + .literal_position + ENTRY(unrecoverable_exception) movi a0, 1 @@ -1117,6 +1122,8 @@ ENDPROC(fast_syscall_unrecoverable) * j done */ + .literal_position + #ifdef CONFIG_FAST_SYSCALL_XTENSA #define TRY \ @@ -1887,6 +1894,7 @@ ENDPROC(fast_store_prohibited) * void system_call (struct pt_regs* regs, int exccause) * a2 a3 */ + .literal_position ENTRY(system_call) diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 08175df7a69e..253a0178f1bd 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -277,13 +277,13 @@ extern char _end[]; extern char _stext[]; extern char _WindowVectors_text_start; extern char _WindowVectors_text_end; -extern char _DebugInterruptVector_literal_start; +extern char _DebugInterruptVector_text_start; extern char _DebugInterruptVector_text_end; -extern char _KernelExceptionVector_literal_start; +extern char _KernelExceptionVector_text_start; extern char _KernelExceptionVector_text_end; -extern char _UserExceptionVector_literal_start; +extern char _UserExceptionVector_text_start; extern char _UserExceptionVector_text_end; -extern char _DoubleExceptionVector_literal_start; +extern char _DoubleExceptionVector_text_start; extern char _DoubleExceptionVector_text_end; #if XCHAL_EXCM_LEVEL >= 2 extern char _Level2InterruptVector_text_start; @@ -339,16 +339,16 @@ void __init setup_arch(char **cmdline_p) mem_reserve(__pa(&_WindowVectors_text_start), __pa(&_WindowVectors_text_end)); - mem_reserve(__pa(&_DebugInterruptVector_literal_start), + mem_reserve(__pa(&_DebugInterruptVector_text_start), __pa(&_DebugInterruptVector_text_end)); - mem_reserve(__pa(&_KernelExceptionVector_literal_start), + mem_reserve(__pa(&_KernelExceptionVector_text_start), __pa(&_KernelExceptionVector_text_end)); - mem_reserve(__pa(&_UserExceptionVector_literal_start), + mem_reserve(__pa(&_UserExceptionVector_text_start), __pa(&_UserExceptionVector_text_end)); - mem_reserve(__pa(&_DoubleExceptionVector_literal_start), + mem_reserve(__pa(&_DoubleExceptionVector_text_start), __pa(&_DoubleExceptionVector_text_end)); #if XCHAL_EXCM_LEVEL >= 2 diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S index 332e9d635fb6..2bc85051c680 100644 --- a/arch/xtensa/kernel/vectors.S +++ b/arch/xtensa/kernel/vectors.S @@ -205,9 +205,6 @@ ENDPROC(_KernelExceptionVector) */ .section .DoubleExceptionVector.text, "ax" - .begin literal_prefix .DoubleExceptionVector - .globl _DoubleExceptionVector_WindowUnderflow - .globl _DoubleExceptionVector_WindowOverflow ENTRY(_DoubleExceptionVector) @@ -217,8 +214,12 @@ ENTRY(_DoubleExceptionVector) /* Check for kernel double exception (usually fatal). */ rsr a2, ps - _bbci.l a2, PS_UM_BIT, .Lksp + _bbsi.l a2, PS_UM_BIT, 1f + j .Lksp + .align 4 + .literal_position +1: /* Check if we are currently handling a window exception. */ /* Note: We don't need to indicate that we enter a critical section. */ @@ -475,11 +476,8 @@ _DoubleExceptionVector_handle_exception: rotw -3 j 1b - ENDPROC(_DoubleExceptionVector) - .end literal_prefix - .text /* * Fixup handler for TLB miss in double exception handler for window owerflow. @@ -508,6 +506,8 @@ ENDPROC(_DoubleExceptionVector) * a3: exctable, original value in excsave1 */ + .literal_position + ENTRY(window_overflow_restore_a0_fixup) rsr a0, ps diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 162c77e53ca8..70b731edc7b8 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -45,24 +45,16 @@ jiffies = jiffies_64; LONG(sym ## _end); \ LONG(LOADADDR(section)) -/* Macro to define a section for a vector. - * - * Use of the MIN function catches the types of errors illustrated in - * the following example: - * - * Assume the section .DoubleExceptionVector.literal is completely - * full. Then a programmer adds code to .DoubleExceptionVector.text - * that produces another literal. The final literal position will - * overlay onto the first word of the adjacent code section - * .DoubleExceptionVector.text. (In practice, the literals will - * overwrite the code, and the first few instructions will be - * garbage.) +/* + * Macro to define a section for a vector. When CONFIG_VECTORS_OFFSET is + * defined code for every vector is located with other init data. At startup + * time head.S copies code for every vector to its final position according + * to description recorded in the corresponding RELOCATE_ENTRY. */ #ifdef CONFIG_VECTORS_OFFSET -#define SECTION_VECTOR(sym, section, addr, max_prevsec_size, prevsec) \ - section addr : AT((MIN(LOADADDR(prevsec) + max_prevsec_size, \ - LOADADDR(prevsec) + SIZEOF(prevsec)) + 3) & ~ 3) \ +#define SECTION_VECTOR(sym, section, addr, prevsec) \ + section addr : AT(((LOADADDR(prevsec) + SIZEOF(prevsec)) + 3) & ~ 3) \ { \ . = ALIGN(4); \ sym ## _start = ABSOLUTE(.); \ @@ -112,26 +104,19 @@ SECTIONS #if XCHAL_EXCM_LEVEL >= 6 SECTION_VECTOR (.Level6InterruptVector.text, INTLEVEL6_VECTOR_VADDR) #endif - SECTION_VECTOR (.DebugInterruptVector.literal, DEBUG_VECTOR_VADDR - 4) SECTION_VECTOR (.DebugInterruptVector.text, DEBUG_VECTOR_VADDR) - SECTION_VECTOR (.KernelExceptionVector.literal, KERNEL_VECTOR_VADDR - 4) SECTION_VECTOR (.KernelExceptionVector.text, KERNEL_VECTOR_VADDR) - SECTION_VECTOR (.UserExceptionVector.literal, USER_VECTOR_VADDR - 4) SECTION_VECTOR (.UserExceptionVector.text, USER_VECTOR_VADDR) - SECTION_VECTOR (.DoubleExceptionVector.literal, DOUBLEEXC_VECTOR_VADDR - 20) SECTION_VECTOR (.DoubleExceptionVector.text, DOUBLEEXC_VECTOR_VADDR) #endif + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + ENTRY_TEXT TEXT_TEXT - VMLINUX_SYMBOL(__sched_text_start) = .; - *(.sched.literal .sched.text) - VMLINUX_SYMBOL(__sched_text_end) = .; - VMLINUX_SYMBOL(__cpuidle_text_start) = .; - *(.cpuidle.literal .cpuidle.text) - VMLINUX_SYMBOL(__cpuidle_text_end) = .; - VMLINUX_SYMBOL(__lock_text_start) = .; - *(.spinlock.literal .spinlock.text) - VMLINUX_SYMBOL(__lock_text_end) = .; + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT } _etext = .; @@ -196,8 +181,6 @@ SECTIONS .KernelExceptionVector.text); RELOCATE_ENTRY(_UserExceptionVector_text, .UserExceptionVector.text); - RELOCATE_ENTRY(_DoubleExceptionVector_literal, - .DoubleExceptionVector.literal); RELOCATE_ENTRY(_DoubleExceptionVector_text, .DoubleExceptionVector.text); RELOCATE_ENTRY(_DebugInterruptVector_text, @@ -230,25 +213,19 @@ SECTIONS SECTION_VECTOR (_WindowVectors_text, .WindowVectors.text, - WINDOW_VECTORS_VADDR, 4, + WINDOW_VECTORS_VADDR, .dummy) - SECTION_VECTOR (_DebugInterruptVector_literal, - .DebugInterruptVector.literal, - DEBUG_VECTOR_VADDR - 4, - SIZEOF(.WindowVectors.text), - .WindowVectors.text) SECTION_VECTOR (_DebugInterruptVector_text, .DebugInterruptVector.text, DEBUG_VECTOR_VADDR, - 4, - .DebugInterruptVector.literal) + .WindowVectors.text) #undef LAST #define LAST .DebugInterruptVector.text #if XCHAL_EXCM_LEVEL >= 2 SECTION_VECTOR (_Level2InterruptVector_text, .Level2InterruptVector.text, INTLEVEL2_VECTOR_VADDR, - SIZEOF(LAST), LAST) + LAST) # undef LAST # define LAST .Level2InterruptVector.text #endif @@ -256,7 +233,7 @@ SECTIONS SECTION_VECTOR (_Level3InterruptVector_text, .Level3InterruptVector.text, INTLEVEL3_VECTOR_VADDR, - SIZEOF(LAST), LAST) + LAST) # undef LAST # define LAST .Level3InterruptVector.text #endif @@ -264,7 +241,7 @@ SECTIONS SECTION_VECTOR (_Level4InterruptVector_text, .Level4InterruptVector.text, INTLEVEL4_VECTOR_VADDR, - SIZEOF(LAST), LAST) + LAST) # undef LAST # define LAST .Level4InterruptVector.text #endif @@ -272,7 +249,7 @@ SECTIONS SECTION_VECTOR (_Level5InterruptVector_text, .Level5InterruptVector.text, INTLEVEL5_VECTOR_VADDR, - SIZEOF(LAST), LAST) + LAST) # undef LAST # define LAST .Level5InterruptVector.text #endif @@ -280,40 +257,23 @@ SECTIONS SECTION_VECTOR (_Level6InterruptVector_text, .Level6InterruptVector.text, INTLEVEL6_VECTOR_VADDR, - SIZEOF(LAST), LAST) + LAST) # undef LAST # define LAST .Level6InterruptVector.text #endif - SECTION_VECTOR (_KernelExceptionVector_literal, - .KernelExceptionVector.literal, - KERNEL_VECTOR_VADDR - 4, - SIZEOF(LAST), LAST) -#undef LAST SECTION_VECTOR (_KernelExceptionVector_text, .KernelExceptionVector.text, KERNEL_VECTOR_VADDR, - 4, - .KernelExceptionVector.literal) - SECTION_VECTOR (_UserExceptionVector_literal, - .UserExceptionVector.literal, - USER_VECTOR_VADDR - 4, - SIZEOF(.KernelExceptionVector.text), - .KernelExceptionVector.text) + LAST) +#undef LAST SECTION_VECTOR (_UserExceptionVector_text, .UserExceptionVector.text, USER_VECTOR_VADDR, - 4, - .UserExceptionVector.literal) - SECTION_VECTOR (_DoubleExceptionVector_literal, - .DoubleExceptionVector.literal, - DOUBLEEXC_VECTOR_VADDR - 20, - SIZEOF(.UserExceptionVector.text), - .UserExceptionVector.text) + .KernelExceptionVector.text) SECTION_VECTOR (_DoubleExceptionVector_text, .DoubleExceptionVector.text, DOUBLEEXC_VECTOR_VADDR, - 20, - .DoubleExceptionVector.literal) + .UserExceptionVector.text) . = (LOADADDR( .DoubleExceptionVector.text ) + SIZEOF( .DoubleExceptionVector.text ) + 3) & ~ 3; @@ -323,7 +283,6 @@ SECTIONS SECTION_VECTOR (_SecondaryResetVector_text, .SecondaryResetVector.text, RESET_VECTOR1_VADDR, - SIZEOF(.DoubleExceptionVector.text), .DoubleExceptionVector.text) . = LOADADDR(.SecondaryResetVector.text)+SIZEOF(.SecondaryResetVector.text); @@ -373,5 +332,4 @@ SECTIONS /* Sections to be discarded */ DISCARDS - /DISCARD/ : { *(.exit.literal) } } -- cgit v1.2.3 From 2da03d4114b2587f0e8e45f4862074e34daee64e Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 9 Dec 2017 18:44:11 -0800 Subject: xtensa: use call instead of callx in assembly code Now that xtensa assembly sources are compiled with -mlongcalls let the assembler and linker relax call instructions into l32r + callx where needed. This change makes the code cleaner and potentially a bit faster. Signed-off-by: Max Filippov --- arch/xtensa/kernel/coprocessor.S | 3 +-- arch/xtensa/kernel/entry.S | 56 ++++++++++++++-------------------------- arch/xtensa/kernel/head.S | 10 +++---- arch/xtensa/kernel/vectors.S | 3 +-- 4 files changed, 24 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 3a98503ad11a..4f8b52d575a2 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -212,8 +212,7 @@ ENDPROC(coprocessor_restore) ENTRY(fast_coprocessor_double) wsr a0, excsave1 - movi a0, unrecoverable_exception - callx0 a0 + call0 unrecoverable_exception ENDPROC(fast_coprocessor_double) diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 5d5707831626..5a2110bb5902 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -476,8 +476,7 @@ common_exception_return: 1: irq_save a2, a3 #ifdef CONFIG_TRACE_IRQFLAGS - movi a4, trace_hardirqs_off - callx4 a4 + call4 trace_hardirqs_off #endif /* Jump if we are returning from kernel exceptions. */ @@ -504,24 +503,20 @@ common_exception_return: /* Call do_signal() */ #ifdef CONFIG_TRACE_IRQFLAGS - movi a4, trace_hardirqs_on - callx4 a4 + call4 trace_hardirqs_on #endif rsil a2, 0 - movi a4, do_notify_resume # int do_notify_resume(struct pt_regs*) mov a6, a1 - callx4 a4 + call4 do_notify_resume # int do_notify_resume(struct pt_regs*) j 1b 3: /* Reschedule */ #ifdef CONFIG_TRACE_IRQFLAGS - movi a4, trace_hardirqs_on - callx4 a4 + call4 trace_hardirqs_on #endif rsil a2, 0 - movi a4, schedule # void schedule (void) - callx4 a4 + call4 schedule # void schedule (void) j 1b #ifdef CONFIG_PREEMPT @@ -532,8 +527,7 @@ common_exception_return: l32i a4, a2, TI_PRE_COUNT bnez a4, 4f - movi a4, preempt_schedule_irq - callx4 a4 + call4 preempt_schedule_irq j 1b #endif @@ -546,23 +540,20 @@ common_exception_return: 5: #ifdef CONFIG_HAVE_HW_BREAKPOINT _bbci.l a4, TIF_DB_DISABLED, 7f - movi a4, restore_dbreak - callx4 a4 + call4 restore_dbreak 7: #endif #ifdef CONFIG_DEBUG_TLB_SANITY l32i a4, a1, PT_DEPC bgeui a4, VALID_DOUBLE_EXCEPTION_ADDRESS, 4f - movi a4, check_tlb_sanity - callx4 a4 + call4 check_tlb_sanity #endif 6: 4: #ifdef CONFIG_TRACE_IRQFLAGS extui a4, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH bgei a4, LOCKLEVEL, 1f - movi a4, trace_hardirqs_on - callx4 a4 + call4 trace_hardirqs_on 1: #endif /* Restore optional registers. */ @@ -938,10 +929,8 @@ ENTRY(unrecoverable_exception) movi a0, 0 addi a1, a1, PT_REGS_OFFSET - movi a4, panic movi a6, unrecoverable_text - - callx4 a4 + call4 panic 1: j 1b @@ -1078,8 +1067,7 @@ ENTRY(fast_syscall_unrecoverable) xsr a2, depc # restore a2, depc wsr a0, excsave1 - movi a0, unrecoverable_exception - callx0 a0 + call0 unrecoverable_exception ENDPROC(fast_syscall_unrecoverable) @@ -1418,14 +1406,12 @@ ENTRY(fast_syscall_spill_registers) rsync movi a6, SIGSEGV - movi a4, do_exit - callx4 a4 + call4 do_exit /* shouldn't return, so panic */ wsr a0, excsave1 - movi a0, unrecoverable_exception - callx0 a0 # should not return + call0 unrecoverable_exception # should not return 1: j 1b @@ -1571,8 +1557,8 @@ ENDPROC(fast_syscall_spill_registers) ENTRY(fast_second_level_miss_double_kernel) -1: movi a0, unrecoverable_exception - callx0 a0 # should not return +1: + call0 unrecoverable_exception # should not return 1: j 1b ENDPROC(fast_second_level_miss_double_kernel) @@ -1904,9 +1890,8 @@ ENTRY(system_call) l32i a3, a2, PT_AREG2 mov a6, a2 - movi a4, do_syscall_trace_enter s32i a3, a2, PT_SYSCALL - callx4 a4 + call4 do_syscall_trace_enter mov a3, a6 /* syscall = sys_call_table[syscall_nr] */ @@ -1938,9 +1923,8 @@ ENTRY(system_call) 1: /* regs->areg[2] = return_value */ s32i a6, a2, PT_AREG2 - movi a4, do_syscall_trace_leave mov a6, a2 - callx4 a4 + call4 do_syscall_trace_leave retw ENDPROC(system_call) @@ -2056,12 +2040,10 @@ ENTRY(ret_from_fork) /* void schedule_tail (struct task_struct *prev) * Note: prev is still in a6 (return value from fake call4 frame) */ - movi a4, schedule_tail - callx4 a4 + call4 schedule_tail - movi a4, do_syscall_trace_leave mov a6, a1 - callx4 a4 + call4 do_syscall_trace_leave j common_exception_return diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index 23ce62e60435..9c4e9433e536 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -264,11 +264,8 @@ ENTRY(_startup) /* init_arch kick-starts the linux kernel */ - movi a4, init_arch - callx4 a4 - - movi a4, start_kernel - callx4 a4 + call4 init_arch + call4 start_kernel should_never_return: j should_never_return @@ -294,8 +291,7 @@ should_never_return: movi a6, 0 wsr a6, excsave1 - movi a4, secondary_start_kernel - callx4 a4 + call4 secondary_start_kernel j should_never_return #endif /* CONFIG_SMP */ diff --git a/arch/xtensa/kernel/vectors.S b/arch/xtensa/kernel/vectors.S index 2bc85051c680..841503d3307c 100644 --- a/arch/xtensa/kernel/vectors.S +++ b/arch/xtensa/kernel/vectors.S @@ -305,8 +305,7 @@ _DoubleExceptionVector_WindowUnderflow: .Lunrecoverable: rsr a3, excsave1 wsr a0, excsave1 - movi a0, unrecoverable_exception - callx0 a0 + call0 unrecoverable_exception .Lfixup:/* Check for a fixup handler or if we were in a critical section. */ -- cgit v1.2.3 From 0013aceb307482ba83a5b6a29f6ba1791be0d32b Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 9 Dec 2017 21:18:47 -0800 Subject: xtensa: clean up fixups in assembly code Remove duplicate definitions of EX() and similar TRY/CATCH and SRC/DST macros from assembly sources and put single definition into asm/asmmacro.h Signed-off-by: Max Filippov --- arch/xtensa/include/asm/asmmacro.h | 7 +++ arch/xtensa/kernel/entry.S | 33 ++---------- arch/xtensa/lib/checksum.S | 74 +++++++++++--------------- arch/xtensa/lib/memset.S | 36 +++++-------- arch/xtensa/lib/strncpy_user.S | 52 ++++++++---------- arch/xtensa/lib/strnlen_user.S | 19 +++---- arch/xtensa/lib/usercopy.S | 106 +++++++++++++++++-------------------- 7 files changed, 133 insertions(+), 194 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/asmmacro.h b/arch/xtensa/include/asm/asmmacro.h index 746dcc8b5abc..d2a4415a4c08 100644 --- a/arch/xtensa/include/asm/asmmacro.h +++ b/arch/xtensa/include/asm/asmmacro.h @@ -150,5 +150,12 @@ __endl \ar \as .endm +/* Load or store instructions that may cause exceptions use the EX macro. */ + +#define EX(handler) \ + .section __ex_table, "a"; \ + .word 97f, handler; \ + .previous \ +97: #endif /* _XTENSA_ASMMACRO_H */ diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index 5a2110bb5902..a27a9a65635b 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -1094,35 +1095,12 @@ ENDPROC(fast_syscall_unrecoverable) * < VALID_DOUBLE_EXCEPTION_ADDRESS: regular exception * * Note: we don't have to save a2; a2 holds the return value - * - * We use the two macros TRY and CATCH: - * - * TRY adds an entry to the __ex_table fixup table for the immediately - * following instruction. - * - * CATCH catches any exception that occurred at one of the preceding TRY - * statements and continues from there - * - * Usage TRY l32i a0, a1, 0 - * - * done: rfe - * CATCH - * j done */ .literal_position #ifdef CONFIG_FAST_SYSCALL_XTENSA -#define TRY \ - .section __ex_table, "a"; \ - .word 66f, 67f; \ - .text; \ -66: - -#define CATCH \ -67: - ENTRY(fast_syscall_xtensa) s32i a7, a2, PT_AREG7 # we need an additional register @@ -1136,9 +1114,9 @@ ENTRY(fast_syscall_xtensa) .Lswp: /* Atomic compare and swap */ -TRY l32i a0, a3, 0 # read old value +EX(.Leac) l32i a0, a3, 0 # read old value bne a0, a4, 1f # same as old value? jump -TRY s32i a5, a3, 0 # different, modify value +EX(.Leac) s32i a5, a3, 0 # different, modify value l32i a7, a2, PT_AREG7 # restore a7 l32i a0, a2, PT_AREG0 # restore a0 movi a2, 1 # and return 1 @@ -1151,12 +1129,12 @@ TRY s32i a5, a3, 0 # different, modify value .Lnswp: /* Atomic set, add, and exg_add. */ -TRY l32i a7, a3, 0 # orig +EX(.Leac) l32i a7, a3, 0 # orig addi a6, a6, -SYS_XTENSA_ATOMIC_SET add a0, a4, a7 # + arg moveqz a0, a4, a6 # set addi a6, a6, SYS_XTENSA_ATOMIC_SET -TRY s32i a0, a3, 0 # write new value +EX(.Leac) s32i a0, a3, 0 # write new value mov a0, a2 mov a2, a7 @@ -1164,7 +1142,6 @@ TRY s32i a0, a3, 0 # write new value l32i a0, a0, PT_AREG0 # restore a0 rfe -CATCH .Leac: l32i a7, a2, PT_AREG7 # restore a7 l32i a0, a2, PT_AREG0 # restore a0 movi a2, -EFAULT diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S index 4eb573d2720e..528fe0dd9339 100644 --- a/arch/xtensa/lib/checksum.S +++ b/arch/xtensa/lib/checksum.S @@ -14,9 +14,10 @@ * 2 of the License, or (at your option) any later version. */ -#include +#include #include #include +#include /* * computes a partial checksum, e.g. for TCP/UDP fragments @@ -175,23 +176,8 @@ ENDPROC(csum_partial) /* * Copy from ds while checksumming, otherwise like csum_partial - * - * The macros SRC and DST specify the type of access for the instruction. - * thus we can call a custom exception handler for each access type. */ -#define SRC(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous - -#define DST(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous - /* unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, int sum, int *src_err_ptr, int *dst_err_ptr) @@ -244,28 +230,28 @@ ENTRY(csum_partial_copy_generic) add a10, a10, a2 /* a10 = end of last 32-byte src chunk */ .Loop5: #endif -SRC( l32i a9, a2, 0 ) -SRC( l32i a8, a2, 4 ) -DST( s32i a9, a3, 0 ) -DST( s32i a8, a3, 4 ) +EX(10f) l32i a9, a2, 0 +EX(10f) l32i a8, a2, 4 +EX(11f) s32i a9, a3, 0 +EX(11f) s32i a8, a3, 4 ONES_ADD(a5, a9) ONES_ADD(a5, a8) -SRC( l32i a9, a2, 8 ) -SRC( l32i a8, a2, 12 ) -DST( s32i a9, a3, 8 ) -DST( s32i a8, a3, 12 ) +EX(10f) l32i a9, a2, 8 +EX(10f) l32i a8, a2, 12 +EX(11f) s32i a9, a3, 8 +EX(11f) s32i a8, a3, 12 ONES_ADD(a5, a9) ONES_ADD(a5, a8) -SRC( l32i a9, a2, 16 ) -SRC( l32i a8, a2, 20 ) -DST( s32i a9, a3, 16 ) -DST( s32i a8, a3, 20 ) +EX(10f) l32i a9, a2, 16 +EX(10f) l32i a8, a2, 20 +EX(11f) s32i a9, a3, 16 +EX(11f) s32i a8, a3, 20 ONES_ADD(a5, a9) ONES_ADD(a5, a8) -SRC( l32i a9, a2, 24 ) -SRC( l32i a8, a2, 28 ) -DST( s32i a9, a3, 24 ) -DST( s32i a8, a3, 28 ) +EX(10f) l32i a9, a2, 24 +EX(10f) l32i a8, a2, 28 +EX(11f) s32i a9, a3, 24 +EX(11f) s32i a8, a3, 28 ONES_ADD(a5, a9) ONES_ADD(a5, a8) addi a2, a2, 32 @@ -284,8 +270,8 @@ DST( s32i a8, a3, 28 ) add a10, a10, a2 /* a10 = end of last 4-byte src chunk */ .Loop6: #endif -SRC( l32i a9, a2, 0 ) -DST( s32i a9, a3, 0 ) +EX(10f) l32i a9, a2, 0 +EX(11f) s32i a9, a3, 0 ONES_ADD(a5, a9) addi a2, a2, 4 addi a3, a3, 4 @@ -315,8 +301,8 @@ DST( s32i a9, a3, 0 ) add a10, a10, a2 /* a10 = end of last 2-byte src chunk */ .Loop7: #endif -SRC( l16ui a9, a2, 0 ) -DST( s16i a9, a3, 0 ) +EX(10f) l16ui a9, a2, 0 +EX(11f) s16i a9, a3, 0 ONES_ADD(a5, a9) addi a2, a2, 2 addi a3, a3, 2 @@ -326,8 +312,8 @@ DST( s16i a9, a3, 0 ) 4: /* This section processes a possible trailing odd byte. */ _bbci.l a4, 0, 8f /* 1-byte chunk */ -SRC( l8ui a9, a2, 0 ) -DST( s8i a9, a3, 0 ) +EX(10f) l8ui a9, a2, 0 +EX(11f) s8i a9, a3, 0 #ifdef __XTENSA_EB__ slli a9, a9, 8 /* shift byte to bits 8..15 */ #endif @@ -350,10 +336,10 @@ DST( s8i a9, a3, 0 ) add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */ .Loop8: #endif -SRC( l8ui a9, a2, 0 ) -SRC( l8ui a8, a2, 1 ) -DST( s8i a9, a3, 0 ) -DST( s8i a8, a3, 1 ) +EX(10f) l8ui a9, a2, 0 +EX(10f) l8ui a8, a2, 1 +EX(11f) s8i a9, a3, 0 +EX(11f) s8i a8, a3, 1 #ifdef __XTENSA_EB__ slli a9, a9, 8 /* combine into a single 16-bit value */ #else /* for checksum computation */ @@ -381,7 +367,7 @@ ENDPROC(csum_partial_copy_generic) a12 = original dst for exception handling */ -6001: +10: _movi a2, -EFAULT s32i a2, a6, 0 /* src_err_ptr */ @@ -403,7 +389,7 @@ ENDPROC(csum_partial_copy_generic) 2: retw -6002: +11: movi a2, -EFAULT s32i a2, a7, 0 /* dst_err_ptr */ movi a2, 0 diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S index 10b8c400f175..7a724edaf4f1 100644 --- a/arch/xtensa/lib/memset.S +++ b/arch/xtensa/lib/memset.S @@ -12,6 +12,7 @@ */ #include +#include /* * void *memset(void *dst, int c, size_t length) @@ -28,15 +29,6 @@ * the alignment labels). */ -/* Load or store instructions that may cause exceptions use the EX macro. */ - -#define EX(insn,reg1,reg2,offset,handler) \ -9: insn reg1, reg2, offset; \ - .section __ex_table, "a"; \ - .word 9b, handler; \ - .previous - - .text .align 4 .global memset @@ -73,10 +65,10 @@ memset: add a6, a6, a5 # a6 = end of last 16B chunk #endif /* !XCHAL_HAVE_LOOPS */ .Loop1: - EX(s32i, a3, a5, 0, memset_fixup) - EX(s32i, a3, a5, 4, memset_fixup) - EX(s32i, a3, a5, 8, memset_fixup) - EX(s32i, a3, a5, 12, memset_fixup) +EX(10f) s32i a3, a5, 0 +EX(10f) s32i a3, a5, 4 +EX(10f) s32i a3, a5, 8 +EX(10f) s32i a3, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS blt a5, a6, .Loop1 @@ -84,23 +76,23 @@ memset: .Loop1done: bbci.l a4, 3, .L2 # set 8 bytes - EX(s32i, a3, a5, 0, memset_fixup) - EX(s32i, a3, a5, 4, memset_fixup) +EX(10f) s32i a3, a5, 0 +EX(10f) s32i a3, a5, 4 addi a5, a5, 8 .L2: bbci.l a4, 2, .L3 # set 4 bytes - EX(s32i, a3, a5, 0, memset_fixup) +EX(10f) s32i a3, a5, 0 addi a5, a5, 4 .L3: bbci.l a4, 1, .L4 # set 2 bytes - EX(s16i, a3, a5, 0, memset_fixup) +EX(10f) s16i a3, a5, 0 addi a5, a5, 2 .L4: bbci.l a4, 0, .L5 # set 1 byte - EX(s8i, a3, a5, 0, memset_fixup) +EX(10f) s8i a3, a5, 0 .L5: .Lret1: retw @@ -114,7 +106,7 @@ memset: bbci.l a5, 0, .L20 # branch if dst alignment half-aligned # dst is only byte aligned # set 1 byte - EX(s8i, a3, a5, 0, memset_fixup) +EX(10f) s8i a3, a5, 0 addi a5, a5, 1 addi a4, a4, -1 # now retest if dst aligned @@ -122,7 +114,7 @@ memset: .L20: # dst half-aligned # set 2 bytes - EX(s16i, a3, a5, 0, memset_fixup) +EX(10f) s16i a3, a5, 0 addi a5, a5, 2 addi a4, a4, -2 j .L0 # dst is now aligned, return to main algorithm @@ -141,7 +133,7 @@ memset: add a6, a5, a4 # a6 = ending address #endif /* !XCHAL_HAVE_LOOPS */ .Lbyteloop: - EX(s8i, a3, a5, 0, memset_fixup) +EX(10f) s8i a3, a5, 0 addi a5, a5, 1 #if !XCHAL_HAVE_LOOPS blt a5, a6, .Lbyteloop @@ -155,6 +147,6 @@ memset: /* We return zero if a failure occurred. */ -memset_fixup: +10: movi a2, 0 retw diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S index 1ad0ecf45368..827e1b393f3f 100644 --- a/arch/xtensa/lib/strncpy_user.S +++ b/arch/xtensa/lib/strncpy_user.S @@ -11,16 +11,9 @@ * Copyright (C) 2002 Tensilica Inc. */ -#include #include - -/* Load or store instructions that may cause exceptions use the EX macro. */ - -#define EX(insn,reg1,reg2,offset,handler) \ -9: insn reg1, reg2, offset; \ - .section __ex_table, "a"; \ - .word 9b, handler; \ - .previous +#include +#include /* * char *__strncpy_user(char *dst, const char *src, size_t len) @@ -75,9 +68,9 @@ __strncpy_user: j .Ldstunaligned .Lsrc1mod2: # src address is odd - EX(l8ui, a9, a3, 0, fixup_l) # get byte 0 +EX(11f) l8ui a9, a3, 0 # get byte 0 addi a3, a3, 1 # advance src pointer - EX(s8i, a9, a11, 0, fixup_s) # store byte 0 +EX(10f) s8i a9, a11, 0 # store byte 0 beqz a9, .Lret # if byte 0 is zero addi a11, a11, 1 # advance dst pointer addi a4, a4, -1 # decrement len @@ -85,16 +78,16 @@ __strncpy_user: bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned .Lsrc2mod4: # src address is 2 mod 4 - EX(l8ui, a9, a3, 0, fixup_l) # get byte 0 +EX(11f) l8ui a9, a3, 0 # get byte 0 /* 1-cycle interlock */ - EX(s8i, a9, a11, 0, fixup_s) # store byte 0 +EX(10f) s8i a9, a11, 0 # store byte 0 beqz a9, .Lret # if byte 0 is zero addi a11, a11, 1 # advance dst pointer addi a4, a4, -1 # decrement len beqz a4, .Lret # if len is zero - EX(l8ui, a9, a3, 1, fixup_l) # get byte 0 +EX(11f) l8ui a9, a3, 1 # get byte 0 addi a3, a3, 2 # advance src pointer - EX(s8i, a9, a11, 0, fixup_s) # store byte 0 +EX(10f) s8i a9, a11, 0 # store byte 0 beqz a9, .Lret # if byte 0 is zero addi a11, a11, 1 # advance dst pointer addi a4, a4, -1 # decrement len @@ -117,12 +110,12 @@ __strncpy_user: add a12, a12, a11 # a12 = end of last 4B chunck #endif .Loop1: - EX(l32i, a9, a3, 0, fixup_l) # get word from src +EX(11f) l32i a9, a3, 0 # get word from src addi a3, a3, 4 # advance src pointer bnone a9, a5, .Lz0 # if byte 0 is zero bnone a9, a6, .Lz1 # if byte 1 is zero bnone a9, a7, .Lz2 # if byte 2 is zero - EX(s32i, a9, a11, 0, fixup_s) # store word to dst +EX(10f) s32i a9, a11, 0 # store word to dst bnone a9, a8, .Lz3 # if byte 3 is zero addi a11, a11, 4 # advance dst pointer #if !XCHAL_HAVE_LOOPS @@ -132,7 +125,7 @@ __strncpy_user: .Loop1done: bbci.l a4, 1, .L100 # copy 2 bytes - EX(l16ui, a9, a3, 0, fixup_l) +EX(11f) l16ui a9, a3, 0 addi a3, a3, 2 # advance src pointer #ifdef __XTENSA_EB__ bnone a9, a7, .Lz0 # if byte 2 is zero @@ -141,13 +134,13 @@ __strncpy_user: bnone a9, a5, .Lz0 # if byte 0 is zero bnone a9, a6, .Lz1 # if byte 1 is zero #endif - EX(s16i, a9, a11, 0, fixup_s) +EX(10f) s16i a9, a11, 0 addi a11, a11, 2 # advance dst pointer .L100: bbci.l a4, 0, .Lret - EX(l8ui, a9, a3, 0, fixup_l) +EX(11f) l8ui a9, a3, 0 /* slot */ - EX(s8i, a9, a11, 0, fixup_s) +EX(10f) s8i a9, a11, 0 beqz a9, .Lret # if byte is zero addi a11, a11, 1-3 # advance dst ptr 1, but also cancel # the effect of adding 3 in .Lz3 code @@ -161,14 +154,14 @@ __strncpy_user: #ifdef __XTENSA_EB__ movi a9, 0 #endif /* __XTENSA_EB__ */ - EX(s8i, a9, a11, 0, fixup_s) +EX(10f) s8i a9, a11, 0 sub a2, a11, a2 # compute strlen retw .Lz1: # byte 1 is zero #ifdef __XTENSA_EB__ extui a9, a9, 16, 16 #endif /* __XTENSA_EB__ */ - EX(s16i, a9, a11, 0, fixup_s) +EX(10f) s16i a9, a11, 0 addi a11, a11, 1 # advance dst pointer sub a2, a11, a2 # compute strlen retw @@ -176,9 +169,9 @@ __strncpy_user: #ifdef __XTENSA_EB__ extui a9, a9, 16, 16 #endif /* __XTENSA_EB__ */ - EX(s16i, a9, a11, 0, fixup_s) +EX(10f) s16i a9, a11, 0 movi a9, 0 - EX(s8i, a9, a11, 2, fixup_s) +EX(10f) s8i a9, a11, 2 addi a11, a11, 2 # advance dst pointer sub a2, a11, a2 # compute strlen retw @@ -196,9 +189,9 @@ __strncpy_user: add a12, a11, a4 # a12 = ending address #endif /* XCHAL_HAVE_LOOPS */ .Lnextbyte: - EX(l8ui, a9, a3, 0, fixup_l) +EX(11f) l8ui a9, a3, 0 addi a3, a3, 1 - EX(s8i, a9, a11, 0, fixup_s) +EX(10f) s8i a9, a11, 0 beqz a9, .Lunalignedend addi a11, a11, 1 #if !XCHAL_HAVE_LOOPS @@ -218,8 +211,7 @@ __strncpy_user: * implementation in memset(). Thus, we differentiate between * load/store fixups. */ -fixup_s: -fixup_l: +10: +11: movi a2, -EFAULT retw - diff --git a/arch/xtensa/lib/strnlen_user.S b/arch/xtensa/lib/strnlen_user.S index 4c03b1e581e9..9404ac46ce4c 100644 --- a/arch/xtensa/lib/strnlen_user.S +++ b/arch/xtensa/lib/strnlen_user.S @@ -12,14 +12,7 @@ */ #include - -/* Load or store instructions that may cause exceptions use the EX macro. */ - -#define EX(insn,reg1,reg2,offset,handler) \ -9: insn reg1, reg2, offset; \ - .section __ex_table, "a"; \ - .word 9b, handler; \ - .previous +#include /* * size_t __strnlen_user(const char *s, size_t len) @@ -77,7 +70,7 @@ __strnlen_user: add a10, a10, a4 # a10 = end of last 4B chunk #endif /* XCHAL_HAVE_LOOPS */ .Loop: - EX(l32i, a9, a4, 4, lenfixup) # get next word of string +EX(10f) l32i a9, a4, 4 # get next word of string addi a4, a4, 4 # advance string pointer bnone a9, a5, .Lz0 # if byte 0 is zero bnone a9, a6, .Lz1 # if byte 1 is zero @@ -88,7 +81,7 @@ __strnlen_user: #endif .Ldone: - EX(l32i, a9, a4, 4, lenfixup) # load 4 bytes for remaining checks +EX(10f) l32i a9, a4, 4 # load 4 bytes for remaining checks bbci.l a3, 1, .L100 # check two more bytes (bytes 0, 1 of word) @@ -125,14 +118,14 @@ __strnlen_user: retw .L1mod2: # address is odd - EX(l8ui, a9, a4, 4, lenfixup) # get byte 0 +EX(10f) l8ui a9, a4, 4 # get byte 0 addi a4, a4, 1 # advance string pointer beqz a9, .Lz3 # if byte 0 is zero bbci.l a4, 1, .Laligned # if string pointer is now word-aligned .L2mod4: # address is 2 mod 4 addi a4, a4, 2 # advance ptr for aligned access - EX(l32i, a9, a4, 0, lenfixup) # get word with first two bytes of string +EX(10f) l32i a9, a4, 0 # get word with first two bytes of string bnone a9, a7, .Lz2 # if byte 2 (of word, not string) is zero bany a9, a8, .Laligned # if byte 3 (of word, not string) is nonzero # byte 3 is zero @@ -142,6 +135,6 @@ __strnlen_user: .section .fixup, "ax" .align 4 -lenfixup: +10: movi a2, 0 retw diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index d9cd766bde3e..4172b73b0364 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -54,6 +54,7 @@ */ #include +#include #ifdef __XTENSA_EB__ #define ALIGN(R, W0, W1) src R, W0, W1 @@ -63,15 +64,6 @@ #define SSA8(R) ssa8l R #endif -/* Load or store instructions that may cause exceptions use the EX macro. */ - -#define EX(insn,reg1,reg2,offset,handler) \ -9: insn reg1, reg2, offset; \ - .section __ex_table, "a"; \ - .word 9b, handler; \ - .previous - - .text .align 4 .global __xtensa_copy_user @@ -102,9 +94,9 @@ __xtensa_copy_user: bltui a4, 7, .Lbytecopy # do short copies byte by byte # copy 1 byte - EX(l8ui, a6, a3, 0, fixup) +EX(10f) l8ui a6, a3, 0 addi a3, a3, 1 - EX(s8i, a6, a5, 0, fixup) +EX(10f) s8i a6, a5, 0 addi a5, a5, 1 addi a4, a4, -1 bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then @@ -112,11 +104,11 @@ __xtensa_copy_user: .Ldst2mod4: # dst 16-bit aligned # copy 2 bytes bltui a4, 6, .Lbytecopy # do short copies byte by byte - EX(l8ui, a6, a3, 0, fixup) - EX(l8ui, a7, a3, 1, fixup) +EX(10f) l8ui a6, a3, 0 +EX(10f) l8ui a7, a3, 1 addi a3, a3, 2 - EX(s8i, a6, a5, 0, fixup) - EX(s8i, a7, a5, 1, fixup) +EX(10f) s8i a6, a5, 0 +EX(10f) s8i a7, a5, 1 addi a5, a5, 2 addi a4, a4, -2 j .Ldstaligned # dst is now aligned, return to main algorithm @@ -135,9 +127,9 @@ __xtensa_copy_user: add a7, a3, a4 # a7 = end address for source #endif /* !XCHAL_HAVE_LOOPS */ .Lnextbyte: - EX(l8ui, a6, a3, 0, fixup) +EX(10f) l8ui a6, a3, 0 addi a3, a3, 1 - EX(s8i, a6, a5, 0, fixup) +EX(10f) s8i a6, a5, 0 addi a5, a5, 1 #if !XCHAL_HAVE_LOOPS blt a3, a7, .Lnextbyte @@ -161,15 +153,15 @@ __xtensa_copy_user: add a8, a8, a3 # a8 = end of last 16B source chunk #endif /* !XCHAL_HAVE_LOOPS */ .Loop1: - EX(l32i, a6, a3, 0, fixup) - EX(l32i, a7, a3, 4, fixup) - EX(s32i, a6, a5, 0, fixup) - EX(l32i, a6, a3, 8, fixup) - EX(s32i, a7, a5, 4, fixup) - EX(l32i, a7, a3, 12, fixup) - EX(s32i, a6, a5, 8, fixup) +EX(10f) l32i a6, a3, 0 +EX(10f) l32i a7, a3, 4 +EX(10f) s32i a6, a5, 0 +EX(10f) l32i a6, a3, 8 +EX(10f) s32i a7, a5, 4 +EX(10f) l32i a7, a3, 12 +EX(10f) s32i a6, a5, 8 addi a3, a3, 16 - EX(s32i, a7, a5, 12, fixup) +EX(10f) s32i a7, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS blt a3, a8, .Loop1 @@ -177,31 +169,31 @@ __xtensa_copy_user: .Loop1done: bbci.l a4, 3, .L2 # copy 8 bytes - EX(l32i, a6, a3, 0, fixup) - EX(l32i, a7, a3, 4, fixup) +EX(10f) l32i a6, a3, 0 +EX(10f) l32i a7, a3, 4 addi a3, a3, 8 - EX(s32i, a6, a5, 0, fixup) - EX(s32i, a7, a5, 4, fixup) +EX(10f) s32i a6, a5, 0 +EX(10f) s32i a7, a5, 4 addi a5, a5, 8 .L2: bbci.l a4, 2, .L3 # copy 4 bytes - EX(l32i, a6, a3, 0, fixup) +EX(10f) l32i a6, a3, 0 addi a3, a3, 4 - EX(s32i, a6, a5, 0, fixup) +EX(10f) s32i a6, a5, 0 addi a5, a5, 4 .L3: bbci.l a4, 1, .L4 # copy 2 bytes - EX(l16ui, a6, a3, 0, fixup) +EX(10f) l16ui a6, a3, 0 addi a3, a3, 2 - EX(s16i, a6, a5, 0, fixup) +EX(10f) s16i a6, a5, 0 addi a5, a5, 2 .L4: bbci.l a4, 0, .L5 # copy 1 byte - EX(l8ui, a6, a3, 0, fixup) - EX(s8i, a6, a5, 0, fixup) +EX(10f) l8ui a6, a3, 0 +EX(10f) s8i a6, a5, 0 .L5: movi a2, 0 # return success for len bytes copied retw @@ -217,7 +209,7 @@ __xtensa_copy_user: # copy 16 bytes per iteration for word-aligned dst and unaligned src and a10, a3, a8 # save unalignment offset for below sub a3, a3, a10 # align a3 (to avoid sim warnings only; not needed for hardware) - EX(l32i, a6, a3, 0, fixup) # load first word +EX(10f) l32i a6, a3, 0 # load first word #if XCHAL_HAVE_LOOPS loopnez a7, .Loop2done #else /* !XCHAL_HAVE_LOOPS */ @@ -226,19 +218,19 @@ __xtensa_copy_user: add a12, a12, a3 # a12 = end of last 16B source chunk #endif /* !XCHAL_HAVE_LOOPS */ .Loop2: - EX(l32i, a7, a3, 4, fixup) - EX(l32i, a8, a3, 8, fixup) +EX(10f) l32i a7, a3, 4 +EX(10f) l32i a8, a3, 8 ALIGN( a6, a6, a7) - EX(s32i, a6, a5, 0, fixup) - EX(l32i, a9, a3, 12, fixup) +EX(10f) s32i a6, a5, 0 +EX(10f) l32i a9, a3, 12 ALIGN( a7, a7, a8) - EX(s32i, a7, a5, 4, fixup) - EX(l32i, a6, a3, 16, fixup) +EX(10f) s32i a7, a5, 4 +EX(10f) l32i a6, a3, 16 ALIGN( a8, a8, a9) - EX(s32i, a8, a5, 8, fixup) +EX(10f) s32i a8, a5, 8 addi a3, a3, 16 ALIGN( a9, a9, a6) - EX(s32i, a9, a5, 12, fixup) +EX(10f) s32i a9, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS blt a3, a12, .Loop2 @@ -246,39 +238,39 @@ __xtensa_copy_user: .Loop2done: bbci.l a4, 3, .L12 # copy 8 bytes - EX(l32i, a7, a3, 4, fixup) - EX(l32i, a8, a3, 8, fixup) +EX(10f) l32i a7, a3, 4 +EX(10f) l32i a8, a3, 8 ALIGN( a6, a6, a7) - EX(s32i, a6, a5, 0, fixup) +EX(10f) s32i a6, a5, 0 addi a3, a3, 8 ALIGN( a7, a7, a8) - EX(s32i, a7, a5, 4, fixup) +EX(10f) s32i a7, a5, 4 addi a5, a5, 8 mov a6, a8 .L12: bbci.l a4, 2, .L13 # copy 4 bytes - EX(l32i, a7, a3, 4, fixup) +EX(10f) l32i a7, a3, 4 addi a3, a3, 4 ALIGN( a6, a6, a7) - EX(s32i, a6, a5, 0, fixup) +EX(10f) s32i a6, a5, 0 addi a5, a5, 4 mov a6, a7 .L13: add a3, a3, a10 # readjust a3 with correct misalignment bbci.l a4, 1, .L14 # copy 2 bytes - EX(l8ui, a6, a3, 0, fixup) - EX(l8ui, a7, a3, 1, fixup) +EX(10f) l8ui a6, a3, 0 +EX(10f) l8ui a7, a3, 1 addi a3, a3, 2 - EX(s8i, a6, a5, 0, fixup) - EX(s8i, a7, a5, 1, fixup) +EX(10f) s8i a6, a5, 0 +EX(10f) s8i a7, a5, 1 addi a5, a5, 2 .L14: bbci.l a4, 0, .L15 # copy 1 byte - EX(l8ui, a6, a3, 0, fixup) - EX(s8i, a6, a5, 0, fixup) +EX(10f) l8ui a6, a3, 0 +EX(10f) s8i a6, a5, 0 .L15: movi a2, 0 # return success for len bytes copied retw @@ -294,7 +286,7 @@ __xtensa_copy_user: */ -fixup: +10: sub a2, a5, a2 /* a2 <-- bytes copied */ sub a2, a11, a2 /* a2 <-- bytes not copied */ retw -- cgit v1.2.3 From fbb871e220672a8e9e4e7870da5b206fe05904b2 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 9 Dec 2017 21:21:35 -0800 Subject: xtensa: clean up word alignment macros in assembly code Remove duplicate definitions of ALIGN/src_b/__src_b and SSA8/ssa8/__ssa8 from assembly sources and put single definition into asm/asmmacro.h Signed-off-by: Max Filippov --- arch/xtensa/include/asm/asmmacro.h | 33 +++++++++++++++++++++++++ arch/xtensa/kernel/align.S | 5 +--- arch/xtensa/lib/memcopy.S | 49 +++++++++++++------------------------- arch/xtensa/lib/usercopy.S | 24 +++++++------------ 4 files changed, 59 insertions(+), 52 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/asmmacro.h b/arch/xtensa/include/asm/asmmacro.h index d2a4415a4c08..7f2ae5872151 100644 --- a/arch/xtensa/include/asm/asmmacro.h +++ b/arch/xtensa/include/asm/asmmacro.h @@ -158,4 +158,37 @@ .previous \ 97: + +/* + * Extract unaligned word that is split between two registers w0 and w1 + * into r regardless of machine endianness. SAR must be loaded with the + * starting bit of the word (see __ssa8). + */ + + .macro __src_b r, w0, w1 +#ifdef __XTENSA_EB__ + src \r, \w0, \w1 +#else + src \r, \w1, \w0 +#endif + .endm + +/* + * Load 2 lowest address bits of r into SAR for __src_b to extract unaligned + * word starting at r from two registers loaded from consecutive aligned + * addresses covering r regardless of machine endianness. + * + * r 0 1 2 3 + * LE SAR 0 8 16 24 + * BE SAR 32 24 16 8 + */ + + .macro __ssa8 r +#ifdef __XTENSA_EB__ + ssa8b \r +#else + ssa8l \r +#endif + .endm + #endif /* _XTENSA_ASMMACRO_H */ diff --git a/arch/xtensa/kernel/align.S b/arch/xtensa/kernel/align.S index 24b3189d7841..9301452e521e 100644 --- a/arch/xtensa/kernel/align.S +++ b/arch/xtensa/kernel/align.S @@ -19,6 +19,7 @@ #include #include #include +#include #include #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION @@ -66,8 +67,6 @@ #define INSN_T 24 #define INSN_OP1 16 -.macro __src_b r, w0, w1; src \r, \w0, \w1; .endm -.macro __ssa8 r; ssa8b \r; .endm .macro __ssa8r r; ssa8l \r; .endm .macro __sh r, s; srl \r, \s; .endm .macro __sl r, s; sll \r, \s; .endm @@ -81,8 +80,6 @@ #define INSN_T 4 #define INSN_OP1 12 -.macro __src_b r, w0, w1; src \r, \w1, \w0; .endm -.macro __ssa8 r; ssa8l \r; .endm .macro __ssa8r r; ssa8b \r; .endm .macro __sh r, s; sll \r, \s; .endm .macro __sl r, s; srl \r, \s; .endm diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S index b1c219acabe7..9bda748a1e3e 100644 --- a/arch/xtensa/lib/memcopy.S +++ b/arch/xtensa/lib/memcopy.S @@ -10,22 +10,7 @@ */ #include - - .macro src_b r, w0, w1 -#ifdef __XTENSA_EB__ - src \r, \w0, \w1 -#else - src \r, \w1, \w0 -#endif - .endm - - .macro ssa8 r -#ifdef __XTENSA_EB__ - ssa8b \r -#else - ssa8l \r -#endif - .endm +#include /* * void *memcpy(void *dst, const void *src, size_t len); @@ -209,7 +194,7 @@ memcpy: .Lsrcunaligned: _beqz a4, .Ldone # avoid loading anything for zero-length copies # copy 16 bytes per iteration for word-aligned dst and unaligned src - ssa8 a3 # set shift amount from byte offset + __ssa8 a3 # set shift amount from byte offset /* set to 1 when running on ISS (simulator) with the lint or ferret client, or 0 to save a few cycles */ @@ -229,16 +214,16 @@ memcpy: .Loop2: l32i a7, a3, 4 l32i a8, a3, 8 - src_b a6, a6, a7 + __src_b a6, a6, a7 s32i a6, a5, 0 l32i a9, a3, 12 - src_b a7, a7, a8 + __src_b a7, a7, a8 s32i a7, a5, 4 l32i a6, a3, 16 - src_b a8, a8, a9 + __src_b a8, a8, a9 s32i a8, a5, 8 addi a3, a3, 16 - src_b a9, a9, a6 + __src_b a9, a9, a6 s32i a9, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS @@ -249,10 +234,10 @@ memcpy: # copy 8 bytes l32i a7, a3, 4 l32i a8, a3, 8 - src_b a6, a6, a7 + __src_b a6, a6, a7 s32i a6, a5, 0 addi a3, a3, 8 - src_b a7, a7, a8 + __src_b a7, a7, a8 s32i a7, a5, 4 addi a5, a5, 8 mov a6, a8 @@ -261,7 +246,7 @@ memcpy: # copy 4 bytes l32i a7, a3, 4 addi a3, a3, 4 - src_b a6, a6, a7 + __src_b a6, a6, a7 s32i a6, a5, 0 addi a5, a5, 4 mov a6, a7 @@ -485,7 +470,7 @@ memmove: .Lbacksrcunaligned: _beqz a4, .Lbackdone # avoid loading anything for zero-length copies # copy 16 bytes per iteration for word-aligned dst and unaligned src - ssa8 a3 # set shift amount from byte offset + __ssa8 a3 # set shift amount from byte offset #define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with * the lint or ferret client, or 0 * to save a few cycles */ @@ -506,15 +491,15 @@ memmove: l32i a7, a3, 12 l32i a8, a3, 8 addi a5, a5, -16 - src_b a6, a7, a6 + __src_b a6, a7, a6 s32i a6, a5, 12 l32i a9, a3, 4 - src_b a7, a8, a7 + __src_b a7, a8, a7 s32i a7, a5, 8 l32i a6, a3, 0 - src_b a8, a9, a8 + __src_b a8, a9, a8 s32i a8, a5, 4 - src_b a9, a6, a9 + __src_b a9, a6, a9 s32i a9, a5, 0 #if !XCHAL_HAVE_LOOPS bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start @@ -526,9 +511,9 @@ memmove: l32i a7, a3, 4 l32i a8, a3, 0 addi a5, a5, -8 - src_b a6, a7, a6 + __src_b a6, a7, a6 s32i a6, a5, 4 - src_b a7, a8, a7 + __src_b a7, a8, a7 s32i a7, a5, 0 mov a6, a8 .Lback12: @@ -537,7 +522,7 @@ memmove: addi a3, a3, -4 l32i a7, a3, 0 addi a5, a5, -4 - src_b a6, a7, a6 + __src_b a6, a7, a6 s32i a6, a5, 0 mov a6, a7 .Lback13: diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index 4172b73b0364..0959b6e71f11 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -56,14 +56,6 @@ #include #include -#ifdef __XTENSA_EB__ -#define ALIGN(R, W0, W1) src R, W0, W1 -#define SSA8(R) ssa8b R -#else -#define ALIGN(R, W0, W1) src R, W1, W0 -#define SSA8(R) ssa8l R -#endif - .text .align 4 .global __xtensa_copy_user @@ -81,7 +73,7 @@ __xtensa_copy_user: # per iteration movi a8, 3 # if source is also aligned, bnone a3, a8, .Laligned # then use word copy - SSA8( a3) # set shift amount from byte offset + __ssa8 a3 # set shift amount from byte offset bnez a4, .Lsrcunaligned movi a2, 0 # return success for len==0 retw @@ -220,16 +212,16 @@ EX(10f) l32i a6, a3, 0 # load first word .Loop2: EX(10f) l32i a7, a3, 4 EX(10f) l32i a8, a3, 8 - ALIGN( a6, a6, a7) + __src_b a6, a6, a7 EX(10f) s32i a6, a5, 0 EX(10f) l32i a9, a3, 12 - ALIGN( a7, a7, a8) + __src_b a7, a7, a8 EX(10f) s32i a7, a5, 4 EX(10f) l32i a6, a3, 16 - ALIGN( a8, a8, a9) + __src_b a8, a8, a9 EX(10f) s32i a8, a5, 8 addi a3, a3, 16 - ALIGN( a9, a9, a6) + __src_b a9, a9, a6 EX(10f) s32i a9, a5, 12 addi a5, a5, 16 #if !XCHAL_HAVE_LOOPS @@ -240,10 +232,10 @@ EX(10f) s32i a9, a5, 12 # copy 8 bytes EX(10f) l32i a7, a3, 4 EX(10f) l32i a8, a3, 8 - ALIGN( a6, a6, a7) + __src_b a6, a6, a7 EX(10f) s32i a6, a5, 0 addi a3, a3, 8 - ALIGN( a7, a7, a8) + __src_b a7, a7, a8 EX(10f) s32i a7, a5, 4 addi a5, a5, 8 mov a6, a8 @@ -252,7 +244,7 @@ EX(10f) s32i a7, a5, 4 # copy 4 bytes EX(10f) l32i a7, a3, 4 addi a3, a3, 4 - ALIGN( a6, a6, a7) + __src_b a6, a6, a7 EX(10f) s32i a6, a5, 0 addi a5, a5, 4 mov a6, a7 -- cgit v1.2.3 From 5cf97ebd8b40e2b1791136fc1476d17365864b18 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sat, 9 Dec 2017 21:22:37 -0800 Subject: xtensa: clean up functions in assembly code Use ENTRY and ENDPROC throughout arch/xtensa/lib assembly sources. Introduce asm/linkage.h and define xtensa-specific __ALIGN macro there. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/linkage.h | 9 +++++++++ arch/xtensa/lib/memcopy.S | 30 ++++++++++-------------------- arch/xtensa/lib/memset.S | 8 ++++---- arch/xtensa/lib/strncpy_user.S | 8 ++++---- arch/xtensa/lib/strnlen_user.S | 9 +++++---- arch/xtensa/lib/usercopy.S | 8 ++++---- 6 files changed, 36 insertions(+), 36 deletions(-) create mode 100644 arch/xtensa/include/asm/linkage.h (limited to 'arch') diff --git a/arch/xtensa/include/asm/linkage.h b/arch/xtensa/include/asm/linkage.h new file mode 100644 index 000000000000..0ba9973235d9 --- /dev/null +++ b/arch/xtensa/include/asm/linkage.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_LINKAGE_H +#define __ASM_LINKAGE_H + +#define __ALIGN .align 4 +#define __ALIGN_STR ".align 4" + +#endif diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S index 9bda748a1e3e..24d650864c3a 100644 --- a/arch/xtensa/lib/memcopy.S +++ b/arch/xtensa/lib/memcopy.S @@ -9,6 +9,7 @@ * Copyright (C) 2002 - 2012 Tensilica Inc. */ +#include #include #include @@ -108,10 +109,7 @@ addi a5, a5, 2 j .Ldstaligned # dst is now aligned, return to main algorithm - .align 4 - .global memcpy - .type memcpy,@function -memcpy: +ENTRY(memcpy) entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len @@ -273,14 +271,14 @@ memcpy: s8i a6, a5, 0 retw +ENDPROC(memcpy) /* * void bcopy(const void *src, void *dest, size_t n); */ - .align 4 - .global bcopy - .type bcopy,@function -bcopy: + +ENTRY(bcopy) + entry sp, 16 # minimal stack frame # a2=src, a3=dst, a4=len mov a5, a3 @@ -288,6 +286,8 @@ bcopy: mov a2, a5 j .Lmovecommon # go to common code for memmove+bcopy +ENDPROC(bcopy) + /* * void *memmove(void *dst, const void *src, size_t len); * @@ -376,10 +376,7 @@ bcopy: j .Lbackdstaligned # dst is now aligned, # return to main algorithm - .align 4 - .global memmove - .type memmove,@function -memmove: +ENTRY(memmove) entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len @@ -551,11 +548,4 @@ memmove: s8i a6, a5, 0 retw - -/* - * Local Variables: - * mode:fundamental - * comment-start: "# " - * comment-start-skip: "# *" - * End: - */ +ENDPROC(memmove) diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S index 7a724edaf4f1..a6cd04ba966f 100644 --- a/arch/xtensa/lib/memset.S +++ b/arch/xtensa/lib/memset.S @@ -11,6 +11,7 @@ * Copyright (C) 2002 Tensilica Inc. */ +#include #include #include @@ -30,10 +31,8 @@ */ .text -.align 4 -.global memset -.type memset,@function -memset: +ENTRY(memset) + entry sp, 16 # minimal stack frame # a2/ dst, a3/ c, a4/ length extui a3, a3, 0, 8 # mask to just 8 bits @@ -141,6 +140,7 @@ EX(10f) s8i a3, a5, 0 .Lbytesetdone: retw +ENDPROC(memset) .section .fixup, "ax" .align 4 diff --git a/arch/xtensa/lib/strncpy_user.S b/arch/xtensa/lib/strncpy_user.S index 827e1b393f3f..5fce16b67dca 100644 --- a/arch/xtensa/lib/strncpy_user.S +++ b/arch/xtensa/lib/strncpy_user.S @@ -12,6 +12,7 @@ */ #include +#include #include #include @@ -47,10 +48,8 @@ # a12/ tmp .text -.align 4 -.global __strncpy_user -.type __strncpy_user,@function -__strncpy_user: +ENTRY(__strncpy_user) + entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len mov a11, a2 # leave dst in return value register @@ -202,6 +201,7 @@ EX(10f) s8i a9, a11, 0 sub a2, a11, a2 # compute strlen retw +ENDPROC(__strncpy_user) .section .fixup, "ax" .align 4 diff --git a/arch/xtensa/lib/strnlen_user.S b/arch/xtensa/lib/strnlen_user.S index 9404ac46ce4c..0b956ce7f386 100644 --- a/arch/xtensa/lib/strnlen_user.S +++ b/arch/xtensa/lib/strnlen_user.S @@ -11,6 +11,7 @@ * Copyright (C) 2002 Tensilica Inc. */ +#include #include #include @@ -42,10 +43,8 @@ # a10/ tmp .text -.align 4 -.global __strnlen_user -.type __strnlen_user,@function -__strnlen_user: +ENTRY(__strnlen_user) + entry sp, 16 # minimal stack frame # a2/ s, a3/ len addi a4, a2, -4 # because we overincrement at the end; @@ -133,6 +132,8 @@ EX(10f) l32i a9, a4, 0 # get word with first two bytes of string sub a2, a4, a2 # subtract to get length retw +ENDPROC(__strnlen_user) + .section .fixup, "ax" .align 4 10: diff --git a/arch/xtensa/lib/usercopy.S b/arch/xtensa/lib/usercopy.S index 0959b6e71f11..64ab1971324f 100644 --- a/arch/xtensa/lib/usercopy.S +++ b/arch/xtensa/lib/usercopy.S @@ -53,14 +53,13 @@ * a11/ original length */ +#include #include #include .text - .align 4 - .global __xtensa_copy_user - .type __xtensa_copy_user,@function -__xtensa_copy_user: +ENTRY(__xtensa_copy_user) + entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len mov a5, a2 # copy dst so that a2 is return value @@ -267,6 +266,7 @@ EX(10f) s8i a6, a5, 0 movi a2, 0 # return success for len bytes copied retw +ENDPROC(__xtensa_copy_user) .section .fixup, "ax" .align 4 -- cgit v1.2.3 From f4431396be5b26a9960daf502d129b1b5d126f5e Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 4 Dec 2017 10:47:43 -0800 Subject: xtensa: consolidate kernel stack size related definitions Define kernel stack size in kmem_layout and use it in current_thread_info, GET_THREAD_INFO, THREAD_SIZE and THERAD_SIZE_ORDER definitions. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/current.h | 4 ++-- arch/xtensa/include/asm/kmem_layout.h | 3 +++ arch/xtensa/include/asm/ptrace.h | 3 +-- arch/xtensa/include/asm/thread_info.h | 13 +++++++------ 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/current.h b/arch/xtensa/include/asm/current.h index 47e46dcf5d49..5d98a7ad4251 100644 --- a/arch/xtensa/include/asm/current.h +++ b/arch/xtensa/include/asm/current.h @@ -11,6 +11,8 @@ #ifndef _XTENSA_CURRENT_H #define _XTENSA_CURRENT_H +#include + #ifndef __ASSEMBLY__ #include @@ -26,8 +28,6 @@ static inline struct task_struct *get_current(void) #else -#define CURRENT_SHIFT 13 - #define GET_CURRENT(reg,sp) \ GET_THREAD_INFO(reg,sp); \ l32i reg, reg, TI_TASK \ diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 561f8729bcde..28f9260a766c 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -71,4 +71,7 @@ #endif +#define KERNEL_STACK_SHIFT 13 +#define KERNEL_STACK_SIZE (1 << KERNEL_STACK_SHIFT) + #endif diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h index e2d9c5eb10bd..05beae3c6376 100644 --- a/arch/xtensa/include/asm/ptrace.h +++ b/arch/xtensa/include/asm/ptrace.h @@ -10,6 +10,7 @@ #ifndef _XTENSA_PTRACE_H #define _XTENSA_PTRACE_H +#include #include /* @@ -38,8 +39,6 @@ * +-----------------------+ -------- */ -#define KERNEL_STACK_SIZE (2 * PAGE_SIZE) - /* Offsets for exception_handlers[] (3 x 64-entries x 4-byte tables). */ #define EXC_TABLE_KSTK 0x004 /* Kernel Stack */ diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 7be2400f745a..71c9865218ce 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -11,7 +11,9 @@ #ifndef _XTENSA_THREAD_INFO_H #define _XTENSA_THREAD_INFO_H -#ifdef __KERNEL__ +#include + +#define CURRENT_SHIFT KERNEL_STACK_SHIFT #ifndef __ASSEMBLY__ # include @@ -84,7 +86,7 @@ struct thread_info { static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("extui %0,a1,0,13\n\t" + __asm__("extui %0, a1, 0, "__stringify(CURRENT_SHIFT)"\n\t" "xor %0, a1, %0" : "=&r" (ti) : ); return ti; } @@ -93,7 +95,7 @@ static inline struct thread_info *current_thread_info(void) /* how to get the thread information struct from ASM */ #define GET_THREAD_INFO(reg,sp) \ - extui reg, sp, 0, 13; \ + extui reg, sp, 0, CURRENT_SHIFT; \ xor reg, sp, reg #endif @@ -130,8 +132,7 @@ static inline struct thread_info *current_thread_info(void) */ #define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ -#define THREAD_SIZE 8192 //(2*PAGE_SIZE) -#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE KERNEL_STACK_SIZE +#define THREAD_SIZE_ORDER (KERNEL_STACK_SHIFT - PAGE_SHIFT) -#endif /* __KERNEL__ */ #endif /* _XTENSA_THREAD_INFO */ -- cgit v1.2.3 From aa6476f76c1678d5d1087b39d3047601f0139ef0 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 8 Aug 2017 14:06:14 -0700 Subject: xtensa: print hardware config ID on startup Print hardware config ID on startup and config ID recorded in the configuration if it doesn't match one read from the hardware. Signed-off-by: Max Filippov --- arch/xtensa/kernel/setup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 253a0178f1bd..3732c91b7200 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -317,6 +317,13 @@ static inline int mem_reserve(unsigned long start, unsigned long end) void __init setup_arch(char **cmdline_p) { + pr_info("config ID: %08x:%08x\n", + get_sr(SREG_EPC), get_sr(SREG_EXCSAVE)); + if (get_sr(SREG_EPC) != XCHAL_HW_CONFIGID0 || + get_sr(SREG_EXCSAVE) != XCHAL_HW_CONFIGID1) + pr_info("built for config ID: %08x:%08x\n", + XCHAL_HW_CONFIGID0, XCHAL_HW_CONFIGID1); + *cmdline_p = command_line; platform_setup(cmdline_p); strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); @@ -582,12 +589,14 @@ c_show(struct seq_file *f, void *slot) "model\t\t: Xtensa " XCHAL_HW_VERSION_NAME "\n" "core ID\t\t: " XCHAL_CORE_ID "\n" "build ID\t: 0x%x\n" + "config ID\t: %08x:%08x\n" "byte order\t: %s\n" "cpu MHz\t\t: %lu.%02lu\n" "bogomips\t: %lu.%02lu\n", num_online_cpus(), cpumask_pr_args(cpu_online_mask), XCHAL_BUILD_UNIQUE_ID, + get_sr(SREG_EPC), get_sr(SREG_EXCSAVE), XCHAL_HAVE_BE ? "big" : "little", ccount_freq/1000000, (ccount_freq/10000) % 100, -- cgit v1.2.3 From 03dd604e1d515ca1ab02aaae12162e0a077858e9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 11 Dec 2017 12:54:44 +0100 Subject: x86/apic: Remove local var in flat_send_IPI_allbutself() No code changed: # arch/x86/kernel/apic/apic_flat_64.o: text data bss dec hex filename 1838 624 0 2462 99e apic_flat_64.o.before 1838 624 0 2462 99e apic_flat_64.o.after md5: aa2ae687d94bc4534f86ae6865dabd6a apic_flat_64.o.before.asm 42148da76ba8f9a236c33f8803bd2a6b apic_flat_64.o.after.asm md5 sum is different due to asm output offsets changing. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171211115444.26577-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690e9b64..f58a49769bc6 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -84,12 +84,8 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) static void flat_send_IPI_allbutself(int vector) { int cpu = smp_processor_id(); -#ifdef CONFIG_HOTPLUG_CPU - int hotplug = 1; -#else - int hotplug = 0; -#endif - if (hotplug || vector == NMI_VECTOR) { + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) { if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { unsigned long mask = cpumask_bits(cpu_online_mask)[0]; -- cgit v1.2.3 From 6d60ce384d1d5ca32b595244db4077a419acc687 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 27 Nov 2017 08:51:39 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for page unaligned addresses If something calls ioremap() with an address not aligned to PAGE_SIZE, the returned address might be not aligned as well. This led to a probe registered on exactly the returned address, but the entire page was armed for mmiotracing. On calling iounmap() the address passed to unregister_kmmio_probe() was PAGE_SIZE aligned by the caller leading to a complete freeze of the machine. We should always page align addresses while (un)registerung mappings, because the mmiotracer works on top of pages, not mappings. We still keep track of the probes based on their real addresses and lengths though, because the mmiotrace still needs to know what are mapped memory regions. Also move the call to mmiotrace_iounmap() prior page aligning the address, so that all probes are unregistered properly, otherwise the kernel ends up failing memory allocations randomly after disabling the mmiotracer. Tested-by: Lyude Signed-off-by: Karol Herbst Acked-by: Pekka Paalanen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: nouveau@lists.freedesktop.org Link: http://lkml.kernel.org/r/20171127075139.4928-1-kherbst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/kmmio.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6e4573b1da34..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -404,11 +404,11 @@ void iounmap(volatile void __iomem *addr) return; } + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); -- cgit v1.2.3 From 86ad5c97ce5ccdda1459d35370fd5e105721bb8d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Dec 2017 14:49:14 +0300 Subject: RISC-V: Logical vs Bitwise typo In the current code, there is a ! logical NOT where a bitwise ~ NOT was intended. It means that we never return -EINVAL. Signed-off-by: Dan Carpenter Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sys_riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index a2ae936a093e..79c78668258e 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -70,7 +70,7 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, bool local = (flags & SYS_RISCV_FLUSH_ICACHE_LOCAL) != 0; /* Check the reserved flags. */ - if (unlikely(flags & !SYS_RISCV_FLUSH_ICACHE_ALL)) + if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL)) return -EINVAL; flush_icache_mm(mm, local); -- cgit v1.2.3 From 3cfa5008081db845c6c53d531ec34e9c84a9fd99 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 5 Dec 2017 17:48:11 -0800 Subject: RISC-V: Resurrect smp_mb__after_spinlock() I removed this last week because of an incorrect comment: smp_mb__after_spinlock() is actually still used, and is necessary on RISC-V. It's been resurrected, with a comment that describes what it actually does this time. Thanks to Andrea for finding the bug! Fixes: 3343eb6806f3 ("RISC-V: Remove smb_mb__{before,after}_spinlock()") CC: Andrea Parri Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/barrier.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 773c4e039cd7..c0319cbf1eec 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -38,6 +38,25 @@ #define smp_rmb() RISCV_FENCE(r,r) #define smp_wmb() RISCV_FENCE(w,w) +/* + * This is a very specific barrier: it's currently only used in two places in + * the kernel, both in the scheduler. See include/linux/spinlock.h for the two + * orderings it guarantees, but the "critical section is RCsc" guarantee + * mandates a barrier on RISC-V. The sequence looks like: + * + * lr.aq lock + * sc lock <= LOCKED + * smp_mb__after_spinlock() + * // critical section + * lr lock + * sc.rl lock <= UNLOCKED + * + * The AQ/RL pair provides a RCpc critical section, but there's not really any + * way we can take advantage of that here because the ordering is only enforced + * on that one lock. Thus, we're just doing a full fence. + */ +#define smp_mb__after_spinlock() RISCV_FENCE(rw,rw) + #include #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From 27b0174525325bf18919597016483a709f3372f8 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 8 Dec 2017 11:23:23 -0800 Subject: RISC-V: Remove unused CONFIG_HVC_RISCV_SBI code This is code that probably should never have made it into the kernel in the first place: it depends on a driver that hadn't been reviewed yet. During the HVC_SBI_RISCV review process a better way of doing this was suggested, but that means this code is defunct. It's compile-time disabled in 4.15 because the driver isn't in, so I think it's safe to just remove this for now. CC: Greg KH Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 8fbb6749910d..cb7b0c63014e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -38,10 +38,6 @@ #include #include -#ifdef CONFIG_HVC_RISCV_SBI -#include -#endif - #ifdef CONFIG_DUMMY_CONSOLE struct screen_info screen_info = { .orig_video_lines = 30, @@ -212,13 +208,6 @@ static void __init setup_bootmem(void) void __init setup_arch(char **cmdline_p) { -#if defined(CONFIG_HVC_RISCV_SBI) - if (likely(early_console == NULL)) { - early_console = &riscv_sbi_early_console_dev; - register_console(early_console); - } -#endif - #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -- cgit v1.2.3 From f24e5834a2c3f6c5f814a417f858226f0a010ade Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Mon, 4 Dec 2017 14:13:05 +0000 Subject: arm64: Initialise high_memory global variable earlier The high_memory global variable is used by cma_declare_contiguous(.) before it is defined. We don't notice this as we compute __pa(high_memory - 1), and it looks like we're processing a VA from the direct linear map. This problem becomes apparent when we flip the kernel virtual address space and the linear map is moved to the bottom of the kernel VA space. This patch moves the initialisation of high_memory before it used. Cc: Fixes: f7426b983a6a ("mm: cma: adjust address limit to avoid hitting low/high memory boundary") Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 5960bef0170d..00e7b900ca41 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -476,6 +476,8 @@ void __init arm64_memblock_init(void) reserve_elfcorehdr(); + high_memory = __va(memblock_end_of_DRAM() - 1) + 1; + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); @@ -502,7 +504,6 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); - high_memory = __va((max << PAGE_SHIFT) - 1) + 1; memblock_dump_all(); } -- cgit v1.2.3 From 8781bcbc5e69d7da69e84c7044ca0284848d5d01 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Fri, 1 Dec 2017 17:22:14 +0000 Subject: arm64: mm: Fix pte_mkclean, pte_mkdirty semantics On systems with hardware dirty bit management, the ltp madvise09 unit test fails due to dirty bit information being lost and pages being incorrectly freed. This was bisected to: arm64: Ignore hardware dirty bit updates in ptep_set_wrprotect() Reverting this commit leads to a separate problem, that the unit test retains pages that should have been dropped due to the function madvise_free_pte_range(.) not cleaning pte's properly. Currently pte_mkclean only clears the software dirty bit, thus the following code sequence can appear: pte = pte_mkclean(pte); if (pte_dirty(pte)) // this condition can return true with HW DBM! This patch also adjusts pte_mkclean to set PTE_RDONLY thus effectively clearing both the SW and HW dirty information. In order for this to function on systems without HW DBM, we need to also adjust pte_mkdirty to remove the read only bit from writable pte's to avoid infinite fault loops. Cc: Fixes: 64c26841b349 ("arm64: Ignore hardware dirty bit updates in ptep_set_wrprotect()") Reported-by: Bhupinder Thakur Tested-by: Bhupinder Thakur Reviewed-by: Catalin Marinas Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 149d05fb9421..3ff03a755c32 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -149,12 +149,20 @@ static inline pte_t pte_mkwrite(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; } static inline pte_t pte_mkdirty(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_DIRTY)); + pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); + + if (pte_write(pte)) + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + + return pte; } static inline pte_t pte_mkold(pte_t pte) @@ -641,28 +649,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while preserving the hardware update of - * the Access Flag. + * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - /* - * ptep_set_wrprotect() is only called on CoW mappings which are - * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && - * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && - * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and - * protection_map[]. There is no race with the hardware update of the - * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) - * is set. - */ - VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), - "%s: potential race with hardware DBM", __func__); pte = READ_ONCE(*ptep); do { old_pte = pte; + /* + * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY + * clear), set the PTE_DIRTY bit. + */ + if (pte_hw_dirty(pte)) + pte = pte_mkdirty(pte); pte = pte_wrprotect(pte); pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), pte_val(old_pte), pte_val(pte)); -- cgit v1.2.3 From e7ed9d9bd0375c74fe6e27d8bc73d3c6f4c8c3bc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 30 Nov 2017 16:12:02 -0800 Subject: uprobes/x86: Emulate push insns for uprobe on x86 Uprobe is a tracing mechanism for userspace programs. Typical uprobe will incur overhead of two traps. First trap is caused by replaced trap insn, and the second trap is to execute the original displaced insn in user space. To reduce the overhead, kernel provides hooks for architectures to emulate the original insn and skip the second trap. In x86, emulation is done for certain branch insns. This patch extends the emulation to "push " insns. These insns are typical in the beginning of the function. For example, bcc in https://github.com/iovisor/bcc repo provides tools to measure funclantency, detect memleak, etc. The tools will place uprobes in the beginning of function and possibly uretprobes at the end of function. This patch is able to reduce the trap overhead for uprobe from 2 to 1. Without this patch, uretprobe will typically incur three traps. With this patch, if the function starts with "push" insn, the number of traps can be reduced from 3 to 2. An experiment was conducted on two local VMs, fedora 26 64-bit VM and 32-bit VM, both 4 processors and 4GB memory, booted with latest tip repo (and this patch). The host is MacBook with intel i7 processor. The test program looks like: #include #include #include #include static void test() __attribute__((noinline)); void test() {} int main() { struct timeval start, end; gettimeofday(&start, NULL); for (int i = 0; i < 1000000; i++) { test(); } gettimeofday(&end, NULL); printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))); return 0; } The program is compiled without optimization, and the first insn for function "test" is "push %rbp". The host is relatively idle. Before the test run, the uprobe is inserted as below for uprobe: echo 'p :' > /sys/kernel/debug/tracing/uprobe_events echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable and for uretprobe: echo 'r :' > /sys/kernel/debug/tracing/uprobe_events echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable Unit: microsecond(usec) per loop iteration x86_64 W/ this patch W/O this patch uprobe 1.55 3.1 uretprobe 2.0 3.6 x86_32 W/ this patch W/O this patch uprobe 1.41 3.5 uretprobe 1.75 4.0 You can see that this patch significantly reduced the overhead, 50% for uprobe and 44% for uretprobe on x86_64, and even more on x86_32. Signed-off-by: Yonghong Song Reviewed-by: Oleg Nesterov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/20171201001202.3706564-1-yhs@fb.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 4 ++ arch/x86/kernel/uprobes.c | 107 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 74f4c2ff6427..d8bfa98fca98 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -53,6 +53,10 @@ struct arch_uprobe { u8 fixups; u8 ilen; } defparam; + struct { + u8 reg_offset; /* to the start of pt_regs */ + u8 ilen; + } push; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a3755d293a48..85c7ef23d99f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) return 0; } -static int push_ret_address(struct pt_regs *regs, unsigned long ip) +static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { unsigned long new_sp = regs->sp - sizeof_long(); - if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + if (copy_to_user((void __user *)new_sp, &val, sizeof_long())) return -EFAULT; regs->sp = new_sp; @@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(); /* Pop incorrect return address */ - if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) + if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } /* popf; tell the caller to not touch TF */ @@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * * But there is corner case, see the comment in ->post_xol(). */ - if (push_ret_address(regs, new_ip)) + if (emulate_push_stack(regs, new_ip)) return false; } else if (!check_jmp_cond(auprobe, regs)) { offs = 0; @@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) return true; } +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; + + if (emulate_push_stack(regs, *src_ptr)) + return false; + regs->ip += auprobe->push.ilen; + return true; +} + static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { BUG_ON(!branch_is_call(auprobe)); @@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = { .post_xol = branch_post_xol_op, }; +static const struct uprobe_xol_ops push_xol_ops = { + .emulate = push_emulate_op, +}; + /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { @@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return 0; } +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn), reg_offset = 0; + + if (opc1 < 0x50 || opc1 > 0x57) + return -ENOSYS; + + if (insn->length > 2) + return -ENOSYS; + if (insn->length == 2) { + /* only support rex_prefix 0x41 (x64 only) */ +#ifdef CONFIG_X86_64 + if (insn->rex_prefix.nbytes != 1 || + insn->rex_prefix.bytes[0] != 0x41) + return -ENOSYS; + + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, r8); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, r9); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, r10); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, r11); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, r12); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, r13); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, r14); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, r15); + break; + } +#else + return -ENOSYS; +#endif + } else { + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, ax); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, cx); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, dx); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, bx); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, sp); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, bp); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, si); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, di); + break; + } + } + + auprobe->push.reg_offset = reg_offset; + auprobe->push.ilen = insn->length; + auprobe->ops = &push_xol_ops; + return 0; +} + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret != -ENOSYS) return ret; + ret = push_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + /* * Figure out which fixups default_post_xol_op() will need to perform, * and annotate defparam->fixups accordingly. -- cgit v1.2.3 From f79ce87fa49da778a1ad54c7d3c6755e13cf8489 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Nov 2017 22:51:20 +0800 Subject: x86/build: Don't verify mtools configuration file for isoimage If mtools.conf is not generated before, 'make isoimage' could complain: Kernel: arch/x86/boot/bzImage is ready (#597) GENIMAGE arch/x86/boot/image.iso *** Missing file: arch/x86/boot/mtools.conf arch/x86/boot/Makefile:144: recipe for target 'isoimage' failed mtools.conf is not used for isoimage generation, so do not check it. Signed-off-by: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 4366d57af1 ("x86/build: Factor out fdimage/isoimage generation commands to standalone script") Link: http://lkml.kernel.org/r/1512053480-8083-1-git-send-email-changbin.du@intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 49f4970f693b..c9e8499fbfe7 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -44,9 +44,9 @@ FDINITRD=$6 # Make sure the files actually exist verify "$FBZIMAGE" -verify "$MTOOLSRC" genbzdisk() { + verify "$MTOOLSRC" mformat a: syslinux $FIMAGE echo "$KCMDLINE" | mcopy - a:syslinux.cfg @@ -57,6 +57,7 @@ genbzdisk() { } genfdimage144() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null mformat v: syslinux $FIMAGE @@ -68,6 +69,7 @@ genfdimage144() { } genfdimage288() { + verify "$MTOOLSRC" dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null mformat w: syslinux $FIMAGE -- cgit v1.2.3 From 0a373d4fc248cb707821d7dad54ce6d5bcb0cdfe Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 30 Nov 2017 15:35:54 +0300 Subject: x86/unwinder/guess: Prevent using CONFIG_UNWINDER_GUESS=y with CONFIG_STACKDEPOT=y Stackdepot doesn't work well with CONFIG_UNWINDER_GUESS=y. The 'guess' unwinder generate awfully large and inaccurate stacktraces, thus stackdepot can't deduplicate stacktraces because they all look like unique. Eventually stackdepot reaches its capacity limit: WARNING: CPU: 0 PID: 545 at lib/stackdepot.c:119 depot_save_stack+0x28e/0x550 Call Trace: ? kasan_kmalloc+0x144/0x160 ? depot_save_stack+0x1f5/0x550 ? do_raw_spin_unlock+0xda/0xf0 ? preempt_count_sub+0x13/0xc0 <...90 lines...> ? do_raw_spin_unlock+0xda/0xf0 Add a STACKDEPOT=n dependency to UNWINDER_GUESS to avoid the problem. Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171130123554.4330-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..672441c008c7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT + depends on !STACKDEPOT ---help--- This option enables the "guess" unwinder for unwinding kernel stack traces. It scans the stack and reports every kernel text address it -- cgit v1.2.3 From 81bf665d00baf1aef01118c6c9e51520e57c0757 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Tue, 12 Dec 2017 02:12:31 +0530 Subject: x86/headers: Remove duplicate #includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: ard.biesheuvel@linaro.org Cc: boris.ostrovsky@oracle.com Cc: geert@linux-m68k.org Cc: jgross@suse.com Cc: linux-efi@vger.kernel.org Cc: luto@kernel.org Cc: matt@codeblueprint.co.uk Cc: thomas.lendacky@amd.com Cc: tim.c.chen@linux.intel.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1513024951-9221-1-git-send-email-pravin.shedge4linux@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/itmt.c | 1 - arch/x86/kernel/process.c | 1 - arch/x86/kernel/setup.c | 1 - arch/x86/kernel/smpboot.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/spinlock.c | 2 -- 6 files changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index f73f475d0573..d177940aa090 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb988a24db92..d6321855f9da 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8af2e8d0c0a1..c8e04472a141 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 05a97d5fe298..d44b64d571b4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -75,7 +75,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6a151ce70e86..1e5184d7ce7a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 02f3445a2b5f..cd97a62394e7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -23,8 +23,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static bool xen_pvspin = true; -#include - static void xen_qlock_kick(int cpu) { int irq = per_cpu(lock_kicker_irq, cpu); -- cgit v1.2.3 From 86c9e8126e9fbcbf06c36e285168b880369a537c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Dec 2017 10:48:54 +0000 Subject: arm64: mm: Fix false positives in set_pte_at access/dirty race detection Jiankang reports that our race detection in set_pte_at is firing when copying the page tables in dup_mmap as a result of a fork(). In this situation, the page table isn't actually live and so there is no way that we can race with a concurrent update from the hardware page table walker. This patch reworks the race detection so that we require either the mm to match the current active_mm (i.e. currently installed in our TTBR0) or the mm_users count to be greater than 1, implying that the page table could be live in another CPU. The mm_users check might still be racy, but we'll avoid false positives and it's not realistic to validate that all the necessary locks are held as part of this assertion. Cc: Yisheng Xie Reported-by: Jiankang Chen Tested-by: Jiankang Chen Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 3ff03a755c32..bdcc7f1c9d06 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -42,6 +42,8 @@ #include #include #include +#include +#include extern void __pte_error(const char *file, int line, unsigned long val); extern void __pmd_error(const char *file, int line, unsigned long val); @@ -215,9 +217,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -struct mm_struct; -struct vm_area_struct; - extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); /* @@ -246,7 +245,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (pte_valid(*ptep) && pte_valid(pte)) { + if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) && + (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); -- cgit v1.2.3 From c622cc013cece073722592cff1ac6643a33b1622 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 11 Dec 2017 16:42:31 -0600 Subject: arm64: Define cputype macros for Falkor CPU Add cputype definition macros for Qualcomm Datacenter Technologies Falkor CPU in cputype.h. It's unfortunate that the first revision of the Falkor CPU used the wrong part number 0x800, got fixed in v2 chip with part number 0xC00, and would be used the same value for future revisions. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 235e77d98261..cbf08d7cbf30 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -91,6 +91,7 @@ #define BRCM_CPU_PART_VULCAN 0x516 #define QCOM_CPU_PART_FALKOR_V1 0x800 +#define QCOM_CPU_PART_FALKOR 0xC00 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) @@ -99,6 +100,7 @@ #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR) #ifndef __ASSEMBLY__ -- cgit v1.2.3 From 932b50c7c1c65e6f23002e075b97ee083c4a9e71 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 11 Dec 2017 16:42:32 -0600 Subject: arm64: Add software workaround for Falkor erratum 1041 The ARM architecture defines the memory locations that are permitted to be accessed as the result of a speculative instruction fetch from an exception level for which all stages of translation are disabled. Specifically, the core is permitted to speculatively fetch from the 4KB region containing the current program counter 4K and next 4K. When translation is changed from enabled to disabled for the running exception level (SCTLR_ELn[M] changed from a value of 1 to 0), the Falkor core may errantly speculatively access memory locations outside of the 4KB region permitted by the architecture. The errant memory access may lead to one of the following unexpected behaviors. 1) A System Error Interrupt (SEI) being raised by the Falkor core due to the errant memory access attempting to access a region of memory that is protected by a slave-side memory protection unit. 2) Unpredictable device behavior due to a speculative read from device memory. This behavior may only occur if the instruction cache is disabled prior to or coincident with translation being changed from enabled to disabled. The conditions leading to this erratum will not occur when either of the following occur: 1) A higher exception level disables translation of a lower exception level (e.g. EL2 changing SCTLR_EL1[M] from a value of 1 to 0). 2) An exception level disabling its stage-1 translation if its stage-2 translation is enabled (e.g. EL1 changing SCTLR_EL1[M] from a value of 1 to 0 when HCR_EL2[VM] has a value of 1). To avoid the errant behavior, software must execute an ISB immediately prior to executing the MSR that will change SCTLR_ELn[M] from 1 to 0. Signed-off-by: Shanker Donthineni Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.txt | 1 + arch/arm64/Kconfig | 12 +++++++++++- arch/arm64/include/asm/assembler.h | 10 ++++++++++ arch/arm64/kernel/cpu-reset.S | 1 + arch/arm64/kernel/efi-entry.S | 2 ++ arch/arm64/kernel/head.S | 1 + arch/arm64/kernel/relocate_kernel.S | 1 + arch/arm64/kvm/hyp-init.S | 1 + 8 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 304bf22bb83c..fc1c884fea10 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -75,3 +75,4 @@ stable kernels. | Qualcomm Tech. | Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 | | Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 | | Qualcomm Tech. | QDF2400 ITS | E0065 | QCOM_QDF2400_ERRATUM_0065 | +| Qualcomm Tech. | Falkor v{1,2} | E1041 | QCOM_FALKOR_ERRATUM_1041 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a93339f5178f..c9a7e9e1414f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -557,7 +557,6 @@ config QCOM_QDF2400_ERRATUM_0065 If unsure, say Y. - config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y @@ -576,6 +575,17 @@ config HISILICON_ERRATUM_161600802 a 128kB offset to be applied to the target address in this commands. If unsure, say Y. + +config QCOM_FALKOR_ERRATUM_E1041 + bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" + default y + help + Falkor CPU may speculatively fetch instructions from an improper + memory location when MMU translation is changed from SCTLR_ELn[M]=1 + to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem. + + If unsure, say Y. + endmenu diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index aef72d886677..8b168280976f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -512,4 +512,14 @@ alternative_else_nop_endif #endif .endm +/** + * Errata workaround prior to disable MMU. Insert an ISB immediately prior + * to executing the MSR that will change SCTLR_ELn[M] from a value of 1 to 0. + */ + .macro pre_disable_mmu_workaround +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041 + isb +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 65f42d257414..2a752cb2a0f3 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -37,6 +37,7 @@ ENTRY(__cpu_soft_restart) mrs x12, sctlr_el1 ldr x13, =SCTLR_ELx_FLAGS bic x12, x12, x13 + pre_disable_mmu_workaround msr sctlr_el1, x12 isb diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index 4e6ad355bd05..6b9736c3fb56 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -96,6 +96,7 @@ ENTRY(entry) mrs x0, sctlr_el2 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el2, x0 isb b 2f @@ -103,6 +104,7 @@ ENTRY(entry) mrs x0, sctlr_el1 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el1, x0 isb 2: diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 67e86a0f57ac..e3cb9fbf96b6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -750,6 +750,7 @@ __primary_switch: * to take into account by discarding the current kernel mapping and * creating a new one. */ + pre_disable_mmu_workaround msr sctlr_el1, x20 // disable the MMU isb bl __create_page_tables // recreate kernel mapping diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index ce704a4aeadd..f407e422a720 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -45,6 +45,7 @@ ENTRY(arm64_relocate_new_kernel) mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS bic x0, x0, x1 + pre_disable_mmu_workaround msr sctlr_el2, x0 isb 1: diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 3f9615582377..870828c364c5 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -151,6 +151,7 @@ reset: mrs x5, sctlr_el2 ldr x6, =SCTLR_ELx_FLAGS bic x5, x5, x6 // Clear SCTL_M and etc + pre_disable_mmu_workaround msr sctlr_el2, x5 isb -- cgit v1.2.3 From 0e17cada2a5b4dc847082e1db0e3f84599ffd436 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Dec 2017 11:53:26 +0000 Subject: arm64: hw_breakpoint: Use linux/uaccess.h instead of asm/uaccess.h The only inclusion of asm/uaccess.h should be by linux/uaccess.h. All other headers should use the latter. Reported-by: Al Viro Signed-off-by: Will Deacon --- arch/arm64/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 749f81779420..74bb56f656ef 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include /* Breakpoint currently in use for each BRP. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); -- cgit v1.2.3 From 6b63dd119eb4eee44733ca435168ce05487b8644 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Nov 2017 00:10:25 +0900 Subject: x86/tools: Rename test_get_len to insn_decoder_test Rename test_get_len test command to insn_decoder_test as it a more meaningful name. This also changes some comments in related files. Note that this also removes the paragraph about writing to the Free Software Foundation's mailing address. Signed-off-by: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/151153622537.22827.14928774603980883278.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/tools/Makefile | 10 +-- arch/x86/tools/distill.awk | 2 +- arch/x86/tools/insn_decoder_test.c | 169 ++++++++++++++++++++++++++++++++++++ arch/x86/tools/test_get_len.c | 173 ------------------------------------- 4 files changed, 175 insertions(+), 179 deletions(-) create mode 100644 arch/x86/tools/insn_decoder_test.c delete mode 100644 arch/x86/tools/test_get_len.c (limited to 'arch') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 972b8e8d939c..b0d75684d313 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -17,24 +17,24 @@ distill_awk = $(srctree)/arch/x86/tools/distill.awk chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ - cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) + cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose) quiet_cmd_sanitytest = TEST $@ cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 -posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity +posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity $(call cmd,posttest) $(call cmd,sanitytest) -hostprogs-y += test_get_len insn_sanity +hostprogs-y += insn_decoder_test insn_sanity # -I needed for generated C source and C source which in the kernel tree. -HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ +HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ # Dependencies are also needed. -$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c +$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk index e0edeccc1429..80cd7d53bd07 100644 --- a/arch/x86/tools/distill.awk +++ b/arch/x86/tools/distill.awk @@ -1,6 +1,6 @@ #!/bin/awk -f # SPDX-License-Identifier: GPL-2.0 -# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Usage: objdump -d a.out | awk -f distill.awk | ./insn_decoder_test # Distills the disassembly as follows: # - Removes all lines except the disassembled instructions. # - For instructions that exceed 1 line (7 bytes), crams all the hex bytes diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c new file mode 100644 index 000000000000..8be7264cb723 --- /dev/null +++ b/arch/x86/tools/insn_decoder_test.c @@ -0,0 +1,169 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include +#include +#include +#include +#include + +#define unlikely(cond) (cond) + +#include +#include +#include + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. + * + * Usage: objdump -d a.out | awk -f distill.awk | ./insn_decoder_test + */ + +const char *prog; +static int verbose; +static int x86_64; + +static void usage(void) +{ + fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" + " %s [-y|-n] [-v]\n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v verbose mode\n"); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE *fp, struct insn *insn) +{ + fprintf(fp, "Instruction = {\n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void parse_args(int argc, char **argv) +{ + int c; + prog = argv[0]; + while ((c = getopt(argc, argv, "ynv")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + } + } +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE], sym[BUFSIZE] = ""; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0; + int warnings = 0; + + parse_args(argc, argv); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned int b; + + if (line[0] == '<') { + /* Symbol line */ + strcpy(sym, line); + continue; + } + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); + insn_get_length(&insn); + if (insn.length != nb) { + warnings++; + fprintf(stderr, "Warning: %s found difference at %s\n", + prog, sym); + fprintf(stderr, "Warning: %s", line); + fprintf(stderr, "Warning: objdump says %d bytes, but " + "insn_get_length() says %d\n", nb, + insn.length); + if (verbose) + dump_insn(stderr, &insn); + } + } + if (warnings) + fprintf(stderr, "Warning: decoded and checked %d" + " instructions with %d warnings\n", insns, warnings); + else + fprintf(stdout, "Success: decoded and checked %d" + " instructions\n", insns); + return 0; +} diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c deleted file mode 100644 index ecf31e0358c8..000000000000 --- a/arch/x86/tools/test_get_len.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - */ - -#include -#include -#include -#include -#include - -#define unlikely(cond) (cond) - -#include -#include -#include - -/* - * Test of instruction analysis in general and insn_get_length() in - * particular. See if insn_get_length() and the disassembler agree - * on the length of each instruction in an elf disassembly. - * - * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len - */ - -const char *prog; -static int verbose; -static int x86_64; - -static void usage(void) -{ - fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [-y|-n] [-v]\n", prog); - fprintf(stderr, "\t-y 64bit mode\n"); - fprintf(stderr, "\t-n 32bit mode\n"); - fprintf(stderr, "\t-v verbose mode\n"); - exit(1); -} - -static void malformed_line(const char *line, int line_nr) -{ - fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); - exit(3); -} - -static void dump_field(FILE *fp, const char *name, const char *indent, - struct insn_field *field) -{ - fprintf(fp, "%s.%s = {\n", indent, name); - fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", - indent, field->value, field->bytes[0], field->bytes[1], - field->bytes[2], field->bytes[3]); - fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, - field->got, field->nbytes); -} - -static void dump_insn(FILE *fp, struct insn *insn) -{ - fprintf(fp, "Instruction = {\n"); - dump_field(fp, "prefixes", "\t", &insn->prefixes); - dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); - dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); - dump_field(fp, "opcode", "\t", &insn->opcode); - dump_field(fp, "modrm", "\t", &insn->modrm); - dump_field(fp, "sib", "\t", &insn->sib); - dump_field(fp, "displacement", "\t", &insn->displacement); - dump_field(fp, "immediate1", "\t", &insn->immediate1); - dump_field(fp, "immediate2", "\t", &insn->immediate2); - fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", - insn->attr, insn->opnd_bytes, insn->addr_bytes); - fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", - insn->length, insn->x86_64, insn->kaddr); -} - -static void parse_args(int argc, char **argv) -{ - int c; - prog = argv[0]; - while ((c = getopt(argc, argv, "ynv")) != -1) { - switch (c) { - case 'y': - x86_64 = 1; - break; - case 'n': - x86_64 = 0; - break; - case 'v': - verbose = 1; - break; - default: - usage(); - } - } -} - -#define BUFSIZE 256 - -int main(int argc, char **argv) -{ - char line[BUFSIZE], sym[BUFSIZE] = ""; - unsigned char insn_buf[16]; - struct insn insn; - int insns = 0; - int warnings = 0; - - parse_args(argc, argv); - - while (fgets(line, BUFSIZE, stdin)) { - char copy[BUFSIZE], *s, *tab1, *tab2; - int nb = 0; - unsigned int b; - - if (line[0] == '<') { - /* Symbol line */ - strcpy(sym, line); - continue; - } - - insns++; - memset(insn_buf, 0, 16); - strcpy(copy, line); - tab1 = strchr(copy, '\t'); - if (!tab1) - malformed_line(line, insns); - s = tab1 + 1; - s += strspn(s, " "); - tab2 = strchr(s, '\t'); - if (!tab2) - malformed_line(line, insns); - *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ - while (s < tab2) { - if (sscanf(s, "%x", &b) == 1) { - insn_buf[nb++] = (unsigned char) b; - s += 3; - } else - break; - } - /* Decode an instruction */ - insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); - insn_get_length(&insn); - if (insn.length != nb) { - warnings++; - fprintf(stderr, "Warning: %s found difference at %s\n", - prog, sym); - fprintf(stderr, "Warning: %s", line); - fprintf(stderr, "Warning: objdump says %d bytes, but " - "insn_get_length() says %d\n", nb, - insn.length); - if (verbose) - dump_insn(stderr, &insn); - } - } - if (warnings) - fprintf(stderr, "Warning: decoded and checked %d" - " instructions with %d warnings\n", insns, warnings); - else - fprintf(stdout, "Success: decoded and checked %d" - " instructions\n", insns); - return 0; -} -- cgit v1.2.3 From 98fe07fccc3e25889186277a5158c0a658d528a4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Nov 2017 00:10:54 +0900 Subject: x86/tools: Rename distill.awk to objdump_reformat.awk Rename distill.awk to objdump_reformat.awk because it more clearly expresses its purpose of re-formatting the output of objdump so that insn_decoder_test can read it. Signed-off-by: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/151153625409.22827.10470603625519700259.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/tools/Makefile | 4 ++-- arch/x86/tools/distill.awk | 48 ------------------------------------- arch/x86/tools/insn_decoder_test.c | 6 ++--- arch/x86/tools/objdump_reformat.awk | 48 +++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 53 deletions(-) delete mode 100644 arch/x86/tools/distill.awk create mode 100644 arch/x86/tools/objdump_reformat.awk (limited to 'arch') diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index b0d75684d313..09af7ff53044 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -13,11 +13,11 @@ else posttest_64bit = -n endif -distill_awk = $(srctree)/arch/x86/tools/distill.awk +reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ - cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose) + cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose) quiet_cmd_sanitytest = TEST $@ cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk deleted file mode 100644 index 80cd7d53bd07..000000000000 --- a/arch/x86/tools/distill.awk +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/awk -f -# SPDX-License-Identifier: GPL-2.0 -# Usage: objdump -d a.out | awk -f distill.awk | ./insn_decoder_test -# Distills the disassembly as follows: -# - Removes all lines except the disassembled instructions. -# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes -# into a single line. -# - Remove bad(or prefix only) instructions - -BEGIN { - prev_addr = "" - prev_hex = "" - prev_mnemonic = "" - bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" - fwait_expr = "^9b " - fwait_str="9b\tfwait" -} - -/^ *[0-9a-f]+ <[^>]*>:/ { - # Symbol entry - printf("%s%s\n", $2, $1) -} - -/^ *[0-9a-f]+:/ { - if (split($0, field, "\t") < 3) { - # This is a continuation of the same insn. - prev_hex = prev_hex field[2] - } else { - # Skip bad instructions - if (match(prev_mnemonic, bad_expr)) - prev_addr = "" - # Split fwait from other f* instructions - if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { - printf "%s\t%s\n", prev_addr, fwait_str - sub(fwait_expr, "", prev_hex) - } - if (prev_addr != "") - printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic - prev_addr = field[1] - prev_hex = field[2] - prev_mnemonic = field[3] - } -} - -END { - if (prev_addr != "") - printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic -} diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 8be7264cb723..286d2e3b9d57 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -29,7 +29,7 @@ * particular. See if insn_get_length() and the disassembler agree * on the length of each instruction in an elf disassembly. * - * Usage: objdump -d a.out | awk -f distill.awk | ./insn_decoder_test + * Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test */ const char *prog; @@ -38,8 +38,8 @@ static int x86_64; static void usage(void) { - fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [-y|-n] [-v]\n", prog); + fprintf(stderr, "Usage: objdump -d a.out | awk -f objdump_reformat.awk" + " | %s [-y|-n] [-v]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); fprintf(stderr, "\t-v verbose mode\n"); diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk new file mode 100644 index 000000000000..f418c91b71f0 --- /dev/null +++ b/arch/x86/tools/objdump_reformat.awk @@ -0,0 +1,48 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test +# Reformats the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. +# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+ <[^>]*>:/ { + # Symbol entry + printf("%s%s\n", $2, $1) +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} -- cgit v1.2.3 From 10c91577d5e631773a6394e14cf60125389b71ae Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Nov 2017 00:11:22 +0900 Subject: x86/tools: Standardize output format of insn_decode_test Standardize warning, error, and success printout format of insn_decode_test so that user can easily understand which test tool caused the messages. Signed-off-by: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/151153628279.22827.4869104298276788693.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/tools/insn_decoder_test.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 286d2e3b9d57..a3b4fd954931 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -17,6 +17,7 @@ #include #include #include +#include #define unlikely(cond) (cond) @@ -48,10 +49,21 @@ static void usage(void) static void malformed_line(const char *line, int line_nr) { - fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + fprintf(stderr, "%s: error: malformed line %d:\n%s", + prog, line_nr, line); exit(3); } +static void pr_warn(const char *fmt, ...) +{ + va_list ap; + + fprintf(stderr, "%s: warning: ", prog); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); +} + static void dump_field(FILE *fp, const char *name, const char *indent, struct insn_field *field) { @@ -149,21 +161,20 @@ int main(int argc, char **argv) insn_get_length(&insn); if (insn.length != nb) { warnings++; - fprintf(stderr, "Warning: %s found difference at %s\n", - prog, sym); - fprintf(stderr, "Warning: %s", line); - fprintf(stderr, "Warning: objdump says %d bytes, but " - "insn_get_length() says %d\n", nb, - insn.length); + pr_warn("Found an x86 instruction decoder bug, " + "please report this.\n", sym); + pr_warn("%s", line); + pr_warn("objdump says %d bytes, but insn_get_length() " + "says %d\n", nb, insn.length); if (verbose) dump_insn(stderr, &insn); } } if (warnings) - fprintf(stderr, "Warning: decoded and checked %d" - " instructions with %d warnings\n", insns, warnings); + pr_warn("Decoded and checked %d instructions with %d " + "failures\n", insns, warnings); else - fprintf(stdout, "Success: decoded and checked %d" - " instructions\n", insns); + fprintf(stdout, "%s: success: Decoded and checked %d" + " instructions\n", prog, insns); return 0; } -- cgit v1.2.3 From 0f3922a9b99eca76c6578cd84191573378f2c988 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 8 Dec 2017 04:17:28 -0700 Subject: x86/Xen: don't report ancient LAPIC version Unconditionally reporting a value seen on the P4 or older invokes functionality like io_apic_get_unique_id() on 32-bit builds, resulting in a panic() with sufficiently many CPUs and/or IO-APICs. Doing what that function does would be the hypervisor's responsibility anyway, so makes no sense to be used when running on Xen. Uniformly report a more modern version; this shouldn't matter much as both LAPIC and IO-APIC are being managed entirely / mostly by the hypervisor. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index b5e48da7fbff..c14048553c18 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -56,7 +56,7 @@ static u32 xen_apic_read(u32 reg) return 0; if (reg == APIC_LVR) - return 0x10; + return 0x14; #ifdef CONFIG_X86_32 if (reg == APIC_LDR) return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); -- cgit v1.2.3 From 17278a91e04f858155d54bee5528ba4fbcec6f87 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Nov 2017 12:01:20 +0000 Subject: MIPS: CPS: Fix r1 .set mt assembler warning MIPS CPS has a build warning on kernels configured for MIPS32R1 or MIPS64R1, due to the use of .set mt without a prior .set mips{32,64}r2: arch/mips/kernel/cps-vec.S Assembler messages: arch/mips/kernel/cps-vec.S:238: Warning: the `mt' extension requires MIPS32 revision 2 or greater Add .set MIPS_ISA_LEVEL_RAW before .set mt to silence the warning. Fixes: 245a7868d2f2 ("MIPS: smp-cps: rework core/VPE initialisation") Signed-off-by: James Hogan Cc: Paul Burton Cc: James Hogan Cc: James Hogan Cc: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17699/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cps-vec.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S index c7ed26029cbb..e68e6e04063a 100644 --- a/arch/mips/kernel/cps-vec.S +++ b/arch/mips/kernel/cps-vec.S @@ -235,6 +235,7 @@ LEAF(mips_cps_core_init) has_mt t0, 3f .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* Only allow 1 TC per VPE to execute... */ @@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes) #elif defined(CONFIG_MIPS_MT) .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* If the core doesn't support MT then return */ -- cgit v1.2.3 From a23f06f06dbe54696e8d4f156b317e8c9961c345 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:31 +0100 Subject: bpf: fix build issues on um due to mising bpf_perf_event.h Since c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build on i386 or x86_64: [...] CC init/main.o In file included from ../include/linux/perf_event.h:18:0, from ../include/linux/trace_events.h:10, from ../include/trace/syscall.h:7, from ../include/linux/syscalls.h:82, from ../init/main.c:20: ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include [...] Lets add missing bpf_perf_event.h also to um arch. This seems to be the only one still missing. Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") Reported-by: Randy Dunlap Suggested-by: Richard Weinberger Signed-off-by: Daniel Borkmann Tested-by: Randy Dunlap Cc: Hendrik Brueckner Cc: Richard Weinberger Acked-by: Alexei Starovoitov Acked-by: Richard Weinberger Signed-off-by: Alexei Starovoitov --- arch/um/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729..73c57f614c9e 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h -- cgit v1.2.3 From a03fe72572c12e98f4173f8a535f32468e48b6ec Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:51:35 +0000 Subject: MIPS: Factor out NT_PRFPREG regset access helpers In preparation to fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") FCSR access regression factor out NT_PRFPREG regset access helpers for the non-MSA and the MSA variants respectively, to avoid having to deal with excessive indentation in the actual fix. No functional change, however use `target->thread.fpu.fpr[0]' rather than `target->thread.fpu.fpr[i]' for FGR holding type size determination as there's no `i' variable to refer to anymore, and for the factored out `i' variable declaration use `unsigned int' rather than `unsigned' as its type, following the common style. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17925/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 108 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index efbd8df8b665..62e8ffd9370a 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -419,25 +419,36 @@ static int gpr64_set(struct task_struct *target, #endif /* CONFIG_64BIT */ -static int fpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots + * correspond 1:1 to buffer slots. + */ +static int fpr_get_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) { - unsigned i; - int err; - u64 fpr_val; - - /* XXX fcr31 */ + return user_regset_copyout(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's + * general register slots are copied to buffer slots. + */ +static int fpr_get_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); - err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + err = user_regset_copyout(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -447,27 +458,54 @@ static int fpr_get(struct task_struct *target, return 0; } -static int fpr_set(struct task_struct *target, +/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + void *kbuf, void __user *ubuf) { - unsigned i; int err; - u64 fpr_val; /* XXX fcr31 */ - init_fp_ctx(target); + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP + * context's general register slots. + */ +static int fpr_set_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + return user_regset_copyin(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 + * bits only of FP context's general register slots. + */ +static int fpr_set_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { - err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -478,6 +516,26 @@ static int fpr_set(struct task_struct *target, return 0; } +/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +static int fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int err; + + /* XXX fcr31 */ + + init_fp_ctx(target); + + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} + enum mips_regset { REGSET_GPR, REGSET_FPR, -- cgit v1.2.3 From dc24d0edf33c3e15099688b6bbdf7bdc24bf6e91 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:52:15 +0000 Subject: MIPS: Guard against any partial write attempt with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and ensure that no partial register write attempt is made with PTRACE_SETREGSET, as we do not preinitialize any temporaries used to hold incoming register data and consequently random data could be written. It is the responsibility of the caller, such as `ptrace_regset', to arrange for writes to span whole registers only, so here we only assert that it has indeed happened. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17926/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 62e8ffd9370a..7fcadaaf330f 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -516,7 +516,15 @@ static int fpr_set_msa(struct task_struct *target, return 0; } -/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * + * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', + * which is supposed to have been guaranteed by the kernel before + * calling us, e.g. in `ptrace_regset'. We enforce that requirement, + * so that we can safely avoid preinitializing temporaries for + * partial register writes. + */ static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, @@ -524,6 +532,8 @@ static int fpr_set(struct task_struct *target, { int err; + BUG_ON(count % sizeof(elf_fpreg_t)); + /* XXX fcr31 */ init_fp_ctx(target); -- cgit v1.2.3 From 80b3ffce0196ea50068885d085ff981e4b8396f4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:53:14 +0000 Subject: MIPS: Consistently handle buffer counter with PTRACE_SETREGSET Update commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") bug and consistently consume all data supplied to `fpr_set_msa' with the ptrace(2) PTRACE_SETREGSET request, such that a zero data buffer counter is returned where insufficient data has been given to fill a whole number of FP general registers. In reality this is not going to happen, as the caller is supposed to only supply data covering a whole number of registers and it is verified in `ptrace_regset' and again asserted in `fpr_set', however structuring code such that the presence of trailing partial FP general register data causes `fpr_set_msa' to return with a non-zero data buffer counter makes it appear that this trailing data will be used if there are subsequent writes made to FP registers, which is going to be the case with the FCSR once the missing write to that register has been fixed. Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Signed-off-by: Maciej W. Rozycki Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17927/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 7fcadaaf330f..47a01d5f26ea 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -504,7 +504,7 @@ static int fpr_set_msa(struct task_struct *target, int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) { err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); -- cgit v1.2.3 From be07a6a1188372b6d19a3307ec33211fc9c9439d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:54:33 +0000 Subject: MIPS: Fix an FCSR access API regression with NT_PRFPREG and MSA Fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") public API regression, then activated by commit 1db1af84d6df ("MIPS: Basic MSA context switching support"), that caused the FCSR register not to be read or written for CONFIG_CPU_HAS_MSA kernel configurations (regardless of actual presence or absence of the MSA feature in a given processor) with ptrace(2) PTRACE_GETREGSET and PTRACE_SETREGSET requests nor recorded in core dumps. This is because with !CONFIG_CPU_HAS_MSA configurations the whole of `elf_fpregset_t' array is bulk-copied as it is, which includes the FCSR in one half of the last, 33rd slot, whereas with CONFIG_CPU_HAS_MSA configurations array elements are copied individually, and then only the leading 32 FGR slots while the remaining slot is ignored. Correct the code then such that only FGR slots are copied in the respective !MSA and MSA helpers an then the FCSR slot is handled separately in common code. Use `ptrace_setfcr31' to update the FCSR too, so that the read-only mask is respected. Retrieving a correct value of FCSR is important in debugging not only for the human to be able to get the right interpretation of the situation, but for correct operation of GDB as well. This is because the condition code bits in FSCR are used by GDB to determine the location to place a breakpoint at when single-stepping through an FPU branch instruction. If such a breakpoint is placed incorrectly (i.e. with the condition reversed), then it will be missed, likely causing the debuggee to run away from the control of GDB and consequently breaking the process of investigation. Fortunately GDB continues using the older PTRACE_GETFPREGS ptrace(2) request which is unaffected, so the regression only really hits with post-mortem debug sessions using a core dump file, in which case execution, and consequently single-stepping through branches is not possible. Of course core files created by buggy kernels out there will have the value of FCSR recorded clobbered, but such core files cannot be corrected and the person using them simply will have to be aware that the value of FCSR retrieved is not reliable. Which also means we can likely get away without defining a replacement API which would ensure a correct value of FSCR to be retrieved, or none at all. This is based on previous work by Alex Smith, extensively rewritten. Signed-off-by: Alex Smith Signed-off-by: James Hogan Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: Paul Burton Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17928/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 47a01d5f26ea..0a939593ccb7 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -422,7 +422,7 @@ static int gpr64_set(struct task_struct *target, /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots - * correspond 1:1 to buffer slots. + * correspond 1:1 to buffer slots. Only general registers are copied. */ static int fpr_get_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -430,13 +430,14 @@ static int fpr_get_fpa(struct task_struct *target, { return user_regset_copyout(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's - * general register slots are copied to buffer slots. + * general register slots are copied to buffer slots. Only general + * registers are copied. */ static int fpr_get_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -458,20 +459,29 @@ static int fpr_get_msa(struct task_struct *target, return 0; } -/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. + */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); int err; - /* XXX fcr31 */ - if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); return err; } @@ -479,7 +489,7 @@ static int fpr_get(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP - * context's general register slots. + * context's general register slots. Only general registers are copied. */ static int fpr_set_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -487,13 +497,14 @@ static int fpr_set_fpa(struct task_struct *target, { return user_regset_copyin(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 - * bits only of FP context's general register slots. + * bits only of FP context's general register slots. Only general + * registers are copied. */ static int fpr_set_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -518,6 +529,8 @@ static int fpr_set_msa(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. * * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', * which is supposed to have been guaranteed by the kernel before @@ -530,18 +543,30 @@ static int fpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + u32 fcr31; int err; BUG_ON(count % sizeof(elf_fpreg_t)); - /* XXX fcr31 */ - init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + if (count > 0) { + err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); + if (err) + return err; + + ptrace_setfcr31(target, fcr31); + } return err; } -- cgit v1.2.3 From 006501e039eec411842bb3150c41358867d320c2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:55:40 +0000 Subject: MIPS: Also verify sizeof `elf_fpreg_t' with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and like with the PTRACE_GETREGSET ptrace(2) request also apply a BUILD_BUG_ON check for the size of the `elf_fpreg_t' type in the PTRACE_SETREGSET request handler. Signed-off-by: Maciej W. Rozycki Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17929/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0a939593ccb7..256908951a7c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -447,6 +447,7 @@ static int fpr_get_msa(struct task_struct *target, u64 fpr_val; int err; + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); err = user_regset_copyout(pos, count, kbuf, ubuf, -- cgit v1.2.3 From c8c5a3a24d395b14447a9a89d61586a913840a3b Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:56:54 +0000 Subject: MIPS: Disallow outsized PTRACE_SETREGSET NT_PRFPREG regset accesses Complement commit c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") and also reject outsized PTRACE_SETREGSET requests to the NT_PRFPREG regset, like with the NT_PRSTATUS regset. Signed-off-by: Maciej W. Rozycki Fixes: c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.17+ Patchwork: https://patchwork.linux-mips.org/patch/17930/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 256908951a7c..0b23b1ad99e6 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -550,6 +550,9 @@ static int fpr_set(struct task_struct *target, BUG_ON(count % sizeof(elf_fpreg_t)); + if (pos + count > sizeof(elf_fpregset_t)) + return -EIO; + init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) -- cgit v1.2.3 From f41d84dddc66b164ac16acf3f584c276146f1c48 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Dec 2017 17:59:15 +0530 Subject: powerpc/perf: Dereference BHRB entries safely It's theoretically possible that branch instructions recorded in BHRB (Branch History Rolling Buffer) entries have already been unmapped before they are processed by the kernel. Hence, trying to dereference such memory location will result in a crash. eg: Unable to handle kernel paging request for data at address 0xd000000019c41764 Faulting instruction address: 0xc000000000084a14 NIP [c000000000084a14] branch_target+0x4/0x70 LR [c0000000000eb828] record_and_restart+0x568/0x5c0 Call Trace: [c0000000000eb3b4] record_and_restart+0xf4/0x5c0 (unreliable) [c0000000000ec378] perf_event_interrupt+0x298/0x460 [c000000000027964] performance_monitor_exception+0x54/0x70 [c000000000009ba4] performance_monitor_common+0x114/0x120 Fix it by deferefencing the addresses safely. Fixes: 691231846ceb ("powerpc/perf: Fix setting of "to" addresses for BHRB") Cc: stable@vger.kernel.org # v3.10+ Suggested-by: Naveen N. Rao Signed-off-by: Ravi Bangoria Reviewed-by: Naveen N. Rao [mpe: Use probe_kernel_read() which is clearer, tweak change log] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/core-book3s.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 153812966365..fce545774d50 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr) int ret; __u64 target; - if (is_kernel_addr(addr)) - return branch_target((unsigned int *)addr); + if (is_kernel_addr(addr)) { + if (probe_kernel_read(&instr, (void *)addr, sizeof(instr))) + return 0; + + return branch_target(&instr); + } /* Userspace: need copy instruction here then translate it */ pagefault_disable(); -- cgit v1.2.3 From ad2b6e01024ef23bddc3ce0bcb115ecd8c520b7e Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 5 Dec 2017 11:00:38 +0530 Subject: powerpc/perf/imc: Fix nest-imc cpuhotplug callback failure Oops is observed during boot: Faulting instruction address: 0xc000000000248340 cpu 0x0: Vector: 380 (Data Access Out of Range) at [c000000ff66fb850] pc: c000000000248340: event_function_call+0x50/0x1f0 lr: c00000000024878c: perf_remove_from_context+0x3c/0x100 sp: c000000ff66fbad0 msr: 9000000000009033 dar: 7d20e2a6f92d03c0 pid = 14, comm = cpuhp/0 While registering the cpuhotplug callbacks for nest-imc, if we fail in the cpuhotplug online path for any random node in a multi node system (because the opal call to stop nest-imc counters fails for that node), ppc_nest_imc_cpu_offline() will get invoked for other nodes who successfully returned from cpuhotplug online path. This call trace is generated since in the ppc_nest_imc_cpu_offline() path we are trying to migrate the event context, when nest-imc counters are not even initialized. Patch to add a check to ensure that nest-imc is registered before migrating the event context. Fixes: 885dcd709ba9 ("powerpc/perf: Add nest IMC PMU support") Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 0ead3cd73caa..f1b940714d65 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -309,6 +309,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask)) return 0; + /* + * Check whether nest_imc is registered. We could end up here if the + * cpuhotplug callback registration fails. i.e, callback invokes the + * offline path for all successfully registered nodes. At this stage, + * nest_imc pmu will not be registered and we should return here. + * + * We return with a zero since this is not an offline failure. And + * cpuhp_setup_state() returns the actual failure reason to the caller, + * which in turn will call the cleanup routine. + */ + if (!nest_pmus) + return 0; + /* * Now that this cpu is one of the designated, * find a next cpu a) which is online and b) in same chip. -- cgit v1.2.3 From 110df8bd3e418b3476cae80babe8add48a8ea523 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Thu, 7 Dec 2017 22:53:27 +0530 Subject: powerpc/perf: Fix kfree memory allocated for nest pmus imc_common_cpuhp_mem_free() is the common function for all IMC (In-memory Collection counters) domains to unregister cpuhotplug callback and free memory. Since kfree of memory allocated for nest-imc (per_nest_pmu_arr) is in the common code, all domains (core/nest/thread) can do the kfree in the failure case. This could potentially create a call trace as shown below, where core(/thread/nest) imc pmu initialization fails and in the failure path imc_common_cpuhp_mem_free() free the memory(per_nest_pmu_arr), which is allocated by successfully registered nest units. The call trace is generated in a scenario where core-imc initialization is made to fail and a cpuhotplug is performed in a p9 system. During cpuhotplug ppc_nest_imc_cpu_offline() tries to access per_nest_pmu_arr, which is already freed by core-imc. NIP [c000000000cb6a94] mutex_lock+0x34/0x90 LR [c000000000cb6a88] mutex_lock+0x28/0x90 Call Trace: mutex_lock+0x28/0x90 (unreliable) perf_pmu_migrate_context+0x90/0x3a0 ppc_nest_imc_cpu_offline+0x190/0x1f0 cpuhp_invoke_callback+0x160/0x820 cpuhp_thread_fun+0x1bc/0x270 smpboot_thread_fn+0x250/0x290 kthread+0x1a8/0x1b0 ret_from_kernel_thread+0x5c/0x74 To address this scenario do the kfree(per_nest_pmu_arr) only in case of nest-imc initialization failure, and when there is no other nest units registered. Fixes: 73ce9aec65b1 ("powerpc/perf: Fix IMC_MAX_PMU macro") Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index f1b940714d65..be4e7f84f70a 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1184,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) if (nest_pmus == 1) { cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); + kfree(per_nest_pmu_arr); } if (nest_pmus > 0) @@ -1208,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); kfree(pmu_ptr); - kfree(per_nest_pmu_arr); return; } @@ -1322,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id ret = nest_pmu_cpumask_init(); if (ret) { mutex_unlock(&nest_init_lock); + kfree(nest_imc_refc); + kfree(per_nest_pmu_arr); goto err_free; } } -- cgit v1.2.3 From a5f1005517534aeb1fac20180badfbf0896c183c Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Fri, 1 Dec 2017 18:47:32 +0100 Subject: s390/pci: handle insufficient resources during dma tlb flush In a virtualized setup lazy flushing can lead to the hypervisor running out of resources when lots of guest pages need to be pinned. In this situation simply trigger a global flush to give the hypervisor a chance to free some of these resources. Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Reviewed-by: Pierre Morel Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci_dma.c | 21 +++++++++++++++++++-- arch/s390/pci/pci_insn.c | 3 +++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index f7aa5a77827e..2d15d84c20ed 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -181,6 +181,9 @@ out_unlock: static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, size_t size, int flags) { + unsigned long irqflags; + int ret; + /* * With zdev->tlb_refresh == 0, rpcit is not required to establish new * translations when previously invalid translation-table entries are @@ -196,8 +199,22 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, return 0; } - return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, - PAGE_ALIGN(size)); + ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, + PAGE_ALIGN(size)); + if (ret == -ENOMEM && !s390_iommu_strict) { + /* enable the hypervisor to free some resources */ + if (zpci_refresh_global(zdev)) + goto out; + + spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags); + bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap, + zdev->lazy_bitmap, zdev->iommu_pages); + bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages); + spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags); + ret = 0; + } +out: + return ret; } static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa, diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 19bcb3b45a70..f069929e8211 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -89,6 +89,9 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) if (cc) zpci_err_insn(cc, status, addr, range); + if (cc == 1 && (status == 4 || status == 16)) + return -ENOMEM; + return (cc) ? -EIO : 0; } -- cgit v1.2.3 From faa75e147b583417273902552c61cf3250a44308 Mon Sep 17 00:00:00 2001 From: Dongjiu Geng Date: Wed, 13 Dec 2017 18:36:47 +0800 Subject: arm64: fault: avoid send SIGBUS two times do_sea() calls arm64_notify_die() which will always signal user-space. It also returns whether APEI claimed the external abort as a RAS notification. If it returns failure do_mem_abort() will signal user-space too. do_mem_abort() wants to know if we handled the error, we always call arm64_notify_die() so can always return success. Signed-off-by: Dongjiu Geng Reviewed-by: James Morse Reviewed-by: Xie XiuQi Signed-off-by: Will Deacon --- arch/arm64/mm/fault.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 22168cd0dde7..9b7f89df49db 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -574,7 +574,6 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) { struct siginfo info; const struct fault_info *inf; - int ret = 0; inf = esr_to_fault_info(esr); pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n", @@ -589,7 +588,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) if (interrupts_enabled(regs)) nmi_enter(); - ret = ghes_notify_sea(); + ghes_notify_sea(); if (interrupts_enabled(regs)) nmi_exit(); @@ -604,7 +603,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) info.si_addr = (void __user *)addr; arm64_notify_die("", regs, &info, esr); - return ret; + return 0; } static const struct fault_info fault_info[] = { -- cgit v1.2.3 From cd8165c3d5fb07667328434835f2968a87caee67 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Dec 2017 09:29:51 +0000 Subject: ARM: dts: vf610-zii-dev: use XAUI for DSA link ports Use XAUI rather than XGMII for DSA link ports, as this is the interface mode that the switches actually use. XAUI is the 4 lane bus with clock per direction, whereas XGMII is a 32 bit bus with clock. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- arch/arm/boot/dts/vf610-zii-dev-rev-c.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts index 02a6227c717c..15a685dc2aa2 100644 --- a/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts +++ b/arch/arm/boot/dts/vf610-zii-dev-rev-c.dts @@ -121,7 +121,7 @@ switch0port10: port@10 { reg = <10>; label = "dsa"; - phy-mode = "xgmii"; + phy-mode = "xaui"; link = <&switch1port10>; }; }; @@ -208,7 +208,7 @@ switch1port10: port@10 { reg = <10>; label = "dsa"; - phy-mode = "xgmii"; + phy-mode = "xaui"; link = <&switch0port10>; }; }; -- cgit v1.2.3 From 1d08a044cf12aee37dfd54837558e3295287b343 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 13 Dec 2017 11:45:42 +0000 Subject: arm64: fix CONFIG_DEBUG_WX address reporting In ptdump_check_wx(), we pass walk_pgd() a start address of 0 (rather than VA_START) for the init_mm. This means that any reported W&X addresses are offset by VA_START, which is clearly wrong and can make them appear like userspace addresses. Fix this by telling the ptdump code that we're walking init_mm starting at VA_START. We don't need to update the addr_markers, since these are still valid bounds regardless. Cc: Fixes: 1404d6f13e47 ("arm64: dump: Add checking for writable and exectuable pages") Signed-off-by: Mark Rutland Cc: Kees Cook Cc: Laura Abbott Reported-by: Timur Tabi Signed-off-by: Will Deacon --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index ca74a2aace42..7b60d62ac593 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -389,7 +389,7 @@ void ptdump_check_wx(void) .check_wx = true, }; - walk_pgd(&st, &init_mm, 0); + walk_pgd(&st, &init_mm, VA_START); note_page(&st, 0, 0, 0); if (st.wx_pages || st.uxn_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", -- cgit v1.2.3 From 3fab39997a98b97138c886978af660c4f6c7e9e6 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 14 Dec 2017 14:03:44 +0000 Subject: arm64/sve: Report SVE to userspace via CPUID only if supported Currently, the SVE field in ID_AA64PFR0_EL1 is visible unconditionally to userspace via the CPU ID register emulation, irrespective of the kernel config. This means that if a kernel configured with CONFIG_ARM64_SVE=n is run on SVE-capable hardware, userspace will see SVE reported as present in the ID regs even though the kernel forbids execution of SVE instructions. This patch makes the exposure of the SVE field in ID_AA64PFR0_EL1 conditional on CONFIG_ARM64_SVE=y. Since future architecture features are likely to encounter a similar requirement, this patch adds a suitable helper macros for use when declaring config-conditional ID register fields. Fixes: 43994d824e84 ("arm64/sve: Detect SVE and activate runtime support") Reviewed-by: Suzuki K Poulose Reported-by: Mark Rutland Signed-off-by: Dave Martin Cc: Suzuki Poulose Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 3 +++ arch/arm64/kernel/cpufeature.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index ac67cfc2585a..060e3a4008ab 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -60,6 +60,9 @@ enum ftr_type { #define FTR_VISIBLE true /* Feature visible to the user space */ #define FTR_HIDDEN false /* Feature is hidden from the user */ +#define FTR_VISIBLE_IF_IS_ENABLED(config) \ + (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN) + struct arm64_ftr_bits { bool sign; /* Value is signed ? */ bool visible; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c5ba0097887f..a73a5928f09b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -145,7 +145,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 1 + fs/nfsd/auth.c | 3 +++ include/linux/cred.h | 1 + kernel/groups.c | 5 +++-- kernel/uid16.c | 1 + net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 + net/sunrpc/auth_gss/svcauth_gss.c | 1 + net/sunrpc/svcauth_unix.c | 2 ++ 8 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index f04db3779b34..59eea9c65d3e 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae7792d..f650e475d8f0 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index c4778cae58ef..444380f968f1 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -231,6 +231,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, goto out_free_groups; creds->cr_group_info->gid[i] = kgid; } + groups_sort(creds->cr_group_info); return 0; out_free_groups: diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 5dd4e6c9fef2..26531193fce4 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -481,6 +481,7 @@ static int rsc_parse(struct cache_detail *cd, goto out; rsci.cred.cr_group_info->gid[i] = kgid; } + groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 740b67d5a733..af7f28fb8102 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -520,6 +520,7 @@ static int unix_gid_parse(struct cache_detail *cd, ug.gi->gid[i] = kgid; } + groups_sort(ug.gi); ugp = unix_gid_lookup(cd, uid); if (ugp) { struct cache_head *ch; @@ -819,6 +820,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); cred->cr_group_info->gid[i] = kgid; } + groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; -- cgit v1.2.3 From f29810335965ac1f7bcb501ee2af5f039f792416 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Thu, 14 Dec 2017 03:01:52 -0500 Subject: KVM/x86: Check input paging mode when cs.l is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: WARNING: CPU: 0 PID: 27962 at arch/x86/kvm/emulate.c:5631 x86_emulate_insn+0x557/0x15f0 [kvm] Modules linked in: kvm_intel kvm [last unloaded: kvm] CPU: 0 PID: 27962 Comm: syz-executor Tainted: G B W 4.15.0-rc2-next-20171208+ #32 Hardware name: Intel Corporation S1200SP/S1200SP, BIOS S1200SP.86B.01.03.0006.040720161253 04/07/2016 RIP: 0010:x86_emulate_insn+0x557/0x15f0 [kvm] RSP: 0018:ffff8807234476d0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff88072d0237a0 RCX: ffffffffa0065c4d RDX: 1ffff100e5a046f9 RSI: 0000000000000003 RDI: ffff88072d0237c8 RBP: ffff880723447728 R08: ffff88072d020000 R09: ffffffffa008d240 R10: 0000000000000002 R11: ffffed00e7d87db3 R12: ffff88072d0237c8 R13: ffff88072d023870 R14: ffff88072d0238c2 R15: ffffffffa008d080 FS: 00007f8a68666700(0000) GS:ffff880802200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002009506c CR3: 000000071fec4005 CR4: 00000000003626f0 Call Trace: x86_emulate_instruction+0x3bc/0xb70 [kvm] ? reexecute_instruction.part.162+0x130/0x130 [kvm] vmx_handle_exit+0x46d/0x14f0 [kvm_intel] ? trace_event_raw_event_kvm_entry+0xe7/0x150 [kvm] ? handle_vmfunc+0x2f0/0x2f0 [kvm_intel] ? wait_lapic_expire+0x25/0x270 [kvm] vcpu_enter_guest+0x720/0x1ef0 [kvm] ... When CS.L is set, vcpu should run in the 64 bit paging mode. Current kvm set_sregs function doesn't have such check when userspace inputs sreg values. This will lead unexpected behavior. This patch is to add checks for CS.L, EFER.LME, EFER.LMA and CR4.PAE when get SREG inputs from userspace in order to avoid unexpected behavior. Suggested-by: Paolo Bonzini Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Dmitry Vyukov Cc: Jim Mattson Signed-off-by: Tianyu Lan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 56d036b9ad75..3a82f2d4333b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7494,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) { + /* + * When EFER.LME and CR0.PG are set, the processor is in + * 64-bit mode (though maybe in a 32-bit code segment). + * CR4.PAE and EFER.LMA must be set. + */ + if (!(sregs->cr4 & X86_CR4_PAE_BIT) + || !(sregs->efer & EFER_LMA)) + return -EINVAL; + } else { + /* + * Not in 64-bit mode: EFER.LMA is clear and the code + * segment cannot be 64-bit. + */ + if (sregs->efer & EFER_LMA || sregs->cs.l) + return -EINVAL; + } + + return 0; +} + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -7506,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, (sregs->cr4 & X86_CR4_OSXSAVE)) return -EINVAL; + if (kvm_valid_sregs(vcpu, sregs)) + return -EINVAL; + apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) -- cgit v1.2.3 From 090edbe23ff57940fca7f57d9165ce57a826bd7a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:05 -0800 Subject: x86/power/64: Use struct desc_ptr for the IDT in struct saved_context x86_64's saved_context nonsensically used separate idt_limit and idt_base fields and then cast &idt_limit to struct desc_ptr *. This was correct (with -fno-strict-aliasing), but it's confusing, served no purpose, and required #ifdeffery. Simplify this by using struct desc_ptr directly. No change in functionality. Tested-by: Jarkko Nikula Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Link: http://lkml.kernel.org/r/967909ce38d341b01d45eff53e278e2728a3a93a.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/suspend_64.h | 3 +-- arch/x86/power/cpu.c | 11 +---------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..600e9e0aea51 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -30,8 +30,7 @@ struct saved_context { u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct desc_ptr idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 5191de14f4df..472bc8c8212b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ -#ifdef CONFIG_X86_32 store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif + /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -219,12 +215,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ -#ifdef CONFIG_X86_32 load_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - load_idt((const struct desc_ptr *)&ctxt->idt_limit); -#endif #ifdef CONFIG_X86_64 /* -- cgit v1.2.3 From 896c80bef4d3b357814a476663158aaf669d0fb3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:06 -0800 Subject: x86/power/32: Move SYSENTER MSR restoration to fix_processor_context() x86_64 restores system call MSRs in fix_processor_context(), and x86_32 restored them along with segment registers. The 64-bit variant makes more sense, so move the 32-bit code to match the 64-bit code. No side effects are expected to runtime behavior. Tested-by: Jarkko Nikula Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Link: http://lkml.kernel.org/r/65158f8d7ee64dd6bbc6c1c83b3b34aaa854e3ae.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/power/cpu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 472bc8c8212b..033c61e6891b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -174,6 +174,9 @@ static void fix_processor_context(void) write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ +#else + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ @@ -237,12 +240,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) loadsegment(fs, ctxt->fs); loadsegment(gs, ctxt->gs); loadsegment(ss, ctxt->ss); - - /* - * sysenter MSRs - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); #else /* CONFIG_X86_64 */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); -- cgit v1.2.3 From 7ee18d677989e99635027cee04c878950e0752b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Dec 2017 13:19:07 -0800 Subject: x86/power: Make restore_processor_context() sane My previous attempt to fix a couple of bugs in __restore_processor_context(): 5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()") ... introduced yet another bug, breaking suspend-resume. Rather than trying to come up with a minimal fix, let's try to clean it up for real. This patch fixes quite a few things: - The old code saved a nonsensical subset of segment registers. The only registers that need to be saved are those that contain userspace state or those that can't be trivially restored without percpu access working. (On x86_32, we can restore percpu access by writing __KERNEL_PERCPU to %fs. On x86_64, it's easier to save and restore the kernel's GSBASE.) With this patch, we restore hardcoded values to the kernel state where applicable and explicitly restore the user state after fixing all the descriptor tables. - We used to use an unholy mix of inline asm and C helpers for segment register access. Let's get rid of the inline asm. This fixes the reported s2ram hangs and make the code all around more logical. Analyzed-by: Linus Torvalds Reported-by: Jarkko Nikula Reported-by: Pavel Machek Tested-by: Jarkko Nikula Tested-by: Pavel Machek Signed-off-by: Andy Lutomirski Acked-by: Rafael J. Wysocki Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Zhang Rui Fixes: 5b06bbcfc2c6 ("x86/power: Fix some ordering bugs in __restore_processor_context()") Link: http://lkml.kernel.org/r/398ee68e5c0f766425a7b746becfc810840770ff.1513286253.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/suspend_32.h | 8 +++- arch/x86/include/asm/suspend_64.h | 16 +++++++- arch/x86/power/cpu.c | 79 ++++++++++++++++++++------------------- 3 files changed, 62 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@ /* image of the saved processor state */ struct saved_context { - u16 es, fs, gs, ss; + /* + * On x86_32, all segment registers, with the possible exception of + * gs, are saved at kernel entry in pt_regs. + */ +#ifdef CONFIG_X86_32_LAZY_GS + u16 gs; +#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 600e9e0aea51..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@ */ struct saved_context { struct pt_regs regs; - u16 ds, es, fs, gs, ss; - unsigned long gs_base, gs_kernel_base, fs_base; + + /* + * User CS and SS are saved in current_pt_regs(). The rest of the + * segment selectors need to be saved and restored here. + */ + u16 ds, es, fs, gs; + + /* + * Usermode FSBASE and GSBASE may not match the fs and gs selectors, + * so we save them separately. We save the kernelmode GSBASE to + * restore percpu access after resume. + */ + unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 033c61e6891b..36a28eddb435 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -99,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 + savesegment(gs, ctxt->gs); + savesegment(fs, ctxt->fs); + savesegment(ds, ctxt->ds); + savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); @@ -189,9 +185,12 @@ static void fix_processor_context(void) } /** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { @@ -214,46 +213,50 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); + /* Restore the IDT. */ + load_idt(&ctxt->idt); + /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). + * Just in case the asm code got us here with the SS, DS, or ES + * out of sync with the GDT, update them. */ - load_idt(&ctxt->idt); + loadsegment(ss, __KERNEL_DS); + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); -#ifdef CONFIG_X86_64 /* - * We need GSBASE restored before percpu access can work. - * percpu access can happen in exception handlers or in complicated - * helpers like load_gs_index(). + * Restore percpu access. Percpu access can happen in exception + * handlers or in complicated helpers like load_gs_index(). */ - wrmsrl(MSR_GS_BASE, ctxt->gs_base); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); +#else + loadsegment(fs, __KERNEL_PERCPU); + loadsegment(gs, __KERNEL_STACK_CANARY); #endif + /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ fix_processor_context(); /* - * Restore segment registers. This happens after restoring the GDT - * and LDT, which happen in fix_processor_context(). + * Now that we have descriptor tables fully restored and working + * exception handling, restore the usermode segments. */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 + loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); /* - * Restore FSBASE and user GSBASE after reloading the respective - * segment selectors. + * Restore FSBASE and GSBASE after restoring the selectors, since + * restoring the selectors clobbers the bases. Keep in mind + * that MSR_KERNEL_GS_BASE is horribly misnamed. */ wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) + loadsegment(gs, ctxt->gs); #endif do_fpu_end(); -- cgit v1.2.3 From f5b5fab1780c98b74526dbac527574bd02dc16f8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Dec 2017 10:38:36 -0800 Subject: x86/decoder: Fix and update the opcodes map Update x86-opcode-map.txt based on the October 2017 Intel SDM publication. Fix INVPID to INVVPID. Add UD0 and UD1 instruction opcodes. Also sync the objtool and perf tooling copies of this file. Signed-off-by: Randy Dunlap Acked-by: Masami Hiramatsu Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 13 +++++++++++-- tools/objtool/arch/x86/insn/x86-opcode-map.txt | 15 ++++++++++++--- tools/perf/util/intel-pt-decoder/x86-opcode-map.txt | 15 ++++++++++++--- 3 files changed, 35 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index c4d55919fac1..e0b85930dd77 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt index 12e377184ee4..e0b85930dd77 100644 --- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt +++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt index 12e377184ee4..e0b85930dd77 100644 --- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM -- cgit v1.2.3 From 9f37e797547cca9d14fe1f0f43f5c89b261ff0b0 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 15 Dec 2017 14:16:04 +0100 Subject: s390: fix preemption race in disable_sacf_uaccess With CONFIG_PREEMPT=y there is a possible race in disable_sacf_uaccess. The new set_fs value needs to be stored the the task structure first, the control register update needs to be second. Otherwise a preemptive schedule may interrupt the code right after the control register update has been done and the next time the task is scheduled we get an incorrect value in the control register due to the old set_fs setting. Fixes: 0aaba41b58 ("s390: remove all code using the access register mode") Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index cae5a1e16cbd..c4f8039a35e8 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -89,11 +89,11 @@ EXPORT_SYMBOL(enable_sacf_uaccess); void disable_sacf_uaccess(mm_segment_t old_fs) { + current->thread.mm_segment = old_fs; if (old_fs == USER_DS && test_facility(27)) { __ctl_load(S390_lowcore.user_asce, 1, 1); clear_cpu_flag(CIF_ASCE_PRIMARY); } - current->thread.mm_segment = old_fs; } EXPORT_SYMBOL(disable_sacf_uaccess); -- cgit v1.2.3 From c739f930be1dd5fd949030e3475a884fe06dae9b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 12 Dec 2017 07:56:36 -0800 Subject: x86/espfix/64: Fix espfix double-fault handling on 5-level systems Using PGDIR_SHIFT to identify espfix64 addresses on 5-level systems was wrong, and it resulted in panics due to unhandled double faults. Use P4D_SHIFT instead, which is correct on 4-level and 5-level machines. This fixes a panic when running x86 selftests on 5-level machines. Signed-off-by: Andy Lutomirski Acked-by: Kirill A. Shutemov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1d33b219563f ("x86/espfix: Add support for 5-level paging") Link: http://lkml.kernel.org/r/24c898b4f44fdf8c22d93703850fb384ef87cfdc.1513035461.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7b0f74a2150..c751518936ac 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -355,7 +355,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { -- cgit v1.2.3 From a4544831370618cb3627e27ffcc27d1cc857868f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 15 Dec 2017 16:07:22 +0000 Subject: arm64: fpsimd: Fix copying of FP state from signal frame into task struct Commit 9de52a755cfb6da5 ("arm64: fpsimd: Fix failure to restore FPSIMD state after signals") fixed an issue reported in our FPSIMD signal restore code but inadvertently introduced another issue which tends to manifest as random SEGVs in userspace. The problem is that when we copy the struct fpsimd_state from the kernel stack (populated from the signal frame) into the struct held in the current thread_struct, we blindly copy uninitialised stack into the "cpu" field, which means that context-switching of the FP registers is no longer reliable. This patch fixes the problem by copying only the user_fpsimd member of struct fpsimd_state. We should really rework the function prototypes to take struct user_fpsimd_state * instead, but let's just get this fixed for now. Cc: Dave Martin Fixes: 9de52a755cfb6da5 ("arm64: fpsimd: Fix failure to restore FPSIMD state after signals") Reported-by: Geert Uytterhoeven Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 540a1e010eb5..fae81f7964b4 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1043,7 +1043,7 @@ void fpsimd_update_current_state(struct fpsimd_state *state) local_bh_disable(); - current->thread.fpsimd_state = *state; + current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd; if (system_supports_sve() && test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); -- cgit v1.2.3 From 6d59b7dbf72ed20d0138e2f9b75ca3d4a9d4faca Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Dec 2017 21:07:23 +0100 Subject: bpf, s390x: do not reload skb pointers in non-skb context The assumption of unconditionally reloading skb pointers on BPF helper calls where bpf_helper_changes_pkt_data() holds true is wrong. There can be different contexts where the BPF helper would enforce a reload such as in case of XDP. Here, we do have a struct xdp_buff instead of struct sk_buff as context, thus this will access garbage. JITs only ever need to deal with cached skb pointer reload when ld_abs/ind was seen, therefore guard the reload behind SEEN_SKB only. Tested on s390x. Fixes: 9db7f2b81880 ("s390/bpf: recache skb->data/hlen for skb_vlan_push/pop") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Michael Holzheu Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e81c16838b90..9557d8b516df 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -55,8 +55,7 @@ struct bpf_jit { #define SEEN_LITERAL 8 /* code uses literals */ #define SEEN_FUNC 16 /* calls C functions */ #define SEEN_TAIL_CALL 32 /* code uses tail calls */ -#define SEEN_SKB_CHANGE 64 /* code changes skb data */ -#define SEEN_REG_AX 128 /* code uses constant blinding */ +#define SEEN_REG_AX 64 /* code uses constant blinding */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) /* @@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth) EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, 152); } - if (jit->seen & SEEN_SKB) + if (jit->seen & SEEN_SKB) { emit_load_skb_data_hlen(jit); - if (jit->seen & SEEN_SKB_CHANGE) /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15, STK_OFF_SKBP); + } } /* @@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_pkt_data((void *)func)) { - jit->seen |= SEEN_SKB_CHANGE; + if ((jit->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data((void *)func)) { /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, REG_15, STK_OFF_SKBP); -- cgit v1.2.3 From 87338c8e2cbb317b5f757e6172f94e2e3799cd20 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Dec 2017 21:07:24 +0100 Subject: bpf, ppc64: do not reload skb pointers in non-skb context The assumption of unconditionally reloading skb pointers on BPF helper calls where bpf_helper_changes_pkt_data() holds true is wrong. There can be different contexts where the helper would enforce a reload such as in case of XDP. Here, we do have a struct xdp_buff instead of struct sk_buff as context, thus this will access garbage. JITs only ever need to deal with cached skb pointer reload when ld_abs/ind was seen, therefore guard the reload behind SEEN_SKB. Fixes: 156d0e290e96 ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") Signed-off-by: Daniel Borkmann Reviewed-by: Naveen N. Rao Acked-by: Alexei Starovoitov Tested-by: Sandipan Das Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 46d74e81aff1..d183b4801bdb 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -763,7 +763,8 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_pkt_data(func)) + if ((ctx->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -772,7 +773,8 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_pkt_data(func)) { + if ((ctx->seen & SEEN_SKB) && + bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); -- cgit v1.2.3 From 07aee94394547721ac168cbf4e1c09c14a5fe671 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Dec 2017 21:07:26 +0100 Subject: bpf, sparc: fix usage of wrong reg for load_skb_regs after call When LD_ABS/IND is used in the program, and we have a BPF helper call that changes packet data (bpf_helper_changes_pkt_data() returns true), then in case of sparc JIT, we try to reload cached skb data from bpf2sparc[BPF_REG_6]. However, there is no such guarantee or assumption that skb sits in R6 at this point, all helpers changing skb data only have a guarantee that skb sits in R1. Therefore, store BPF R1 in L7 temporarily and after procedure call use L7 to reload cached skb data. skb sitting in R6 is only true at the time when LD_ABS/IND is executed. Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Signed-off-by: Daniel Borkmann Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/sparc/net/bpf_jit_comp_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 5765e7e711f7..ff5f9cb3039a 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 *func = ((u8 *)__bpf_call_base) + imm; ctx->saw_call = true; + if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) + emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx); emit_call((u32 *)func, ctx); emit_nop(ctx); emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx); - if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind) - load_skb_regs(ctx, bpf2sparc[BPF_REG_6]); + if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func)) + load_skb_regs(ctx, L7); break; } -- cgit v1.2.3 From f6f3732162b5ae3c771b9285a5a32d72b8586920 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Dec 2017 18:53:22 -0800 Subject: Revert "mm: replace p??_write with pte_access_permitted in fault + gup paths" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commits 5c9d2d5c269c, c7da82b894e9, and e7fe7b5cae90. We'll probably need to revisit this, but basically we should not complicate the get_user_pages_fast() case, and checking the actual page table protection key bits will require more care anyway, since the protection keys depend on the exact state of the VM in question. Particularly when doing a "remote" page lookup (ie in somebody elses VM, not your own), you need to be much more careful than this was. Dave Hansen says: "So, the underlying bug here is that we now a get_user_pages_remote() and then go ahead and do the p*_access_permitted() checks against the current PKRU. This was introduced recently with the addition of the new p??_access_permitted() calls. We have checks in the VMA path for the "remote" gups and we avoid consulting PKRU for them. This got missed in the pkeys selftests because I did a ptrace read, but not a *write*. I also didn't explicitly test it against something where a COW needed to be done" It's also not entirely clear that it makes sense to check the protection key bits at this level at all. But one possible eventual solution is to make the get_user_pages_fast() case just abort if it sees protection key bits set, which makes us fall back to the regular get_user_pages() case, which then has a vma and can do the check there if we want to. We'll see. Somewhat related to this all: what we _do_ want to do some day is to check the PAGE_USER bit - it should obviously always be set for user pages, but it would be a good check to have back. Because we have no generic way to test for it, we lost it as part of moving over from the architecture-specific x86 GUP implementation to the generic one in commit e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). Cc: Peter Zijlstra Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Andrew Morton Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 6 ------ arch/sparc/mm/gup.c | 4 ++-- fs/dax.c | 3 +-- mm/gup.c | 2 +- mm/hmm.c | 8 ++++---- mm/huge_memory.c | 6 +++--- mm/memory.c | 8 ++++---- 7 files changed, 15 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57d7bc92e0b8..0a6b0286c32e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1264,12 +1264,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud; } -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; -} - static inline pud_t pud_mkclean(pud_t pud) { if (pud_large(pud)) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 33c0f8bb0f33..5335ba3c850e 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, if (!(pmd_val(pmd) & _PAGE_VALID)) return 0; - if (!pmd_access_permitted(pmd, write)) + if (write && !pmd_write(pmd)) return 0; refs = 0; @@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, if (!(pud_val(pud) & _PAGE_VALID)) return 0; - if (!pud_access_permitted(pud, write)) + if (write && !pud_write(pud)) return 0; refs = 0; diff --git a/fs/dax.c b/fs/dax.c index 78b72c48374e..95981591977a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -627,8 +627,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) - && !pmd_access_permitted(*pmdp, WRITE)) + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) goto unlock_pmd; flush_cache_page(vma, address, pfn); diff --git a/mm/gup.c b/mm/gup.c index d3fb60e5bfac..e0d82b6706d7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -66,7 +66,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_access_permitted(pte, WRITE) || + return pte_write(pte) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); } diff --git a/mm/hmm.c b/mm/hmm.c index 3a5c172af560..ea19742a5d60 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,11 @@ again: if (pmd_protnone(pmd)) return hmm_vma_walk_clear(start, end, walk); - if (!pmd_access_permitted(pmd, write_fault)) + if (write_fault && !pmd_write(pmd)) return hmm_vma_walk_clear(start, end, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_access_permitted(pmd, WRITE) ? HMM_PFN_WRITE : 0; + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; for (; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; return 0; @@ -456,11 +456,11 @@ again: continue; } - if (!pte_access_permitted(pte, write_fault)) + if (write_fault && !pte_write(pte)) goto fault; pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_access_permitted(pte, WRITE) ? HMM_PFN_WRITE : 0; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; continue; fault: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2f5e774902..0e7ded98d114 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -870,7 +870,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, */ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - if (!pmd_access_permitted(*pmd, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) @@ -1012,7 +1012,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pud_lockptr(mm, pud)); - if (!pud_access_permitted(*pud, flags & FOLL_WRITE)) + if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) @@ -1386,7 +1386,7 @@ out_unlock: */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_access_permitted(pmd, WRITE) || + return pmd_write(pmd) || ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); } diff --git a/mm/memory.c b/mm/memory.c index cfaba6287702..ca5674cbaff2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3949,7 +3949,7 @@ static int handle_pte_fault(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { - if (!pte_access_permitted(entry, WRITE)) + if (!pte_write(entry)) return do_wp_page(vmf); entry = pte_mkdirty(entry); } @@ -4014,7 +4014,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* NUMA case for anonymous PUDs would go here */ - if (dirty && !pud_access_permitted(orig_pud, WRITE)) { + if (dirty && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4047,7 +4047,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); - if (dirty && !pmd_access_permitted(orig_pmd, WRITE)) { + if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4337,7 +4337,7 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = *ptep; - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); -- cgit v1.2.3 From 5f0e3fe6b1504d4e6530294ec87c473aa6d2d02f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 14 Nov 2017 09:10:11 -0500 Subject: x86/build: Make isoimage work on Debian Debian does not ship a 'mkisofs' symlink to genisoimage. All modern distros ship genisoimage, so just use that directly. That requires renaming the 'genisoimage' function. Also neaten up the 'for' loop while I'm in here. Signed-off-by: Matthew Wilcox Cc: Changbin Du Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/boot/genimage.sh | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index c9e8499fbfe7..6a10d52a4145 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -80,39 +80,43 @@ genfdimage288() { mcopy $FBZIMAGE w:linux } -genisoimage() { +geniso() { tmp_dir=`dirname $FIMAGE`/isoimage rm -rf $tmp_dir mkdir $tmp_dir - for i in lib lib64 share end ; do + for i in lib lib64 share ; do for j in syslinux ISOLINUX ; do if [ -f /usr/$i/$j/isolinux.bin ] ; then isolinux=/usr/$i/$j/isolinux.bin - cp $isolinux $tmp_dir fi done for j in syslinux syslinux/modules/bios ; do if [ -f /usr/$i/$j/ldlinux.c32 ]; then ldlinux=/usr/$i/$j/ldlinux.c32 - cp $ldlinux $tmp_dir fi done if [ -n "$isolinux" -a -n "$ldlinux" ] ; then break fi - if [ $i = end -a -z "$isolinux" ] ; then - echo 'Need an isolinux.bin file, please install syslinux/isolinux.' - exit 1 - fi done + if [ -z "$isolinux" ] ; then + echo 'Need an isolinux.bin file, please install syslinux/isolinux.' + exit 1 + fi + if [ -z "$ldlinux" ] ; then + echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' + exit 1 + fi + cp $isolinux $tmp_dir + cp $ldlinux $tmp_dir cp $FBZIMAGE $tmp_dir/linux echo "$KCMDLINE" > $tmp_dir/isolinux.cfg if [ -f "$FDINITRD" ] ; then cp "$FDINITRD" $tmp_dir/initrd.img fi - mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \ - -c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \ - $tmp_dir + genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ + -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ + -boot-info-table $tmp_dir isohybrid $FIMAGE 2>/dev/null || true rm -rf $tmp_dir } @@ -121,6 +125,6 @@ case $1 in bzdisk) genbzdisk;; fdimage144) genfdimage144;; fdimage288) genfdimage288;; - isoimage) genisoimage;; + isoimage) geniso;; *) echo 'Unknown image format'; exit 1; esac -- cgit v1.2.3 From 40d1a07b333ef1f7fce11ee20b8f4281d1a75fb9 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 27 Mar 2017 02:44:47 -0700 Subject: xtensa: enable stack protector The implementation is adopted from the ARM arch. GCC 7.3, 8 or newer is required for building the xtensa kernel with SSP. Signed-off-by: Max Filippov --- .../features/debug/stackprotector/arch-support.txt | 2 +- arch/xtensa/Kconfig | 1 + arch/xtensa/boot/lib/Makefile | 4 +++ arch/xtensa/include/asm/stackprotector.h | 40 ++++++++++++++++++++++ arch/xtensa/kernel/asm-offsets.c | 3 ++ arch/xtensa/kernel/entry.S | 6 ++++ arch/xtensa/kernel/process.c | 6 ++++ 7 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 arch/xtensa/include/asm/stackprotector.h (limited to 'arch') diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index d7acd7bd3619..59a4c9ffb7f3 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -35,5 +35,5 @@ | um: | TODO | | unicore32: | TODO | | x86: | ok | - | xtensa: | TODO | + | xtensa: | ok | ----------------------- diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index eb1f196c3f6e..fffe05b698ac 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,6 +15,7 @@ config XTENSA select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS diff --git a/arch/xtensa/boot/lib/Makefile b/arch/xtensa/boot/lib/Makefile index d2a7f48564a4..2fe182915b63 100644 --- a/arch/xtensa/boot/lib/Makefile +++ b/arch/xtensa/boot/lib/Makefile @@ -15,6 +15,10 @@ CFLAGS_REMOVE_inftrees.o = -pg CFLAGS_REMOVE_inffast.o = -pg endif +CFLAGS_REMOVE_inflate.o += -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_zmem.o += -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_inftrees.o += -fstack-protector -fstack-protector-strong +CFLAGS_REMOVE_inffast.o += -fstack-protector -fstack-protector-strong quiet_cmd_copy_zlib = COPY $@ cmd_copy_zlib = cat $< > $@ diff --git a/arch/xtensa/include/asm/stackprotector.h b/arch/xtensa/include/asm/stackprotector.h new file mode 100644 index 000000000000..e368f94fd2af --- /dev/null +++ b/arch/xtensa/include/asm/stackprotector.h @@ -0,0 +1,40 @@ +/* + * GCC stack protector support. + * + * (This is directly adopted from the ARM implementation) + * + * Stack protector works by putting predefined pattern at the start of + * the stack frame and verifying that it hasn't been overwritten when + * returning from the function. The pattern is called stack canary + * and gcc expects it to be defined by a global variable called + * "__stack_chk_guard" on Xtensa. This unfortunately means that on SMP + * we cannot have a different canary value per task. + */ + +#ifndef _ASM_STACKPROTECTOR_H +#define _ASM_STACKPROTECTOR_H 1 + +#include +#include + +extern unsigned long __stack_chk_guard; + +/* + * Initialize the stackprotector canary value. + * + * NOTE: this must only be called from functions that never return, + * and it must always be inlined. + */ +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. */ + get_random_bytes(&canary, sizeof(canary)); + canary ^= LINUX_VERSION_CODE; + + current->stack_canary = canary; + __stack_chk_guard = current->stack_canary; +} + +#endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index bcb5beb81177..3b119b372efd 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -76,6 +76,9 @@ int main(void) DEFINE(TASK_PID, offsetof (struct task_struct, pid)); DEFINE(TASK_THREAD, offsetof (struct task_struct, thread)); DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(TASK_STACK_CANARY, offsetof(struct task_struct, stack_canary)); +#endif DEFINE(TASK_STRUCT_SIZE, sizeof (struct task_struct)); /* offsets in thread_info struct */ diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index a27a9a65635b..5caff0744f3c 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -1971,6 +1971,12 @@ ENTRY(_switch_to) s32i a1, a2, THREAD_SP # save stack pointer #endif +#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP) + movi a6, __stack_chk_guard + l32i a8, a3, TASK_STACK_CANARY + s32i a8, a6, 0 +#endif + /* Disable ints while we manipulate the stack pointer. */ irq_save a14, a3 diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index ff4f0ecb03dd..8dd0593fb2c4 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -58,6 +58,12 @@ void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); +#ifdef CONFIG_CC_STACKPROTECTOR +#include +unsigned long __stack_chk_guard __read_mostly; +EXPORT_SYMBOL(__stack_chk_guard); +#endif + #if XTENSA_HAVE_COPROCESSORS void coprocessor_release_all(struct thread_info *ti) -- cgit v1.2.3 From c130d3be84afb9b5a30ce4f715f88a1c1dcc4114 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 15 Dec 2017 12:00:30 -0800 Subject: xtensa: clean up custom-controlled debug output Replace #ifdef'fed/commented out debug printk statements with pr_debug. Replace printk statements with pr_* equivalents. Signed-off-by: Max Filippov --- arch/xtensa/kernel/module.c | 19 +++++++--------- arch/xtensa/kernel/pci.c | 30 +++++++++---------------- arch/xtensa/kernel/setup.c | 7 +++--- arch/xtensa/kernel/signal.c | 8 ++----- arch/xtensa/kernel/traps.c | 25 ++++++++++++--------- arch/xtensa/lib/pci-auto.c | 45 ++++++++----------------------------- arch/xtensa/mm/cache.c | 3 --- arch/xtensa/mm/fault.c | 22 ++++++++---------- arch/xtensa/mm/tlb.c | 6 ++--- arch/xtensa/platforms/iss/console.c | 4 ++-- arch/xtensa/platforms/iss/network.c | 14 +++++------- 11 files changed, 66 insertions(+), 117 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/kernel/module.c b/arch/xtensa/kernel/module.c index b715237bae61..902845ddacb7 100644 --- a/arch/xtensa/kernel/module.c +++ b/arch/xtensa/kernel/module.c @@ -22,8 +22,6 @@ #include #include -#undef DEBUG_RELOCATE - static int decode_calln_opcode (unsigned char *location) { @@ -58,10 +56,9 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, unsigned char *location; uint32_t value; -#ifdef DEBUG_RELOCATE - printk("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); -#endif + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { location = (char *)sechdrs[sechdrs[relsec].sh_info].sh_addr + rela[i].r_offset; @@ -87,7 +84,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, value -= ((unsigned long)location & -4) + 4; if ((value & 3) != 0 || ((value + (1 << 19)) >> 20) != 0) { - printk("%s: relocation out of range, " + pr_err("%s: relocation out of range, " "section %d reloc %d " "sym '%s'\n", mod->name, relsec, i, @@ -111,7 +108,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, value -= (((unsigned long)location + 3) & -4); if ((value & 3) != 0 || (signed int)value >> 18 != -1) { - printk("%s: relocation out of range, " + pr_err("%s: relocation out of range, " "section %d reloc %d " "sym '%s'\n", mod->name, relsec, i, @@ -156,7 +153,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, case R_XTENSA_SLOT12_OP: case R_XTENSA_SLOT13_OP: case R_XTENSA_SLOT14_OP: - printk("%s: unexpected FLIX relocation: %u\n", + pr_err("%s: unexpected FLIX relocation: %u\n", mod->name, ELF32_R_TYPE(rela[i].r_info)); return -ENOEXEC; @@ -176,13 +173,13 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, case R_XTENSA_SLOT12_ALT: case R_XTENSA_SLOT13_ALT: case R_XTENSA_SLOT14_ALT: - printk("%s: unexpected ALT relocation: %u\n", + pr_err("%s: unexpected ALT relocation: %u\n", mod->name, ELF32_R_TYPE(rela[i].r_info)); return -ENOEXEC; default: - printk("%s: unexpected relocation: %u\n", + pr_err("%s: unexpected relocation: %u\n", mod->name, ELF32_R_TYPE(rela[i].r_info)); return -ENOEXEC; diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c index 903963ee495d..d981f01c8d89 100644 --- a/arch/xtensa/kernel/pci.c +++ b/arch/xtensa/kernel/pci.c @@ -29,14 +29,6 @@ #include #include -#undef DEBUG - -#ifdef DEBUG -#define DBG(x...) printk(x) -#else -#define DBG(x...) -#endif - /* PCI Controller */ @@ -101,8 +93,8 @@ pcibios_enable_resources(struct pci_dev *dev, int mask) for(idx=0; idx<6; idx++) { r = &dev->resource[idx]; if (!r->start && r->end) { - printk (KERN_ERR "PCI: Device %s not available because " - "of resource collisions\n", pci_name(dev)); + pr_err("PCI: Device %s not available because " + "of resource collisions\n", pci_name(dev)); return -EINVAL; } if (r->flags & IORESOURCE_IO) @@ -113,7 +105,7 @@ pcibios_enable_resources(struct pci_dev *dev, int mask) if (dev->resource[PCI_ROM_RESOURCE].start) cmd |= PCI_COMMAND_MEMORY; if (cmd != old_cmd) { - printk("PCI: Enabling device %s (%04x -> %04x)\n", + pr_info("PCI: Enabling device %s (%04x -> %04x)\n", pci_name(dev), old_cmd, cmd); pci_write_config_word(dev, PCI_COMMAND, cmd); } @@ -144,8 +136,8 @@ static void __init pci_controller_apertures(struct pci_controller *pci_ctrl, res = &pci_ctrl->io_resource; if (!res->flags) { if (io_offset) - printk (KERN_ERR "I/O resource not set for host" - " bridge %d\n", pci_ctrl->index); + pr_err("I/O resource not set for host bridge %d\n", + pci_ctrl->index); res->start = 0; res->end = IO_SPACE_LIMIT; res->flags = IORESOURCE_IO; @@ -159,8 +151,8 @@ static void __init pci_controller_apertures(struct pci_controller *pci_ctrl, if (!res->flags) { if (i > 0) continue; - printk(KERN_ERR "Memory resource not set for " - "host bridge %d\n", pci_ctrl->index); + pr_err("Memory resource not set for host bridge %d\n", + pci_ctrl->index); res->start = 0; res->end = ~0U; res->flags = IORESOURCE_MEM; @@ -176,7 +168,7 @@ static int __init pcibios_init(void) struct pci_bus *bus; int next_busno = 0, ret; - printk("PCI: Probing PCI hardware\n"); + pr_info("PCI: Probing PCI hardware\n"); /* Scan all of the recorded PCI controllers. */ for (pci_ctrl = pci_ctrl_head; pci_ctrl; pci_ctrl = pci_ctrl->next) { @@ -232,7 +224,7 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) for (idx=0; idx<6; idx++) { r = &dev->resource[idx]; if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because " + pr_err("PCI: Device %s not available because " "of resource collisions\n", pci_name(dev)); return -EINVAL; } @@ -242,8 +234,8 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) cmd |= PCI_COMMAND_MEMORY; } if (cmd != old_cmd) { - printk("PCI: Enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); + pr_info("PCI: Enabling device %s (%04x -> %04x)\n", + pci_name(dev), old_cmd, cmd); pci_write_config_word(dev, PCI_COMMAND, cmd); } diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 3732c91b7200..cf7516c52a19 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -156,7 +156,7 @@ static int __init parse_bootparam(const bp_tag_t* tag) /* Boot parameters must start with a BP_TAG_FIRST tag. */ if (tag->id != BP_TAG_FIRST) { - printk(KERN_WARNING "Invalid boot parameters!\n"); + pr_warn("Invalid boot parameters!\n"); return 0; } @@ -165,15 +165,14 @@ static int __init parse_bootparam(const bp_tag_t* tag) /* Parse all tags. */ while (tag != NULL && tag->id != BP_TAG_LAST) { - for (t = &__tagtable_begin; t < &__tagtable_end; t++) { + for (t = &__tagtable_begin; t < &__tagtable_end; t++) { if (tag->id == t->tag) { t->parse(tag); break; } } if (t == &__tagtable_end) - printk(KERN_WARNING "Ignoring tag " - "0x%08x\n", tag->id); + pr_warn("Ignoring tag 0x%08x\n", tag->id); tag = (bp_tag_t*)((unsigned long)(tag + 1) + tag->size); } diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index d427e784ab44..f88e7a0b232c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -28,8 +28,6 @@ #include #include -#define DEBUG_SIG 0 - extern struct task_struct *coproc_owners[]; struct rt_sigframe @@ -399,10 +397,8 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, regs->areg[8] = (unsigned long) &frame->uc; regs->threadptr = tp; -#if DEBUG_SIG - printk("SIG rt deliver (%s:%d): signal=%d sp=%p pc=%08x\n", - current->comm, current->pid, sig, frame, regs->pc); -#endif + pr_debug("SIG rt deliver (%s:%d): signal=%d sp=%p pc=%08lx\n", + current->comm, current->pid, sig, frame, regs->pc); return 0; } diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index bae697a06a98..9a1fef9c1cc6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -178,13 +179,14 @@ __die_if_kernel(const char *str, struct pt_regs *regs, long err) void do_unhandled(struct pt_regs *regs, unsigned long exccause) { __die_if_kernel("Caught unhandled exception - should not happen", - regs, SIGKILL); + regs, SIGKILL); /* If in user mode, send SIGILL signal to current process */ - printk("Caught unhandled exception in '%s' " - "(pid = %d, pc = %#010lx) - should not happen\n" - "\tEXCCAUSE is %ld\n", - current->comm, task_pid_nr(current), regs->pc, exccause); + pr_info_ratelimited("Caught unhandled exception in '%s' " + "(pid = %d, pc = %#010lx) - should not happen\n" + "\tEXCCAUSE is %ld\n", + current->comm, task_pid_nr(current), regs->pc, + exccause); force_sig(SIGILL, current); } @@ -305,8 +307,8 @@ do_illegal_instruction(struct pt_regs *regs) /* If in user mode, send SIGILL signal to current process. */ - printk("Illegal Instruction in '%s' (pid = %d, pc = %#010lx)\n", - current->comm, task_pid_nr(current), regs->pc); + pr_info_ratelimited("Illegal Instruction in '%s' (pid = %d, pc = %#010lx)\n", + current->comm, task_pid_nr(current), regs->pc); force_sig(SIGILL, current); } @@ -325,13 +327,14 @@ do_unaligned_user (struct pt_regs *regs) siginfo_t info; __die_if_kernel("Unhandled unaligned exception in kernel", - regs, SIGKILL); + regs, SIGKILL); current->thread.bad_vaddr = regs->excvaddr; current->thread.error_code = -3; - printk("Unaligned memory access to %08lx in '%s' " - "(pid = %d, pc = %#010lx)\n", - regs->excvaddr, current->comm, task_pid_nr(current), regs->pc); + pr_info_ratelimited("Unaligned memory access to %08lx in '%s' " + "(pid = %d, pc = %#010lx)\n", + regs->excvaddr, current->comm, + task_pid_nr(current), regs->pc); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = BUS_ADRALN; diff --git a/arch/xtensa/lib/pci-auto.c b/arch/xtensa/lib/pci-auto.c index 34d05abbd921..a2b558161d6d 100644 --- a/arch/xtensa/lib/pci-auto.c +++ b/arch/xtensa/lib/pci-auto.c @@ -49,17 +49,6 @@ * */ - -/* define DEBUG to print some debugging messages. */ - -#undef DEBUG - -#ifdef DEBUG -# define DBG(x...) printk(x) -#else -# define DBG(x...) -#endif - static int pciauto_upper_iospc; static int pciauto_upper_memspc; @@ -97,7 +86,7 @@ pciauto_setup_bars(struct pci_dev *dev, int bar_limit) { bar_size &= PCI_BASE_ADDRESS_IO_MASK; upper_limit = &pciauto_upper_iospc; - DBG("PCI Autoconfig: BAR %d, I/O, ", bar_nr); + pr_debug("PCI Autoconfig: BAR %d, I/O, ", bar_nr); } else { @@ -107,7 +96,7 @@ pciauto_setup_bars(struct pci_dev *dev, int bar_limit) bar_size &= PCI_BASE_ADDRESS_MEM_MASK; upper_limit = &pciauto_upper_memspc; - DBG("PCI Autoconfig: BAR %d, Mem, ", bar_nr); + pr_debug("PCI Autoconfig: BAR %d, Mem, ", bar_nr); } /* Allocate a base address (bar_size is negative!) */ @@ -125,7 +114,8 @@ pciauto_setup_bars(struct pci_dev *dev, int bar_limit) if (found_mem64) pci_write_config_dword(dev, (bar+=4), 0x00000000); - DBG("size=0x%x, address=0x%x\n", ~bar_size + 1, *upper_limit); + pr_debug("size=0x%x, address=0x%x\n", + ~bar_size + 1, *upper_limit); } } @@ -150,7 +140,7 @@ pciauto_setup_irq(struct pci_controller* pci_ctrl,struct pci_dev *dev,int devfn) if (irq == -1) irq = 0; - DBG("PCI Autoconfig: Interrupt %d, pin %d\n", irq, pin); + pr_debug("PCI Autoconfig: Interrupt %d, pin %d\n", irq, pin); pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } @@ -289,8 +279,8 @@ int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) int iosave, memsave; - DBG("PCI Autoconfig: Found P2P bridge, device %d\n", - PCI_SLOT(pci_devfn)); + pr_debug("PCI Autoconfig: Found P2P bridge, device %d\n", + PCI_SLOT(pci_devfn)); /* Allocate PCI I/O and/or memory space */ pciauto_setup_bars(dev, PCI_BASE_ADDRESS_1); @@ -306,23 +296,6 @@ int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) } - -#if 0 - /* Skip legacy mode IDE controller */ - - if ((pci_class >> 16) == PCI_CLASS_STORAGE_IDE) { - - unsigned char prg_iface; - pci_read_config_byte(dev, PCI_CLASS_PROG, &prg_iface); - - if (!(prg_iface & PCIAUTO_IDE_MODE_MASK)) { - DBG("PCI Autoconfig: Skipping legacy mode " - "IDE controller\n"); - continue; - } - } -#endif - /* * Found a peripheral, enable some standard * settings @@ -337,8 +310,8 @@ int __init pciauto_bus_scan(struct pci_controller *pci_ctrl, int current_bus) pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0x80); /* Allocate PCI I/O and/or memory space */ - DBG("PCI Autoconfig: Found Bus %d, Device %d, Function %d\n", - current_bus, PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn) ); + pr_debug("PCI Autoconfig: Found Bus %d, Device %d, Function %d\n", + current_bus, PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn)); pciauto_setup_bars(dev, PCI_BASE_ADDRESS_5); pciauto_setup_irq(pci_ctrl, dev, pci_devfn); diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 3c75c4e597da..57dc231a0709 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -33,9 +33,6 @@ #include #include -//#define printd(x...) printk(x) -#define printd(x...) do { } while(0) - /* * Note: * The kernel provides one architecture bit PG_arch_1 in the page flags that diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index a14df5aa98c8..8b9b6f44bb06 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -25,8 +25,6 @@ DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST; void bad_page_fault(struct pt_regs*, unsigned long, int); -#undef DEBUG_PAGE_FAULT - /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -68,10 +66,10 @@ void do_page_fault(struct pt_regs *regs) exccause == EXCCAUSE_ITLB_MISS || exccause == EXCCAUSE_FETCH_CACHE_ATTRIBUTE) ? 1 : 0; -#ifdef DEBUG_PAGE_FAULT - printk("[%s:%d:%08x:%d:%08x:%s%s]\n", current->comm, current->pid, - address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); -#endif + pr_debug("[%s:%d:%08x:%d:%08lx:%s%s]\n", + current->comm, current->pid, + address, exccause, regs->pc, + is_write ? "w" : "", is_exec ? "x" : ""); if (user_mode(regs)) flags |= FAULT_FLAG_USER; @@ -247,10 +245,8 @@ bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) /* Are we prepared to handle this kernel fault? */ if ((entry = search_exception_tables(regs->pc)) != NULL) { -#ifdef DEBUG_PAGE_FAULT - printk(KERN_DEBUG "%s: Exception at pc=%#010lx (%lx)\n", - current->comm, regs->pc, entry->fixup); -#endif + pr_debug("%s: Exception at pc=%#010lx (%lx)\n", + current->comm, regs->pc, entry->fixup); current->thread.bad_uaddr = address; regs->pc = entry->fixup; return; @@ -259,9 +255,9 @@ bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) /* Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - printk(KERN_ALERT "Unable to handle kernel paging request at virtual " - "address %08lx\n pc = %08lx, ra = %08lx\n", - address, regs->pc, regs->areg[0]); + pr_alert("Unable to handle kernel paging request at virtual " + "address %08lx\n pc = %08lx, ra = %08lx\n", + address, regs->pc, regs->areg[0]); die("Oops", regs, sig); do_exit(sig); } diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c index 35c822286bbe..59153d0aa890 100644 --- a/arch/xtensa/mm/tlb.c +++ b/arch/xtensa/mm/tlb.c @@ -95,10 +95,8 @@ void local_flush_tlb_range(struct vm_area_struct *vma, if (mm->context.asid[cpu] == NO_CONTEXT) return; -#if 0 - printk("[tlbrange<%02lx,%08lx,%08lx>]\n", - (unsigned long)mm->context.asid[cpu], start, end); -#endif + pr_debug("[tlbrange<%02lx,%08lx,%08lx>]\n", + (unsigned long)mm->context.asid[cpu], start, end); local_irq_save(flags); if (end-start + (PAGE_SIZE-1) <= _TLB_ENTRIES << PAGE_SHIFT) { diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index 0140a22551c8..6fc0a946ad64 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -186,7 +186,7 @@ int __init rs_init(void) serial_driver = alloc_tty_driver(SERIAL_MAX_NUM_LINES); - printk ("%s %s\n", serial_name, serial_version); + pr_info("%s %s\n", serial_name, serial_version); /* Initialize the tty_driver structure */ @@ -215,7 +215,7 @@ static __exit void rs_exit(void) int error; if ((error = tty_unregister_driver(serial_driver))) - printk("ISS_SERIAL: failed to unregister serial driver (%d)\n", + pr_err("ISS_SERIAL: failed to unregister serial driver (%d)\n", error); put_tty_driver(serial_driver); tty_port_destroy(&serial_port); diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 66a5d15a9e0e..538d17e8fbc1 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -16,6 +16,8 @@ * */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include #include #include @@ -609,8 +611,6 @@ struct iss_net_init { * those fields. They will be later initialized in iss_net_init. */ -#define ERR KERN_ERR "iss_net_setup: " - static int __init iss_net_setup(char *str) { struct iss_net_private *device = NULL; @@ -622,14 +622,14 @@ static int __init iss_net_setup(char *str) end = strchr(str, '='); if (!end) { - printk(ERR "Expected '=' after device number\n"); + pr_err("Expected '=' after device number\n"); return 1; } *end = 0; rc = kstrtouint(str, 0, &n); *end = '='; if (rc < 0) { - printk(ERR "Failed to parse '%s'\n", str); + pr_err("Failed to parse '%s'\n", str); return 1; } str = end; @@ -645,13 +645,13 @@ static int __init iss_net_setup(char *str) spin_unlock(&devices_lock); if (device && device->index == n) { - printk(ERR "Device %u already configured\n", n); + pr_err("Device %u already configured\n", n); return 1; } new = alloc_bootmem(sizeof(*new)); if (new == NULL) { - printk(ERR "Alloc_bootmem failed\n"); + pr_err("Alloc_bootmem failed\n"); return 1; } @@ -663,8 +663,6 @@ static int __init iss_net_setup(char *str) return 1; } -#undef ERR - __setup("eth", iss_net_setup); /* -- cgit v1.2.3 From f21a79cab3773bc17aa845b7738c7f200778a260 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 15 Dec 2017 16:08:16 -0800 Subject: xtensa: clean up exception handling structure Instead of using flat array of longs use normal C structure and generate EXC_TABLE_* constants in the asm-offsets.c Signed-off-by: Max Filippov --- arch/xtensa/include/asm/ptrace.h | 12 ------------ arch/xtensa/include/asm/regs.h | 1 + arch/xtensa/include/asm/traps.h | 23 +++++++++++++++++++++++ arch/xtensa/kernel/asm-offsets.c | 13 +++++++++++++ arch/xtensa/kernel/traps.c | 39 +++++++++++++++++++-------------------- 5 files changed, 56 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h index 05beae3c6376..3a5c5918aea3 100644 --- a/arch/xtensa/include/asm/ptrace.h +++ b/arch/xtensa/include/asm/ptrace.h @@ -39,18 +39,6 @@ * +-----------------------+ -------- */ -/* Offsets for exception_handlers[] (3 x 64-entries x 4-byte tables). */ - -#define EXC_TABLE_KSTK 0x004 /* Kernel Stack */ -#define EXC_TABLE_DOUBLE_SAVE 0x008 /* Double exception save area for a0 */ -#define EXC_TABLE_FIXUP 0x00c /* Fixup handler */ -#define EXC_TABLE_PARAM 0x010 /* For passing a parameter to fixup */ -#define EXC_TABLE_SYSCALL_SAVE 0x014 /* For fast syscall handler */ -#define EXC_TABLE_FAST_USER 0x100 /* Fast user exception handler */ -#define EXC_TABLE_FAST_KERNEL 0x200 /* Fast kernel exception handler */ -#define EXC_TABLE_DEFAULT 0x300 /* Default C-Handler */ -#define EXC_TABLE_SIZE 0x400 - #ifndef __ASSEMBLY__ #include diff --git a/arch/xtensa/include/asm/regs.h b/arch/xtensa/include/asm/regs.h index 881a1134a4b4..477594e5817f 100644 --- a/arch/xtensa/include/asm/regs.h +++ b/arch/xtensa/include/asm/regs.h @@ -76,6 +76,7 @@ #define EXCCAUSE_COPROCESSOR5_DISABLED 37 #define EXCCAUSE_COPROCESSOR6_DISABLED 38 #define EXCCAUSE_COPROCESSOR7_DISABLED 39 +#define EXCCAUSE_N 64 /* PS register fields. */ diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h index 2e69aa4b843f..5bd197097b77 100644 --- a/arch/xtensa/include/asm/traps.h +++ b/arch/xtensa/include/asm/traps.h @@ -12,6 +12,29 @@ #include +/* + * Per-CPU exception handling data structure. + * EXCSAVE1 points to it. + */ +struct exc_table { + /* Kernel Stack */ + void *kstk; + /* Double exception save area for a0 */ + unsigned long double_save; + /* Fixup handler */ + void *fixup; + /* For passing a parameter to fixup */ + void *fixup_param; + /* For fast syscall handler */ + unsigned long syscall_save; + /* Fast user exception handlers */ + void *fast_user_handler[EXCCAUSE_N]; + /* Fast kernel exception handlers */ + void *fast_kernel_handler[EXCCAUSE_N]; + /* Default C-Handlers */ + void *default_handler[EXCCAUSE_N]; +}; + /* * handler must be either of the following: * void (*)(struct pt_regs *regs); diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index 3b119b372efd..022cf918ec20 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -132,5 +132,18 @@ int main(void) offsetof(struct debug_table, icount_level_save)); #endif + /* struct exc_table */ + DEFINE(EXC_TABLE_KSTK, offsetof(struct exc_table, kstk)); + DEFINE(EXC_TABLE_DOUBLE_SAVE, offsetof(struct exc_table, double_save)); + DEFINE(EXC_TABLE_FIXUP, offsetof(struct exc_table, fixup)); + DEFINE(EXC_TABLE_PARAM, offsetof(struct exc_table, fixup_param)); + DEFINE(EXC_TABLE_SYSCALL_SAVE, + offsetof(struct exc_table, syscall_save)); + DEFINE(EXC_TABLE_FAST_USER, + offsetof(struct exc_table, fast_user_handler)); + DEFINE(EXC_TABLE_FAST_KERNEL, + offsetof(struct exc_table, fast_kernel_handler)); + DEFINE(EXC_TABLE_DEFAULT, offsetof(struct exc_table, default_handler)); + return 0; } diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 9a1fef9c1cc6..32c5207f1226 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -159,8 +159,7 @@ COPROCESSOR(7), * 2. it is a temporary memory buffer for the exception handlers. */ -DEFINE_PER_CPU(unsigned long, exc_table[EXC_TABLE_SIZE/4]); - +DEFINE_PER_CPU(struct exc_table, exc_table); DEFINE_PER_CPU(struct debug_table, debug_table); void die(const char*, struct pt_regs*, long); @@ -368,28 +367,28 @@ do_debug(struct pt_regs *regs) } -static void set_handler(int idx, void *handler) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) - per_cpu(exc_table, cpu)[idx] = (unsigned long)handler; -} +#define set_handler(type, cause, handler) \ + do { \ + unsigned int cpu; \ + \ + for_each_possible_cpu(cpu) \ + per_cpu(exc_table, cpu).type[cause] = (handler);\ + } while (0) /* Set exception C handler - for temporary use when probing exceptions */ void * __init trap_set_handler(int cause, void *handler) { - void *previous = (void *)per_cpu(exc_table, 0)[ - EXC_TABLE_DEFAULT / 4 + cause]; - set_handler(EXC_TABLE_DEFAULT / 4 + cause, handler); + void *previous = per_cpu(exc_table, 0).default_handler[cause]; + + set_handler(default_handler, cause, handler); return previous; } static void trap_init_excsave(void) { - unsigned long excsave1 = (unsigned long)this_cpu_ptr(exc_table); + unsigned long excsave1 = (unsigned long)this_cpu_ptr(&exc_table); __asm__ __volatile__("wsr %0, excsave1\n" : : "a" (excsave1)); } @@ -421,10 +420,10 @@ void __init trap_init(void) /* Setup default vectors. */ - for(i = 0; i < 64; i++) { - set_handler(EXC_TABLE_FAST_USER/4 + i, user_exception); - set_handler(EXC_TABLE_FAST_KERNEL/4 + i, kernel_exception); - set_handler(EXC_TABLE_DEFAULT/4 + i, do_unhandled); + for (i = 0; i < EXCCAUSE_N; i++) { + set_handler(fast_user_handler, i, user_exception); + set_handler(fast_kernel_handler, i, kernel_exception); + set_handler(default_handler, i, do_unhandled); } /* Setup specific handlers. */ @@ -436,11 +435,11 @@ void __init trap_init(void) void *handler = dispatch_init_table[i].handler; if (fast == 0) - set_handler (EXC_TABLE_DEFAULT/4 + cause, handler); + set_handler(default_handler, cause, handler); if (fast && fast & USER) - set_handler (EXC_TABLE_FAST_USER/4 + cause, handler); + set_handler(fast_user_handler, cause, handler); if (fast && fast & KRNL) - set_handler (EXC_TABLE_FAST_KERNEL/4 + cause, handler); + set_handler(fast_kernel_handler, cause, handler); } /* Initialize EXCSAVE_1 to hold the address of the exception table. */ -- cgit v1.2.3 From 501c26e82df8d253851b80082778eeb37e4bab5c Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 15 Dec 2017 16:20:56 -0800 Subject: xtensa: implement early_trap_init Paging on xtensa architecture requires functioning exception handling because hardware cannot transparently access page tables that are not currently mapped by TLB. Exception handling is set up late in the initialization process, but working paging is needed for KASAN. Provide early_trap_init that sets up minimal exception handling sufficient for KASAN to work. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/traps.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/xtensa/include/asm/traps.h b/arch/xtensa/include/asm/traps.h index 5bd197097b77..f5cd7a7e65e0 100644 --- a/arch/xtensa/include/asm/traps.h +++ b/arch/xtensa/include/asm/traps.h @@ -42,6 +42,18 @@ struct exc_table { */ extern void * __init trap_set_handler(int cause, void *handler); extern void do_unhandled(struct pt_regs *regs, unsigned long exccause); +void fast_second_level_miss(void); + +/* Initialize minimal exc_table structure sufficient for basic paging */ +static inline void __init early_trap_init(void) +{ + static struct exc_table exc_table __initdata = { + .fast_kernel_handler[EXCCAUSE_DTLB_MISS] = + fast_second_level_miss, + }; + __asm__ __volatile__("wsr %0, excsave1\n" : : "a" (&exc_table)); +} + void secondary_trap_init(void); static inline void spill_registers(void) -- cgit v1.2.3 From c2edb35ae342fedb5a39312c0fa676b74973887a Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 15 Dec 2017 20:45:35 -0800 Subject: xtensa: extract init_kio KIO region placement may be specified in the device tree, that's why it's initialized with the rest of MMU after the early_init_devtree. In order to support KASAN the MMU must be initialized earlier. Separate KIO initialization from the rest of MMU initialization. Reinitialize KIO if its location is specified in the device tree. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/mmu_context.h | 1 + arch/xtensa/include/asm/nommu_context.h | 4 ++++ arch/xtensa/kernel/setup.c | 10 ++++++---- arch/xtensa/mm/mmu.c | 30 +++++++++++++++++------------- 4 files changed, 28 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/mmu_context.h b/arch/xtensa/include/asm/mmu_context.h index f7e186dfc4e4..de5e6cbbafe4 100644 --- a/arch/xtensa/include/asm/mmu_context.h +++ b/arch/xtensa/include/asm/mmu_context.h @@ -52,6 +52,7 @@ DECLARE_PER_CPU(unsigned long, asid_cache); #define ASID_INSERT(x) (0x03020001 | (((x) & ASID_MASK) << 8)) void init_mmu(void); +void init_kio(void); static inline void set_rasid_register (unsigned long val) { diff --git a/arch/xtensa/include/asm/nommu_context.h b/arch/xtensa/include/asm/nommu_context.h index 2cebdbbdb633..37251b2ef871 100644 --- a/arch/xtensa/include/asm/nommu_context.h +++ b/arch/xtensa/include/asm/nommu_context.h @@ -3,6 +3,10 @@ static inline void init_mmu(void) { } +static inline void init_kio(void) +{ +} + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index cf7516c52a19..960212e72a70 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -207,6 +207,8 @@ static int __init xtensa_dt_io_area(unsigned long node, const char *uname, /* round down to nearest 256MB boundary */ xtensa_kio_paddr &= 0xf0000000; + init_kio(); + return 1; } #else @@ -245,6 +247,10 @@ void __init early_init_devtree(void *params) void __init init_arch(bp_tag_t *bp_start) { + /* Initialize MMU. */ + + init_mmu(); + /* Parse boot parameters */ if (bp_start) @@ -262,10 +268,6 @@ void __init init_arch(bp_tag_t *bp_start) /* Early hook for platforms */ platform_init(bp_start); - - /* Initialize MMU. */ - - init_mmu(); } /* diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 358d748d9083..54c01e3ebf05 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -82,6 +82,23 @@ void init_mmu(void) set_itlbcfg_register(0); set_dtlbcfg_register(0); #endif + init_kio(); + local_flush_tlb_all(); + + /* Set rasid register to a known value. */ + + set_rasid_register(ASID_INSERT(ASID_USER_FIRST)); + + /* Set PTEVADDR special register to the start of the page + * table, which is in kernel mappable space (ie. not + * statically mapped). This register's value is undefined on + * reset. + */ + set_ptevaddr_register(XCHAL_PAGE_TABLE_VADDR); +} + +void init_kio(void) +{ #if XCHAL_HAVE_PTP_MMU && XCHAL_HAVE_SPANNING_WAY && defined(CONFIG_OF) /* * Update the IO area mapping in case xtensa_kio_paddr has changed @@ -95,17 +112,4 @@ void init_mmu(void) write_itlb_entry(__pte(xtensa_kio_paddr + CA_BYPASS), XCHAL_KIO_BYPASS_VADDR + 6); #endif - - local_flush_tlb_all(); - - /* Set rasid register to a known value. */ - - set_rasid_register(ASID_INSERT(ASID_USER_FIRST)); - - /* Set PTEVADDR special register to the start of the page - * table, which is in kernel mappable space (ie. not - * statically mapped). This register's value is undefined on - * reset. - */ - set_ptevaddr_register(PGTABLE_START); } -- cgit v1.2.3 From d4e337fe822354895334dbaded61f08206dcac25 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 15 Dec 2017 20:46:55 -0800 Subject: xtensa: don't clear swapper_pg_dir in paging_init swapper_pg_dir is located in the .bss, so it's zero-initialized anyway. With KASAN enabled paging_init will be called after KASAN initialization, it must not erase page directory entries set up for KASAN shadow map. Signed-off-by: Max Filippov --- arch/xtensa/mm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 54c01e3ebf05..9d1ecfc53670 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -56,7 +56,6 @@ static void __init fixedrange_init(void) void __init paging_init(void) { - memset(swapper_pg_dir, 0, PAGE_SIZE); #ifdef CONFIG_HIGHMEM fixedrange_init(); pkmap_page_table = init_pmd(PKMAP_BASE, LAST_PKMAP); -- cgit v1.2.3 From 1af1e8a39dc0fab5e50f10462c636da8c1e0cfbb Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 3 Dec 2017 19:09:41 -0800 Subject: xtensa: move fixmap and kmap just above the KSEG The virtual address space between the page table and the VMALLOC region is big enough to host KASAN shadow map and there's enough space between the VMALLOC area and KSEG for the fixmap and kmap. Move fixmap and kmap to the gap between VMALLOC area and KSEG, just above the KSEG. Reorder entries in the kernel memory layout printing code. Drop duplicate PGTABLE_START definition, use XCHAL_PAGE_TABLE_VADDR instead. Signed-off-by: Max Filippov --- Documentation/xtensa/mmu.txt | 72 +++++++++++++++++++-------------------- arch/xtensa/include/asm/fixmap.h | 4 +-- arch/xtensa/include/asm/highmem.h | 2 +- arch/xtensa/include/asm/page.h | 2 -- arch/xtensa/mm/init.c | 12 +++---- 5 files changed, 45 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/Documentation/xtensa/mmu.txt b/Documentation/xtensa/mmu.txt index 5de8715d5bec..16921393e366 100644 --- a/Documentation/xtensa/mmu.txt +++ b/Documentation/xtensa/mmu.txt @@ -69,19 +69,8 @@ Default MMUv2-compatible layout. | Userspace | 0x00000000 TASK_SIZE +------------------+ 0x40000000 +------------------+ -| Page table | 0x80000000 -+------------------+ 0x80400000 +| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0xbffff000 +------------------+ | VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB +------------------+ VMALLOC_END @@ -92,6 +81,17 @@ Default MMUv2-compatible layout. | remap area 2 | +------------------+ +------------------+ +| KMAP area | PKMAP_BASE PTRS_PER_PTE * +| | DCACHE_N_COLORS * +| | PAGE_SIZE +| | (4MB * DCACHE_N_COLORS) ++------------------+ +| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * +| | NR_CPUS * +| | DCACHE_N_COLORS * +| | PAGE_SIZE ++------------------+ FIXADDR_TOP 0xcffff000 ++------------------+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xd0000000 128MB +------------------+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xd8000000 128MB @@ -109,19 +109,8 @@ Default MMUv2-compatible layout. | Userspace | 0x00000000 TASK_SIZE +------------------+ 0x40000000 +------------------+ -| Page table | 0x80000000 -+------------------+ 0x80400000 +| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0x9ffff000 +------------------+ | VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB +------------------+ VMALLOC_END @@ -132,6 +121,17 @@ Default MMUv2-compatible layout. | remap area 2 | +------------------+ +------------------+ +| KMAP area | PKMAP_BASE PTRS_PER_PTE * +| | DCACHE_N_COLORS * +| | PAGE_SIZE +| | (4MB * DCACHE_N_COLORS) ++------------------+ +| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * +| | NR_CPUS * +| | DCACHE_N_COLORS * +| | PAGE_SIZE ++------------------+ FIXADDR_TOP 0xaffff000 ++------------------+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xb0000000 256MB +------------------+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 256MB @@ -150,19 +150,8 @@ Default MMUv2-compatible layout. | Userspace | 0x00000000 TASK_SIZE +------------------+ 0x40000000 +------------------+ -| Page table | 0x80000000 -+------------------+ 0x80400000 +| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0x8ffff000 +------------------+ | VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB +------------------+ VMALLOC_END @@ -173,6 +162,17 @@ Default MMUv2-compatible layout. | remap area 2 | +------------------+ +------------------+ +| KMAP area | PKMAP_BASE PTRS_PER_PTE * +| | DCACHE_N_COLORS * +| | PAGE_SIZE +| | (4MB * DCACHE_N_COLORS) ++------------------+ +| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * +| | NR_CPUS * +| | DCACHE_N_COLORS * +| | PAGE_SIZE ++------------------+ FIXADDR_TOP 0x9ffff000 ++------------------+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xa0000000 512MB +------------------+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 512MB diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h index 0d30403b6c95..7e25c1b50ac0 100644 --- a/arch/xtensa/include/asm/fixmap.h +++ b/arch/xtensa/include/asm/fixmap.h @@ -44,7 +44,7 @@ enum fixed_addresses { __end_of_fixed_addresses }; -#define FIXADDR_TOP (VMALLOC_START - PAGE_SIZE) +#define FIXADDR_TOP (XCHAL_KSEG_CACHED_VADDR - PAGE_SIZE) #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START ((FIXADDR_TOP - FIXADDR_SIZE) & PMD_MASK) @@ -63,7 +63,7 @@ static __always_inline unsigned long fix_to_virt(const unsigned int idx) * table. */ BUILD_BUG_ON(FIXADDR_START < - XCHAL_PAGE_TABLE_VADDR + XCHAL_PAGE_TABLE_SIZE); + TLBTEMP_BASE_1 + TLBTEMP_SIZE); BUILD_BUG_ON(idx >= __end_of_fixed_addresses); return __fix_to_virt(idx); } diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 6e070db1022e..04e9340eac4b 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -72,7 +72,7 @@ static inline void *kmap(struct page *page) * page table. */ BUILD_BUG_ON(PKMAP_BASE < - XCHAL_PAGE_TABLE_VADDR + XCHAL_PAGE_TABLE_SIZE); + TLBTEMP_BASE_1 + TLBTEMP_SIZE); BUG_ON(in_interrupt()); if (!PageHighMem(page)) return page_address(page); diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 4ddbfd57a7c8..5d69c11c01b8 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -36,8 +36,6 @@ #define MAX_LOW_PFN PHYS_PFN(0xfffffffful) #endif -#define PGTABLE_START 0x80000000 - /* * Cache aliasing: * diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 720fe4e8b497..6fc1cb093fb3 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -100,23 +100,23 @@ void __init mem_init(void) mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" +#ifdef CONFIG_MMU + " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif #ifdef CONFIG_HIGHMEM " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" -#endif -#ifdef CONFIG_MMU - " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" #endif " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n", +#ifdef CONFIG_MMU + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, (LAST_PKMAP*PAGE_SIZE) >> 10, FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, #endif -#ifdef CONFIG_MMU - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn - min_low_pfn) * PAGE_SIZE, #else -- cgit v1.2.3 From c633544a6154146a210cf158157a1ae7c55473b6 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 3 Dec 2017 13:28:52 -0800 Subject: xtensa: add support for KASAN Cover kernel addresses above 0x90000000 by the shadow map. Enable HAVE_ARCH_KASAN when MMU is enabled. Provide kasan_early_init that fills shadow map with writable copies of kasan_zero_page. Call kasan_early_init right after mmu initialization in the setup_arch. Provide kasan_init that allocates proper shadow map pages from the memblock and puts these pages into the shadow map for addresses from VMALLOC area to the end of KSEG. Call kasan_init right after memblock initialization. Don't use KASAN for the boot code, MMU and KASAN initialization and page fault handler. Make kernel stack size 4 times larger when KASAN is enabled to avoid stack overflows. GCC 7.3, 8 or newer is required to build the xtensa kernel with KASAN. Signed-off-by: Max Filippov --- .../features/debug/KASAN/arch-support.txt | 2 +- Documentation/xtensa/mmu.txt | 6 ++ arch/xtensa/Kconfig | 5 ++ arch/xtensa/boot/lib/Makefile | 2 + arch/xtensa/include/asm/kasan.h | 37 +++++++++ arch/xtensa/include/asm/kmem_layout.h | 4 + arch/xtensa/include/asm/pgtable.h | 3 +- arch/xtensa/include/asm/string.h | 19 +++++ arch/xtensa/kernel/setup.c | 7 +- arch/xtensa/kernel/xtensa_ksyms.c | 3 + arch/xtensa/lib/memcopy.S | 10 ++- arch/xtensa/lib/memset.S | 5 +- arch/xtensa/mm/Makefile | 5 ++ arch/xtensa/mm/init.c | 7 ++ arch/xtensa/mm/kasan_init.c | 95 ++++++++++++++++++++++ 15 files changed, 201 insertions(+), 9 deletions(-) create mode 100644 arch/xtensa/include/asm/kasan.h create mode 100644 arch/xtensa/mm/kasan_init.c (limited to 'arch') diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 76bbd7fe27b3..8abb013db8d1 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -35,5 +35,5 @@ | um: | TODO | | unicore32: | TODO | | x86: | ok | - | xtensa: | TODO | + | xtensa: | ok | ----------------------- diff --git a/Documentation/xtensa/mmu.txt b/Documentation/xtensa/mmu.txt index 16921393e366..318114de63f3 100644 --- a/Documentation/xtensa/mmu.txt +++ b/Documentation/xtensa/mmu.txt @@ -71,6 +71,8 @@ Default MMUv2-compatible layout. +------------------+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ +| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE ++------------------+ 0x8e400000 +------------------+ | VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB +------------------+ VMALLOC_END @@ -111,6 +113,8 @@ Default MMUv2-compatible layout. +------------------+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ +| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE ++------------------+ 0x8e400000 +------------------+ | VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB +------------------+ VMALLOC_END @@ -152,6 +156,8 @@ Default MMUv2-compatible layout. +------------------+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE +------------------+ +| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE ++------------------+ 0x8e400000 +------------------+ | VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB +------------------+ VMALLOC_END diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index fffe05b698ac..f9f95d6e8da8 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,6 +15,7 @@ config XTENSA select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select HAVE_ARCH_KASAN if MMU select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG @@ -80,6 +81,10 @@ config VARIANT_IRQ_SWITCH config HAVE_XTENSA_GPIO32 def_bool n +config KASAN_SHADOW_OFFSET + hex + default 0x6e400000 + menu "Processor type and features" choice diff --git a/arch/xtensa/boot/lib/Makefile b/arch/xtensa/boot/lib/Makefile index 2fe182915b63..355127faade1 100644 --- a/arch/xtensa/boot/lib/Makefile +++ b/arch/xtensa/boot/lib/Makefile @@ -15,6 +15,8 @@ CFLAGS_REMOVE_inftrees.o = -pg CFLAGS_REMOVE_inffast.o = -pg endif +KASAN_SANITIZE := n + CFLAGS_REMOVE_inflate.o += -fstack-protector -fstack-protector-strong CFLAGS_REMOVE_zmem.o += -fstack-protector -fstack-protector-strong CFLAGS_REMOVE_inftrees.o += -fstack-protector -fstack-protector-strong diff --git a/arch/xtensa/include/asm/kasan.h b/arch/xtensa/include/asm/kasan.h new file mode 100644 index 000000000000..54be80876e57 --- /dev/null +++ b/arch/xtensa/include/asm/kasan.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_KASAN_H +#define __ASM_KASAN_H + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_KASAN + +#include +#include +#include + +/* Start of area covered by KASAN */ +#define KASAN_START_VADDR __XTENSA_UL_CONST(0x90000000) +/* Start of the shadow map */ +#define KASAN_SHADOW_START (XCHAL_PAGE_TABLE_VADDR + XCHAL_PAGE_TABLE_SIZE) +/* Size of the shadow map */ +#define KASAN_SHADOW_SIZE (-KASAN_START_VADDR >> KASAN_SHADOW_SCALE_SHIFT) +/* Offset for mem to shadow address transformation */ +#define KASAN_SHADOW_OFFSET __XTENSA_UL_CONST(CONFIG_KASAN_SHADOW_OFFSET) + +void __init kasan_early_init(void); +void __init kasan_init(void); + +#else + +static inline void kasan_early_init(void) +{ +} + +static inline void kasan_init(void) +{ +} + +#endif +#endif +#endif diff --git a/arch/xtensa/include/asm/kmem_layout.h b/arch/xtensa/include/asm/kmem_layout.h index 28f9260a766c..2317c835a4db 100644 --- a/arch/xtensa/include/asm/kmem_layout.h +++ b/arch/xtensa/include/asm/kmem_layout.h @@ -71,7 +71,11 @@ #endif +#ifndef CONFIG_KASAN #define KERNEL_STACK_SHIFT 13 +#else +#define KERNEL_STACK_SHIFT 15 +#endif #define KERNEL_STACK_SIZE (1 << KERNEL_STACK_SHIFT) #endif diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 30dd5b2e4ad5..38802259978f 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -12,9 +12,9 @@ #define _XTENSA_PGTABLE_H #define __ARCH_USE_5LEVEL_HACK -#include #include #include +#include /* * We only use two ring levels, user and kernel space. @@ -170,6 +170,7 @@ #define PAGE_SHARED_EXEC \ __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_WRITABLE | _PAGE_HW_EXEC) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_HW_WRITE) +#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_PRESENT|_PAGE_HW_WRITE|_PAGE_HW_EXEC) #if (DCACHE_WAY_SIZE > PAGE_SIZE) diff --git a/arch/xtensa/include/asm/string.h b/arch/xtensa/include/asm/string.h index 8d5d9dfadb09..586bad9fe187 100644 --- a/arch/xtensa/include/asm/string.h +++ b/arch/xtensa/include/asm/string.h @@ -108,14 +108,33 @@ static inline int strncmp(const char *__cs, const char *__ct, size_t __n) #define __HAVE_ARCH_MEMSET extern void *memset(void *__s, int __c, size_t __count); +extern void *__memset(void *__s, int __c, size_t __count); #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *__to, __const__ void *__from, size_t __n); +extern void *__memcpy(void *__to, __const__ void *__from, size_t __n); #define __HAVE_ARCH_MEMMOVE extern void *memmove(void *__dest, __const__ void *__src, size_t __n); +extern void *__memmove(void *__dest, __const__ void *__src, size_t __n); /* Don't build bcopy at all ... */ #define __HAVE_ARCH_BCOPY +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +/* + * For files that are not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. + */ + +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif +#endif + #endif /* _XTENSA_STRING_H */ diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 960212e72a70..a931af9075f2 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -36,6 +36,7 @@ #endif #include +#include #include #include #include @@ -251,6 +252,10 @@ void __init init_arch(bp_tag_t *bp_start) init_mmu(); + /* Initialize initial KASAN shadow map */ + + kasan_early_init(); + /* Parse boot parameters */ if (bp_start) @@ -388,7 +393,7 @@ void __init setup_arch(char **cmdline_p) #endif parse_early_param(); bootmem_init(); - + kasan_init(); unflatten_and_copy_device_tree(); #ifdef CONFIG_SMP diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 672391003e40..3a443f83ae87 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -41,6 +41,9 @@ EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(__memmove); EXPORT_SYMBOL(__strncpy_user); EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(copy_page); diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S index 24d650864c3a..c0f6981719d6 100644 --- a/arch/xtensa/lib/memcopy.S +++ b/arch/xtensa/lib/memcopy.S @@ -109,7 +109,8 @@ addi a5, a5, 2 j .Ldstaligned # dst is now aligned, return to main algorithm -ENTRY(memcpy) +ENTRY(__memcpy) +WEAK(memcpy) entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len @@ -271,7 +272,7 @@ ENTRY(memcpy) s8i a6, a5, 0 retw -ENDPROC(memcpy) +ENDPROC(__memcpy) /* * void bcopy(const void *src, void *dest, size_t n); @@ -376,7 +377,8 @@ ENDPROC(bcopy) j .Lbackdstaligned # dst is now aligned, # return to main algorithm -ENTRY(memmove) +ENTRY(__memmove) +WEAK(memmove) entry sp, 16 # minimal stack frame # a2/ dst, a3/ src, a4/ len @@ -548,4 +550,4 @@ ENTRY(memmove) s8i a6, a5, 0 retw -ENDPROC(memmove) +ENDPROC(__memmove) diff --git a/arch/xtensa/lib/memset.S b/arch/xtensa/lib/memset.S index a6cd04ba966f..276747dec300 100644 --- a/arch/xtensa/lib/memset.S +++ b/arch/xtensa/lib/memset.S @@ -31,7 +31,8 @@ */ .text -ENTRY(memset) +ENTRY(__memset) +WEAK(memset) entry sp, 16 # minimal stack frame # a2/ dst, a3/ c, a4/ length @@ -140,7 +141,7 @@ EX(10f) s8i a3, a5, 0 .Lbytesetdone: retw -ENDPROC(memset) +ENDPROC(__memset) .section .fixup, "ax" .align 4 diff --git a/arch/xtensa/mm/Makefile b/arch/xtensa/mm/Makefile index 0b3d296a016a..734888a00dc8 100644 --- a/arch/xtensa/mm/Makefile +++ b/arch/xtensa/mm/Makefile @@ -5,3 +5,8 @@ obj-y := init.o misc.o obj-$(CONFIG_MMU) += cache.o fault.o ioremap.o mmu.o tlb.o obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_KASAN) += kasan_init.o + +KASAN_SANITIZE_fault.o := n +KASAN_SANITIZE_kasan_init.o := n +KASAN_SANITIZE_mmu.o := n diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 6fc1cb093fb3..0d980f05da82 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -100,6 +100,9 @@ void __init mem_init(void) mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" +#ifdef CONFIG_KASAN + " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif #ifdef CONFIG_MMU " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" #endif @@ -108,6 +111,10 @@ void __init mem_init(void) " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" #endif " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n", +#ifdef CONFIG_KASAN + KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, + KASAN_SHADOW_SIZE >> 20, +#endif #ifdef CONFIG_MMU VMALLOC_START, VMALLOC_END, (VMALLOC_END - VMALLOC_START) >> 20, diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c new file mode 100644 index 000000000000..6b532b6bd785 --- /dev/null +++ b/arch/xtensa/mm/kasan_init.c @@ -0,0 +1,95 @@ +/* + * Xtensa KASAN shadow map initialization + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2017 Cadence Design Systems Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +void __init kasan_early_init(void) +{ + unsigned long vaddr = KASAN_SHADOW_START; + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + int i; + + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_zero_pte + i, + mk_pte(virt_to_page(kasan_zero_page), PAGE_KERNEL)); + + for (vaddr = 0; vaddr < KASAN_SHADOW_SIZE; vaddr += PMD_SIZE, ++pmd) { + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd((unsigned long)kasan_zero_pte)); + } + early_trap_init(); +} + +static void __init populate(void *start, void *end) +{ + unsigned long n_pages = (end - start) / PAGE_SIZE; + unsigned long n_pmds = n_pages / PTRS_PER_PTE; + unsigned long i, j; + unsigned long vaddr = (unsigned long)start; + pgd_t *pgd = pgd_offset_k(vaddr); + pmd_t *pmd = pmd_offset(pgd, vaddr); + pte_t *pte = memblock_virt_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); + + pr_debug("%s: %p - %p\n", __func__, start, end); + + for (i = j = 0; i < n_pmds; ++i) { + int k; + + for (k = 0; k < PTRS_PER_PTE; ++k, ++j) { + phys_addr_t phys = + memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, + MEMBLOCK_ALLOC_ANYWHERE); + + set_pte(pte + j, pfn_pte(PHYS_PFN(phys), PAGE_KERNEL)); + } + } + + for (i = 0; i < n_pmds ; ++i, pte += PTRS_PER_PTE) + set_pmd(pmd + i, __pmd((unsigned long)pte)); + + local_flush_tlb_all(); + memset(start, 0, end - start); +} + +void __init kasan_init(void) +{ + int i; + + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_START - + (KASAN_START_VADDR >> KASAN_SHADOW_SCALE_SHIFT)); + BUILD_BUG_ON(VMALLOC_START < KASAN_START_VADDR); + + /* + * Replace shadow map pages that cover addresses from VMALLOC area + * start to the end of KSEG with clean writable pages. + */ + populate(kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)XCHAL_KSEG_BYPASS_VADDR)); + + /* Write protect kasan_zero_page and zero-initialize it again. */ + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_zero_pte + i, + mk_pte(virt_to_page(kasan_zero_page), PAGE_KERNEL_RO)); + + local_flush_tlb_all(); + memset(kasan_zero_page, 0, PAGE_SIZE); + + /* At this point kasan is fully initialized. Enable error messages. */ + current->kasan_depth = 0; + pr_info("KernelAddressSanitizer initialized\n"); +} -- cgit v1.2.3 From e0baa01438d3fa3979f94f98be19ca3df88e0b7c Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 4 Dec 2017 21:20:33 -0800 Subject: xtensa: use __memset in __xtensa_clear_user memset on xtensa is capable of accessing user memory, but KASAN checks if memset function is actually used for that and reports it as an error: ================================================================== BUG: KASAN: user-memory-access in padzero+0x4d/0x58 Write of size 519 at addr 0049ddf9 by task init/1 Call Trace: [] kasan_report+0x160/0x238 [] check_memory_region+0xf8/0x100 [] memset+0x20/0x34 [] padzero+0x4d/0x58 ================================================================== Use __memset in __xtensa_clear_user to avoid that. Signed-off-by: Max Filippov --- arch/xtensa/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index b8f152b6aaa5..18bbe1caad94 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -261,7 +261,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) static inline unsigned long __xtensa_clear_user(void *addr, unsigned long size) { - if ( ! memset(addr, 0, size) ) + if (!__memset(addr, 0, size)) return size; return 0; } -- cgit v1.2.3 From a8b4db562e7283a1520f9e9730297ecaab7622ea Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Sun, 5 Nov 2017 18:27:51 -0800 Subject: x86/cpufeature: Add User-Mode Instruction Prevention definitions [ Note, this is a Git cherry-pick of the following commit: (limited to the cpufeatures.h file) 3522c2a6a4f3 ("x86/cpufeature: Add User-Mode Instruction Prevention definitions") ... for easier x86 PTI code testing and back-porting. ] User-Mode Instruction Prevention is a security feature present in new Intel processors that, when set, prevents the execution of a subset of instructions if such instructions are executed in user mode (CPL > 0). Attempting to execute such instructions causes a general protection exception. The subset of instructions comprises: * SGDT - Store Global Descriptor Table * SIDT - Store Interrupt Descriptor Table * SLDT - Store Local Descriptor Table * SMSW - Store Machine Status Word * STR - Store Task Register This feature is also added to the list of disabled-features to allow a cleaner handling of build-time configuration. Signed-off-by: Ricardo Neri Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chen Yucong Cc: Chris Metcalf Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Huang Rui Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ravi V. Shankar Cc: Shuah Khan Cc: Tony Luck Cc: Vlastimil Babka Cc: ricardo.neri@intel.com Link: http://lkml.kernel.org/r/1509935277-22138-7-git-send-email-ricardo.neri-calderon@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cdf5be866863..c0b0e9e8aa66 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -- cgit v1.2.3 From f2dbad36c55e5d3a91dccbde6e8cae345fe5632f Mon Sep 17 00:00:00 2001 From: Rudolf Marek Date: Tue, 28 Nov 2017 22:01:06 +0100 Subject: x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD [ Note, this is a Git cherry-pick of the following commit: 2b67799bdf25 ("x86: Make X86_BUG_FXSAVE_LEAK detectable in CPUID on AMD") ... for easier x86 PTI code testing and back-porting. ] The latest AMD AMD64 Architecture Programmer's Manual adds a CPUID feature XSaveErPtr (CPUID_Fn80000008_EBX[2]). If this feature is set, the FXSAVE, XSAVE, FXSAVEOPT, XSAVEC, XSAVES / FXRSTOR, XRSTOR, XRSTORS always save/restore error pointers, thus making the X86_BUG_FXSAVE_LEAK workaround obsolete on such CPUs. Signed-Off-By: Rudolf Marek Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Tested-by: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/bdcebe90-62c5-1f05-083c-eba7f08b2540@assembler.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/amd.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..800104c8a3ed 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -266,6 +266,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..bcb75dc97d44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -804,8 +804,11 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x17: init_amd_zn(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); -- cgit v1.2.3 From 2fe1bc1f501d55e5925b4035bcd85781adc76c63 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 31 Aug 2017 14:46:30 -0700 Subject: perf/x86: Enable free running PEBS for REGS_USER/INTR [ Note, this is a Git cherry-pick of the following commit: a47ba4d77e12 ("perf/x86: Enable free running PEBS for REGS_USER/INTR") ... for easier x86 PTI code testing and back-porting. ] Currently free running PEBS is disabled when user or interrupt registers are requested. Most of the registers are actually available in the PEBS record and can be supported. So we just need to check for the supported registers and then allow it: it is all except for the segment register. For user registers this only works when the counter is limited to ring 3 only, so this also needs to be checked. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170831214630.21892-1-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++++ arch/x86/events/perf_event.h | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9fb9a1f1e47b..43445da30cea 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2958,6 +2958,10 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; + if (!event->attr.exclude_kernel) + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_user & ~PEBS_REGS) + flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4196f81ec0e1..f7aaadf9331f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -85,13 +85,15 @@ struct amd_nb { * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. + * REGS_USER can be handled for events limited to ring 3. * */ #define PEBS_FREERUNNING_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) /* * A debug store configuration. @@ -110,6 +112,26 @@ struct debug_store { u64 pebs_event_reset[MAX_PEBS_EVENTS]; }; +#define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ + PERF_REG_X86_CX | \ + PERF_REG_X86_DX | \ + PERF_REG_X86_DI | \ + PERF_REG_X86_SI | \ + PERF_REG_X86_SP | \ + PERF_REG_X86_BP | \ + PERF_REG_X86_IP | \ + PERF_REG_X86_FLAGS | \ + PERF_REG_X86_R8 | \ + PERF_REG_X86_R9 | \ + PERF_REG_X86_R10 | \ + PERF_REG_X86_R11 | \ + PERF_REG_X86_R12 | \ + PERF_REG_X86_R13 | \ + PERF_REG_X86_R14 | \ + PERF_REG_X86_R15) + /* * Per register state. */ -- cgit v1.2.3 From ab95477e7cb35557ecfc837687007b646bab9a9f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 12 Dec 2017 02:25:31 +0100 Subject: bpf: fix build issues on um due to mising bpf_perf_event.h [ Note, this is a Git cherry-pick of the following commit: a23f06f06dbe ("bpf: fix build issues on um due to mising bpf_perf_event.h") ... for easier x86 PTI code testing and back-porting. ] Since c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") um (uml) won't build on i386 or x86_64: [...] CC init/main.o In file included from ../include/linux/perf_event.h:18:0, from ../include/linux/trace_events.h:10, from ../include/trace/syscall.h:7, from ../include/linux/syscalls.h:82, from ../init/main.c:20: ../include/uapi/linux/bpf_perf_event.h:11:32: fatal error: asm/bpf_perf_event.h: No such file or directory #include [...] Lets add missing bpf_perf_event.h also to um arch. This seems to be the only one still missing. Fixes: c895f6f703ad ("bpf: correct broken uapi for BPF_PROG_TYPE_PERF_EVENT program type") Reported-by: Randy Dunlap Suggested-by: Richard Weinberger Signed-off-by: Daniel Borkmann Tested-by: Randy Dunlap Cc: Hendrik Brueckner Cc: Richard Weinberger Acked-by: Alexei Starovoitov Acked-by: Richard Weinberger Signed-off-by: Alexei Starovoitov Signed-off-by: Ingo Molnar --- arch/um/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 50a32c33d729..73c57f614c9e 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h -- cgit v1.2.3 From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/mmu_context.h | 4 ++-- arch/x86/kernel/ldt.c | 2 +- drivers/md/dm-mpath.c | 20 ++++++++++---------- fs/dcache.c | 4 ++-- fs/overlayfs/ovl_entry.h | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- kernel/events/core.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/task_work.c | 2 +- mm/slab.h | 2 +- 13 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 80534d3c2480..589af1eec7c1 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment) struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6699fc441644..6d16d15d09a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -73,8 +73,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - /* lockless_dereference synchronizes with smp_store_release */ - ldt = lockless_dereference(mm->context.ldt); + /* READ_ONCE synchronizes with smp_store_release */ + ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ae5615b03def..1c1eae961340 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -103,7 +103,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) static void install_ldt(struct mm_struct *current_mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(¤t_mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current_mm. */ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 11f273d2f018..3f88c9d32f7e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m, pgpath = path_to_pgpath(path); - if (unlikely(lockless_dereference(m->current_pg) != pg)) { + if (unlikely(READ_ONCE(m->current_pg) != pg)) { /* Only update current_pgpath if pg changed */ spin_lock_irqsave(&m->lock, flags); m->current_pgpath = pgpath; @@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) } /* Were we instructed to switch PG? */ - if (lockless_dereference(m->next_pg)) { + if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { @@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) /* Don't change PG until it has no remaining paths */ check_current_pg: - pg = lockless_dereference(m->current_pg); + pg = READ_ONCE(m->current_pg); if (pg) { pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) @@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, struct request *clone; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, nr_bytes); @@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bool queue_io; /* Do we need to select a new pgpath? */ - pgpath = lockless_dereference(m->current_pgpath); + pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) pgpath = choose_pgpath(m, nr_bytes); @@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct pgpath *current_pgpath; int r; - current_pgpath = lockless_dereference(m->current_pgpath); + current_pgpath = READ_ONCE(m->current_pgpath); if (!current_pgpath) current_pgpath = choose_pgpath(m, 0); @@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } if (r == -ENOTCONN) { - if (!lockless_dereference(m->current_pg)) { + if (!READ_ONCE(m->current_pg)) { /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } @@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti) return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ - pg = lockless_dereference(m->current_pg); - next_pg = lockless_dereference(m->next_pg); - if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg)) + pg = READ_ONCE(m->current_pg); + next_pg = READ_ONCE(m->next_pg); + if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg)) pg = next_pg; if (!pg) { diff --git a/fs/dcache.c b/fs/dcache.c index f90141387f01..34c852af215c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c { /* * Be careful about RCU walk racing with rename: - * use 'lockless_dereference' to fetch the name pointer. + * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - const unsigned char *cs = lockless_dereference(dentry->d_name.name); + const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 25d9b5adcd42..36b49bd09264 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { - return lockless_dereference(oi->__upperdentry); + return READ_ONCE(oi->__upperdentry); } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 698b74dd750e..c310e3ff7f3f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile = lockless_dereference(od->upperfile); + realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c2cdd45a880a..127f534fec94 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) diff --git a/kernel/events/core.c b/kernel/events/core.c index 10cdb9c26b5d..6eee4ed97af0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4233,7 +4233,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4330,7 +4330,7 @@ again: * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 418a1c045933..5f0dfb2abb8d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = - lockless_dereference(current->seccomp.filter); + READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (unlikely(WARN_ON(f == NULL))) diff --git a/kernel/task_work.c b/kernel/task_work.c index 5718b3ea202a..0fef395662a6 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = lockless_dereference(*pprev))) { + while ((work = READ_ONCE(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) diff --git a/mm/slab.h b/mm/slab.h index 028cdc7df67e..86d7c7d860f9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -259,7 +259,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) * memcg_caches issues a write barrier to match this (see * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(arr->entries[idx]); + cachep = READ_ONCE(arr->entries[idx]); rcu_read_unlock(); return cachep; -- cgit v1.2.3 From 2aeb07365bcd489620f71390a7d2031cd4dfb83e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:36:35 -0800 Subject: x86/mm/kasan: Don't use vmemmap_populate() to initialize shadow [ Note, this is a Git cherry-pick of the following commit: d17a1d97dc20: ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow") ... for easier x86 PTI code testing and back-porting. ] The KASAN shadow is currently mapped using vmemmap_populate() since that provides a semi-convenient way to map pages into init_top_pgt. However, since that no longer zeroes the mapped pages, it is not suitable for KASAN, which requires zeroed shadow memory. Add kasan_populate_shadow() interface and use it instead of vmemmap_populate(). Besides, this allows us to take advantage of gigantic pages and use them to populate the shadow, which should save us some memory wasted on page tables and reduce TLB pressure. Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com Signed-off-by: Andrey Ryabinin Signed-off-by: Pavel Tatashin Cc: Andy Lutomirski Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Michal Hocko Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/mm/kasan_init_64.c | 143 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 137 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4ae940a0ed3b..665eba1b6103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,7 +108,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 2b60dc6e64b1..99dfed6dfef8 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static int __init map_range(struct range *range) +static __init void *early_alloc(size_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) { unsigned long start; unsigned long end; @@ -26,7 +155,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - return vmemmap_populate(start, end, NUMA_NO_NODE); + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); } static void __init clear_pgds(unsigned long start, @@ -189,16 +318,16 @@ void __init kasan_init(void) if (pfn_mapped[i].end == 0) break; - if (map_range(&pfn_mapped[i])) - panic("kasan: unable to allocate shadow!"); + map_range(&pfn_mapped[i]); } + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), kasan_mem_to_shadow((void *)__START_KERNEL_map)); - vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - NUMA_NO_NODE); + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); -- cgit v1.2.3 From e17f8234538d1ff708673f287a42457c4dee720d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 4 Dec 2017 15:07:07 +0100 Subject: x86/entry/64/paravirt: Use paravirt-safe macro to access eflags Commit 1d3e53e8624a ("x86/entry/64: Refactor IRQ stacks and make them NMI-safe") added DEBUG_ENTRY_ASSERT_IRQS_OFF macro that acceses eflags using 'pushfq' instruction when testing for IF bit. On PV Xen guests looking at IF flag directly will always see it set, resulting in 'ud2'. Introduce SAVE_FLAGS() macro that will use appropriate save_fl pv op when running paravirt. Signed-off-by: Boris Ostrovsky Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20171204150604.899457242@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 7 ++++--- arch/x86/include/asm/irqflags.h | 3 +++ arch/x86/include/asm/paravirt.h | 9 +++++++++ arch/x86/kernel/asm-offsets_64.c | 3 +++ 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a2b30ec69497..32306788821c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -462,12 +462,13 @@ END(irq_entries_start) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY - pushfq - testl $X86_EFLAGS_IF, (%rsp) + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 .Lokay_\@: - addq $8, %rsp + popq %rax #endif .endm diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c8ef23f2c28f..89f08955fff7 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void) swapgs; \ sysretl +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x) pushfq; popq %rax +#endif #else #define INTERRUPT_RETURN iret #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 283efcaac8af..892df375b615 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -927,6 +927,15 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif + #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9d..e3a5175a444b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void) #ifdef CONFIG_PARAVIRT OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif -- cgit v1.2.3 From d3a09104018cf2ad5973dfa8a9c138ef9f5015a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:08 +0100 Subject: x86/unwinder/orc: Dont bail on stack overflow If the stack overflows into a guard page and the ORC unwinder should work well: by construction, there can't be any meaningful data in the guard page because no writes to the guard page will have succeeded. But there is a bug that prevents unwinding from working correctly: if the starting register state has RSP pointing into a stack guard page, the ORC unwinder bails out immediately. Instead of bailing out immediately check whether the next page up is a valid check page and if so analyze that. As a result the ORC unwinder will start the unwind. Tested by intentionally overflowing the task stack. The result is an accurate call trace instead of a trace consisting purely of '?' entries. There are a few other bugs that are triggered if the unwinder encounters a stack overflow after the first step, but they are outside the scope of this fix. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150604.991389777@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_orc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a3f973b2c97a..ff8e1132b2ae 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -553,8 +553,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } if (get_stack_info((unsigned long *)state->sp, state->task, - &state->stack_info, &state->stack_mask)) - return; + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } /* * The caller can provide the address of the first frame directly -- cgit v1.2.3 From b02fcf9ba1211097754b286043cd87a8b4907e75 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 4 Dec 2017 15:07:09 +0100 Subject: x86/unwinder: Handle stack overflows more gracefully There are at least two unwinder bugs hindering the debugging of stack-overflow crashes: - It doesn't deal gracefully with the case where the stack overflows and the stack pointer itself isn't on a valid stack but the to-be-dereferenced data *is*. - The ORC oops dump code doesn't know how to print partial pt_regs, for the case where if we get an interrupt/exception in *early* entry code before the full pt_regs have been saved. Fix both issues. http://lkml.kernel.org/r/20171126024031.uxi4numpbjm5rlbr@treble Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.071425003@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 + arch/x86/include/asm/unwind.h | 7 ++++ arch/x86/kernel/dumpstack.c | 32 ++++++++++++++++--- arch/x86/kernel/process_64.c | 11 +++---- arch/x86/kernel/unwind_orc.c | 74 +++++++++++++++---------------------------- 5 files changed, 65 insertions(+), 60 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index f86a8caa561e..395c9631e000 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); +extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e9cc6fe1fc6f..c1688c2d0a12 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -7,6 +7,9 @@ #include #include +#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) +#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) + struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; @@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) +/* + * WARNING: The entire pt_regs may not be safe to dereference. In some cases, + * only the iret frame registers are accessible. Use with caution! + */ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..0bc95be5c638 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -50,6 +50,28 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +{ + if (on_stack(info, regs, sizeof(*regs))) + __show_regs(regs, 0); + else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -94,8 +116,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); /* * Scan the stack, printing any text addresses we find. At the @@ -119,7 +141,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_safe() below. */ if (regs && stack == ®s->ip) goto next; @@ -155,8 +177,8 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_safe(&stack_info, regs); } if (stack_name) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eeeb34f85c25..01b119bebb68 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index ff8e1132b2ae..be86a865087a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len) { struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; - /* - * If the address isn't on the current stack, switch to the next one. - * - * We may have to traverse multiple stacks to deal with the possibility - * that info->next_sp could point to an empty stack and the address - * could be on a subsequent stack. - */ - while (!on_stack(info, (void *)addr, len)) - if (get_stack_info(info->next_sp, state->task, info, - &state->stack_mask)) - return false; + if (!on_stack(info, addr, len) && + (get_stack_info(addr, state->task, info, &state->stack_mask))) + return false; return true; } @@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, return true; } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, - unsigned long *ip, unsigned long *sp, bool full) + unsigned long *ip, unsigned long *sp) { - size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; - size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; - struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - - if (IS_ENABLED(CONFIG_X86_64)) { - if (!stack_access_ok(state, addr, regs_size)) - return false; - - *ip = regs->ip; - *sp = regs->sp; + struct pt_regs *regs = (struct pt_regs *)addr; - return true; - } + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - if (!stack_access_ok(state, addr, sp_offset)) + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; *ip = regs->ip; + *sp = regs->sp; + return true; +} - if (user_mode(regs)) { - if (!stack_access_ok(state, addr + sp_offset, - REGS_SIZE - SP_OFFSET)) - return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - *sp = regs->sp; - } else - *sp = (unsigned long)®s->sp; + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + *ip = regs->ip; + *sp = regs->sp; return true; } @@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state) unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; - struct pt_regs *ptregs; bool indirect = false; if (unwind_done(state)) @@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; @@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS_IRET: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } - ptregs = container_of((void *)sp, struct pt_regs, ip); - if ((unsigned long)ptregs >= prev_sp && - on_stack(&state->stack_info, ptregs, REGS_SIZE)) { - state->regs = ptregs; - state->full_regs = false; - } else - state->regs = NULL; - + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; state->signal = true; break; -- cgit v1.2.3 From 6669a692605547892a026445e460bf233958bd7f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:10 +0100 Subject: x86/irq: Remove an old outdated comment about context tracking races That race has been fixed and code cleaned up for a while now. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.150551639@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 52089c043160..aa9d51eea9d0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. */ -- cgit v1.2.3 From 4f3789e792296e21405f708cf3cb409d7c7d5683 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:11 +0100 Subject: x86/irq/64: Print the offending IP in the stack overflow warning In case something goes wrong with unwind (not unlikely in case of overflow), print the offending IP where we detected the overflow. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.231677119@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); -- cgit v1.2.3 From 1a79797b58cddfa948420a7553241c79c013e3ca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:12 +0100 Subject: x86/entry/64: Allocate and enable the SYSENTER stack This will simplify future changes that want scratch variables early in the SYSENTER handler -- they'll be able to spill registers to the stack. It also lets us get rid of a SWAPGS_UNSAFE_STACK user. This does not depend on CONFIG_IA32_EMULATION=y because we'll want the stack space even without IA32 emulation. As far as I can tell, the reason that this wasn't done from day 1 is that we use IST for #DB and #BP, which is IMO rather nasty and causes a lot more problems than it solves. But, since #DB uses IST, we don't actually need a real stack for SYSENTER (because SYSENTER with TF set will invoke #DB on the IST stack rather than the SYSENTER stack). I want to remove IST usage from these vectors some day, and this patch is a prerequisite for that as well. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.312726423@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/processor.h | 3 --- arch/x86/kernel/asm-offsets.c | 5 +++++ arch/x86/kernel/asm-offsets_32.c | 5 ----- arch/x86/kernel/cpu/common.c | 4 +++- arch/x86/kernel/process.c | 2 -- arch/x86/kernel/traps.c | 3 +-- 7 files changed, 10 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 568e130d932c..dcc6987f9bae 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -48,7 +48,7 @@ */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + SWAPGS movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2db7cf720b04..789dad5da20f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -339,14 +339,11 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; -#ifdef CONFIG_X86_32 /* * Space for the temporary SYSENTER stack. */ unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; -#endif - } ____cacheline_aligned; DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480d..b275863128eb 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -93,4 +93,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dedf428b20b6..52ce4ea16e53 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -50,11 +50,6 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); - #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); OFFSET(stack_canary_offset, stack_canary, canary); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cdf79ab628c2..22f542170198 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1361,7 +1361,9 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)this_cpu_ptr(&cpu_tss) + + offsetofend(struct tss_struct, SYSENTER_stack)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 97fb3e5737f5..35d674157fda 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -71,9 +71,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d366adfc61da..d3e3bbd5d3a0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -794,14 +794,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: -#if defined(CONFIG_X86_32) /* * This is the most likely code path that involves non-trivial use * of the SYSENTER stack. Check that we haven't overrun it. */ WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, "Overran or corrupted SYSENTER stack\n"); -#endif + ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); -- cgit v1.2.3 From 33a2f1a6c4d7c0a02d1c006fb0379cc5ca3b96bb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:13 +0100 Subject: x86/dumpstack: Add get_stack_info() support for the SYSENTER stack get_stack_info() doesn't currently know about the SYSENTER stack, so unwinding will fail if we entered the kernel on the SYSENTER stack and haven't fully switched off. Teach get_stack_info() about the SYSENTER stack. With future patches applied that run part of the entry code on the SYSENTER stack and introduce an intentional BUG(), I would get: PANIC: double fault, error_code: 0x0 ... RIP: 0010:do_error_trap+0x33/0x1c0 ... Call Trace: Code: ... With this patch, I get: PANIC: double fault, error_code: 0x0 ... Call Trace: ? async_page_fault+0x36/0x60 ? invalid_op+0x22/0x40 ? async_page_fault+0x36/0x60 ? sync_regs+0x3c/0x40 ? sync_regs+0x2e/0x40 ? error_entry+0x6c/0xd0 ? async_page_fault+0x36/0x60 Code: ... which is a lot more informative. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.392711508@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 3 +++ arch/x86/kernel/dumpstack.c | 19 +++++++++++++++++++ arch/x86/kernel/dumpstack_32.c | 6 ++++++ arch/x86/kernel/dumpstack_64.c | 6 ++++++ 4 files changed, 34 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 8da111b3c342..f8062bfd43a0 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,6 +16,7 @@ enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, + STACK_TYPE_SYSENTER, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -28,6 +29,8 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); +bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0bc95be5c638..a33a1373a252 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -43,6 +43,25 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } +bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss); + + /* Treat the canary as part of the stack for unwinding purposes. */ + void *begin = &tss->SYSENTER_stack_canary; + void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_SYSENTER; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..5ff13a6b3680 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + return NULL; } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..abc828f8c297 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SYSENTER) + return "SYSENTER"; + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; + if (in_sysenter_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: -- cgit v1.2.3 From aaeed3aeb39c1ba69f0a49baec8cb728121d0a91 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:14 +0100 Subject: x86/entry/gdt: Put per-CPU GDT remaps in ascending order We currently have CPU 0's GDT at the top of the GDT range and higher-numbered CPUs at lower addresses. This happens because the fixmap is upside down (index 0 is the top of the fixmap). Flip it so that GDTs are in ascending order by virtual address. This will simplify a future patch that will generalize the GDT remap to contain multiple pages. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.471561421@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 0a3e808b9123..01fd944fd721 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -63,7 +63,7 @@ static inline struct desc_struct *get_current_gdt_rw(void) /* Get the fixmap index for a specific processor */ static inline unsigned int get_cpu_gdt_ro_index(int cpu) { - return FIX_GDT_REMAP_BEGIN + cpu; + return FIX_GDT_REMAP_END - cpu; } /* Provide the fixmap address of the remapped GDT */ -- cgit v1.2.3 From ef8813ab280507972bb57e4b1b502811ad4411e9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:15 +0100 Subject: x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area Currently, the GDT is an ad-hoc array of pages, one per CPU, in the fixmap. Generalize it to be an array of a new 'struct cpu_entry_area' so that we can cleanly add new things to it. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.563271721@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 9 +-------- arch/x86/include/asm/fixmap.h | 37 +++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/xen/mmu_pv.c | 2 +- 4 files changed, 44 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 01fd944fd721..f6f428432a68 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void) return this_cpu_ptr(&gdt_page)->gdt; } -/* Get the fixmap index for a specific processor */ -static inline unsigned int get_cpu_gdt_ro_index(int cpu) -{ - return FIX_GDT_REMAP_END - cpu; -} - /* Provide the fixmap address of the remapped GDT */ static inline struct desc_struct *get_cpu_gdt_ro(int cpu) { - unsigned int idx = get_cpu_gdt_ro_index(cpu); - return (struct desc_struct *)__fix_to_virt(idx); + return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; } /* Provide the current read-only GDT */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b0c505fe9a95..b61f0242f9d0 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -44,6 +44,19 @@ extern unsigned long __FIXADDR_TOP; PAGE_SIZE) #endif +/* + * cpu_entry_area is a percpu region in the fixmap that contains things + * needed by the CPU and early entry/exit code. Real types aren't used + * for all fields here to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { + char gdt[PAGE_SIZE]; +}; + +#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) /* * Here we define all the compile-time 'special' virtual @@ -101,8 +114,8 @@ enum fixed_addresses { FIX_LNW_VRTC, #endif /* Fixmap entries to remap the GDTs, one per processor. */ - FIX_GDT_REMAP_BEGIN, - FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + FIX_CPU_ENTRY_AREA_TOP, + FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ @@ -191,5 +204,25 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, void __early_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); +static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) +{ + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; +} + +#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ + BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ + __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ + }) + +#define get_cpu_entry_area_index(cpu, field) \ + __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) + +static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22f542170198..2cb394dc4153 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,12 +466,12 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) +/* Setup the fixmap mappings only once per-processor */ +static inline void setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; + pgprot_t gdt_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because @@ -482,11 +482,11 @@ static inline void setup_fixmap_gdt(int cpu) * On Xen PV, the GDT must be read-only because the hypervisor requires * it. */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); } /* Load the original GDT from the per-cpu structure */ @@ -1589,7 +1589,7 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); + setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } @@ -1651,7 +1651,7 @@ void cpu_init(void) fpu__init_cpu(); - setup_fixmap_gdt(cpu); + setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2ccdaba31a07..c2454237fa67 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: - case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: + case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: /* All local page mappings */ pte = pfn_pte(phys, prot); break; -- cgit v1.2.3 From 21506525fb8ddb0342f2a2370812d47f6a1f3833 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:16 +0100 Subject: x86/kasan/64: Teach KASAN about the cpu_entry_area The cpu_entry_area will contain stacks. Make sure that KASAN has appropriate shadow mappings for them. Signed-off-by: Andy Lutomirski Signed-off-by: Andrey Ryabinin Signed-off-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: kasan-dev@googlegroups.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.642806442@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 99dfed6dfef8..9ec70d780f1f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -277,6 +277,7 @@ void __init kasan_early_init(void) void __init kasan_init(void) { int i; + void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; #ifdef CONFIG_KASAN_INLINE register_die_notifier(&kasan_die_notifier); @@ -329,8 +330,23 @@ void __init kasan_init(void) (unsigned long)kasan_mem_to_shadow(_end), early_pfn_to_nid(__pa(_stext))); + shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + + shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - (void *)KASAN_SHADOW_END); + shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + + kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); -- cgit v1.2.3 From 7fb983b4dd569e08564134a850dfd4eb1c63d9b8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:17 +0100 Subject: x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss A future patch will move SYSENTER_stack to the beginning of cpu_tss to help detect overflow. Before this can happen, fix several code paths that hardcode assumptions about the old layout. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.722425540@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/processor.h | 9 +++++++-- arch/x86/kernel/cpu/common.c | 8 ++++---- arch/x86/kernel/doublefault.c | 36 +++++++++++++++++------------------- arch/x86/kvm/vmx.c | 2 +- arch/x86/power/cpu.c | 13 +++++++------ 6 files changed, 37 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index f6f428432a68..2ace1f90d138 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -178,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, #endif } -static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) { struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 789dad5da20f..555c9478f3df 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -162,7 +162,7 @@ enum cpuid_regs_idx { extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct tss_struct doublefault_tss; +extern struct x86_hw_tss doublefault_tss; extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS]; @@ -252,6 +252,11 @@ static inline void load_cr3(pgd_t *pgdir) write_cr3(__sme_pa(pgdir)); } +/* + * Note that while the legacy 'TSS' name comes from 'Task State Segment', + * on modern x86 CPUs the TSS also holds information important to 64-bit mode, + * unrelated to the task-switch mechanism: + */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { @@ -322,7 +327,7 @@ struct x86_hw_tss { #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 struct tss_struct { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2cb394dc4153..3f285b973f50 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1557,7 +1557,7 @@ void cpu_init(void) } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1576,7 +1576,7 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &t->x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1634,12 +1634,12 @@ void cpu_init(void) * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, t); + set_tss_desc(cpu, &t->x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void) cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), - } +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a6f4f095f8f4..2abe0073b573 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, - (unsigned long)this_cpu_ptr(&cpu_tss)); + (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 84fcfde53f8f..50593e138281 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -165,12 +165,13 @@ static void fix_processor_context(void) struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif - set_tss_desc(cpu, t); /* - * This just modifies memory; should not be - * necessary. But... This is necessary, because - * 386 hardware has concept of busy TSS or some - * similar stupidity. - */ + + /* + * This just modifies memory; should not be necessary. But... This is + * necessary, because 386 hardware has concept of busy TSS or some + * similar stupidity. + */ + set_tss_desc(cpu, &t->x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); -- cgit v1.2.3 From 6e60e583426c2f8751c22c2dfe5c207083b4483a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:18 +0100 Subject: x86/dumpstack: Handle stack overflow on all stacks We currently special-case stack overflow on the task stack. We're going to start putting special stacks in the fixmap with a custom layout, so they'll have guard pages, too. Teach the unwinder to be able to unwind an overflow of any of the stacks. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.802057305@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index a33a1373a252..64f8ed2a4827 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -112,24 +112,28 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - SYSENTER stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - SYSENTER stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) -- cgit v1.2.3 From 1a935bc3d4ea61556461a9e92a68ca3556232efd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:19 +0100 Subject: x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct SYSENTER_stack should have reliable overflow detection, which means that it needs to be at the bottom of a page, not the top. Move it to the beginning of struct tss_struct and page-align it. Also add an assertion to make sure that the fixed hardware TSS doesn't cross a page boundary. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.881827433@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 21 ++++++++++++--------- arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 555c9478f3df..759051251664 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -332,7 +332,16 @@ struct x86_hw_tss { struct tss_struct { /* - * The hardware state: + * Space for the temporary SYSENTER stack, used for SYSENTER + * and the entry trampoline as well. + */ + unsigned long SYSENTER_stack_canary; + unsigned long SYSENTER_stack[64]; + + /* + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. */ struct x86_hw_tss x86_tss; @@ -343,15 +352,9 @@ struct tss_struct { * be within the limit. */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +} __aligned(PAGE_SIZE); - /* - * Space for the temporary SYSENTER stack. - */ - unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; -} ____cacheline_aligned; - -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); /* * sizeof(unsigned long) coming from an extra "long" at the end diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f285b973f50..60b2dfd2a58b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,6 +487,27 @@ static inline void setup_cpu_entry_area(int cpu) #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + } /* Load the original GDT from the per-cpu structure */ -- cgit v1.2.3 From 72f5e08dbba2d01aa90b592cf76c378ea233b00b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:20 +0100 Subject: x86/entry: Remap the TSS into the CPU entry area This has a secondary purpose: it puts the entry stack into a region with a well-controlled layout. A subsequent patch will take advantage of this to streamline the SYSCALL entry code to be able to find it more easily. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150605.962042855@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 6 ++++-- arch/x86/include/asm/fixmap.h | 7 +++++++ arch/x86/kernel/asm-offsets.c | 3 +++ arch/x86/kernel/cpu/common.c | 41 +++++++++++++++++++++++++++++++++++------ arch/x86/kernel/dumpstack.c | 3 ++- arch/x86/kvm/vmx.c | 2 +- arch/x86/power/cpu.c | 11 ++++++----- 7 files changed, 58 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4838037f97f6..0ab316c46806 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -941,7 +941,8 @@ ENTRY(debug) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -984,7 +985,8 @@ ENTRY(nmi) movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b61f0242f9d0..84558b611ad3 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -54,6 +54,13 @@ extern unsigned long __FIXADDR_TOP; */ struct cpu_entry_area { char gdt[PAGE_SIZE]; + + /* + * The GDT is just below cpu_tss and thus serves (on x86_64) as a + * a read-only guard page for the SYSENTER stack at the bottom + * of the TSS region. + */ + struct tss_struct tss; }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b275863128eb..55858b277cf6 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -98,4 +98,7 @@ void common(void) { OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 60b2dfd2a58b..e5837bd6c672 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,6 +466,22 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, + int pages, pgprot_t prot) +{ + int i; + + for (i = 0; i < pages; i++) { + __set_fixmap(fixmap_index - i, + per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); + } +} + +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +#endif + /* Setup the fixmap mappings only once per-processor */ static inline void setup_cpu_entry_area(int cpu) { @@ -507,7 +523,15 @@ static inline void setup_cpu_entry_area(int cpu) */ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), + &per_cpu(cpu_tss, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, + PAGE_KERNEL); +#ifdef CONFIG_X86_32 + this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); +#endif } /* Load the original GDT from the per-cpu structure */ @@ -1257,7 +1281,8 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + (unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); @@ -1370,6 +1395,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { + int cpu = smp_processor_id(); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); @@ -1383,7 +1410,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)this_cpu_ptr(&cpu_tss) + + (unsigned long)&get_cpu_entry_area(cpu)->tss + offsetofend(struct tss_struct, SYSENTER_stack)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else @@ -1593,11 +1620,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); + setup_cpu_entry_area(cpu); + /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1610,7 +1639,6 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } @@ -1651,11 +1679,13 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); + setup_cpu_entry_area(cpu); + /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); load_mm_ldt(&init_mm); @@ -1672,7 +1702,6 @@ void cpu_init(void) fpu__init_cpu(); - setup_cpu_entry_area(cpu); load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 64f8ed2a4827..60267850125e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -45,7 +45,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss); + int cpu = smp_processor_id(); + struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; /* Treat the canary as part of the stack for unwinding purposes. */ void *begin = &tss->SYSENTER_stack_canary; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2abe0073b573..62ee4362e1c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2291,7 +2291,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, - (unsigned long)this_cpu_ptr(&cpu_tss.x86_tss)); + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 50593e138281..04d5157fe7f8 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -160,18 +160,19 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif /* - * This just modifies memory; should not be necessary. But... This is - * necessary, because 386 hardware has concept of busy TSS or some - * similar stupidity. + * We need to reload TR, which requires that we change the + * GDT entry to indicate "available" first. + * + * XXX: This could probably all be replaced by a call to + * force_reload_TR(). */ - set_tss_desc(cpu, &t->x86_tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); -- cgit v1.2.3 From 9aaefe7b59ae00605256a7d6bd1c1456432495fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:21 +0100 Subject: x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0 On 64-bit kernels, we used to assume that TSS.sp0 was the current top of stack. With the addition of an entry trampoline, this will no longer be the case. Store the current top of stack in TSS.sp1, which is otherwise unused but shares the same cacheline. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.050864668@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 18 +++++++++++++----- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/asm-offsets_64.c | 1 + arch/x86/kernel/process.c | 10 ++++++++++ arch/x86/kernel/process_64.c | 1 + 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 759051251664..b0cf0612a454 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -309,7 +309,13 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; + + /* + * We store cpu_current_top_of_stack in sp1 so it's always accessible. + * Linux does not use ring 1, so sp1 is not otherwise needed. + */ u64 sp1; + u64 sp2; u64 reserved2; u64 ist[7]; @@ -368,6 +374,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#else +#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 #endif /* @@ -539,12 +547,12 @@ static inline void native_swapgs(void) static inline unsigned long current_top_of_stack(void) { -#ifdef CONFIG_X86_64 - return this_cpu_read_stable(cpu_tss.x86_tss.sp0); -#else - /* sp0 on x86_32 is special in and around vm86 mode. */ + /* + * We can't read directly from tss.sp0: sp0 on x86_32 is special in + * and around vm86 mode and sp0 on x86_64 is special because of the + * entry trampoline. + */ return this_cpu_read_stable(cpu_current_top_of_stack); -#endif } static inline bool on_thread_stack(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 70f425947dc5..44a04999791e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) #endif #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e3a5175a444b..bf51e51d808d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,6 +66,7 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35d674157fda..86e83762e3b3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 01b119bebb68..157f81816915 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -461,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_sp0(next_p); -- cgit v1.2.3 From 6d9256f0a89eaff97fca6006100bcaea8d1d8bdb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:22 +0100 Subject: x86/espfix/64: Stop assuming that pt_regs is on the entry stack When we start using an entry trampoline, a #GP from userspace will be delivered on the entry stack, not on the task stack. Fix the espfix64 #DF fixup to set up #GP according to TSS.SP0, rather than assuming that pt_regs + 1 == SP0. This won't change anything without an entry stack, but it will make the code continue to work when an entry stack is added. While we're at it, improve the comments to explain what's actually going on. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.130778051@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d3e3bbd5d3a0..f0029d17b14b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) /* * If IRET takes a non-IST fault on the espfix64 stack, then we - * end up promoting it to a doublefault. In that case, modify - * the stack to make it look like we just entered the #GP - * handler from user space, similar to bad_iret. + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. * * No need for ist_enter here because we don't use RCU. */ @@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *normal_regs = task_pt_regs(current); + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ - /* Fake a #GP(0) from userspace. */ - memmove(&normal_regs->ip, (void *)regs->sp, 5*8); - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + */ regs->ip = (unsigned long)general_protection; - regs->sp = (unsigned long)&normal_regs->orig_ax; + regs->sp = (unsigned long)&gpregs->orig_ax; return; } @@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being - * deliv- ered, the faulting linear address of the second fault will + * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a -- cgit v1.2.3 From 7f2590a110b837af5679d08fc25c6227c5a8c497 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:23 +0100 Subject: x86/entry/64: Use a per-CPU trampoline stack for IDT entries Historically, IDT entries from usermode have always gone directly to the running task's kernel stack. Rearrange it so that we enter on a per-CPU trampoline stack and then manually switch to the task's stack. This touches a couple of extra cachelines, but it gives us a chance to run some code before we touch the kernel stack. The asm isn't exactly beautiful, but I think that fully refactoring it can wait. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.225330557@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 67 ++++++++++++++++++++++++++++++---------- arch/x86/entry/entry_64_compat.S | 5 ++- arch/x86/include/asm/switch_to.h | 4 ++- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/cpu/common.c | 6 ++-- arch/x86/kernel/traps.c | 21 +++++++------ 6 files changed, 72 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 32306788821c..35b8e949ac2f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -560,6 +560,13 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + + testb $3, CS-ORIG_RAX(%rsp) + jz 1f + SWAPGS + call switch_to_thread_stack +1: + ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS @@ -569,12 +576,8 @@ END(irq_entries_start) jz 1f /* - * IRQ from user mode. Switch to kernel gsbase and inform context - * tracking that we're in kernel mode. - */ - SWAPGS - - /* + * IRQ from user mode. + * * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode * (which can take locks). Since TRACE_IRQS_OFF idempotent, @@ -828,6 +831,32 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +/* + * Switch to the thread stack. This is called with the IRET frame and + * orig_ax on the stack. (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) + */ +ENTRY(switch_to_thread_stack) + UNWIND_HINT_FUNC + + pushq %rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi + ret +END(switch_to_thread_stack) + .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -845,11 +874,12 @@ ENTRY(\sym) ALLOC_PT_GPREGS_ON_STACK - .if \paranoid - .if \paranoid == 1 + .if \paranoid < 2 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ - jnz 1f + jnz .Lfrom_usermode_switch_stack_\@ .endif + + .if \paranoid call paranoid_entry .else call error_entry @@ -891,20 +921,15 @@ ENTRY(\sym) jmp error_exit .endif - .if \paranoid == 1 + .if \paranoid < 2 /* - * Paranoid entry from userspace. Switch stacks and treat it + * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ -1: +.Lfrom_usermode_switch_stack_\@: call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - call sync_regs - movq %rax, %rsp /* switch stack */ - movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code @@ -1165,6 +1190,14 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. */ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + /* * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index dcc6987f9bae..95ad40eb7eff 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat) */ movl %eax, %eax - /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ + + /* switch to thread stack expects orig_ax to be pushed */ + call switch_to_thread_stack + pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9..cbc71e73bd32 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { + /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 load_sp0(task->thread.sp0); #else - load_sp0(task_top_of_stack(task)); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1fadd310ff68..31051f35cbb7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); #ifdef CONFIG_X86_64 dotraplinkage void do_double_fault(struct pt_regs *, long); -asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5837bd6c672..57968880e39b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1623,11 +1623,13 @@ void cpu_init(void) setup_cpu_entry_area(cpu); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + + offsetofend(struct tss_struct, SYSENTER_stack)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f0029d17b14b..ee9ca0ad4388 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -619,14 +619,15 @@ NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = task_pt_regs(current); - *regs = *eregs; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -642,13 +643,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault - * correctly, we want move our stack frame to task_pt_regs - * and we want to pretend that the exception came from the - * iret target. + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - container_of(task_pt_regs(current), - struct bad_iret_stack, regs); + (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); -- cgit v1.2.3 From 3e3b9293d392c577b62e24e4bc9982320438e749 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:24 +0100 Subject: x86/entry/64: Return to userspace from the trampoline stack By itself, this is useless. It gives us the ability to run some final code before exit that cannnot run on the kernel stack. This could include a CR3 switch a la PAGE_TABLE_ISOLATION or some kernel stack erasing, for example. (Or even weird things like *changing* which kernel stack gets used as an ASLR-strengthening mechanism.) The SYSRET32 path is not covered yet. It could be in the future or we could just ignore it and force the slow path if needed. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.306546484@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 55 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 35b8e949ac2f..42a9379f7acb 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -326,8 +326,24 @@ syscall_return_via_sysret: popq %rsi /* skip rcx */ popq %rdx popq %rsi + + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + popq %rdi - movq RSP-ORIG_RAX(%rsp), %rsp + popq %rsp USERGS_SYSRET64 END(entry_SYSCALL_64) @@ -630,10 +646,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) ud2 1: #endif - SWAPGS POP_EXTRA_REGS - POP_C_REGS - addq $8, %rsp /* skip regs->orig_ax */ + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + + /* Restore RDI. */ + popq %rdi + SWAPGS INTERRUPT_RETURN -- cgit v1.2.3 From 3386bc8aed825e9f1f65ce38df4b109b2019b71a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:25 +0100 Subject: x86/entry/64: Create a per-CPU SYSCALL entry trampoline Handling SYSCALL is tricky: the SYSCALL handler is entered with every single register (except FLAGS), including RSP, live. It somehow needs to set RSP to point to a valid stack, which means it needs to save the user RSP somewhere and find its own stack pointer. The canonical way to do this is with SWAPGS, which lets us access percpu data using the %gs prefix. With PAGE_TABLE_ISOLATION-like pagetable switching, this is problematic. Without a scratch register, switching CR3 is impossible, so %gs-based percpu memory would need to be mapped in the user pagetables. Doing that without information leaks is difficult or impossible. Instead, use a different sneaky trick. Map a copy of the first part of the SYSCALL asm at a different address for each CPU. Now RIP varies depending on the CPU, so we can use RIP-relative memory access to access percpu memory. By putting the relevant information (one scratch slot and the stack address) at a constant offset relative to RIP, we can make SYSCALL work without relying on %gs. A nice thing about this approach is that we can easily switch it on and off if we want pagetable switching to be configurable. The compat variant of SYSCALL doesn't have this problem in the first place -- there are plenty of scratch registers, since we don't care about preserving r8-r15. This patch therefore doesn't touch SYSCALL32 at all. This patch actually seems to be a small speedup. With this patch, SYSCALL touches an extra cache line and an extra virtual page, but the pipeline no longer stalls waiting for SWAPGS. It seems that, at least in a tight loop, the latter outweights the former. Thanks to David Laight for an optimization tip. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.403607157@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 58 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/cpu/common.c | 15 ++++++++++- arch/x86/kernel/vmlinux.lds.S | 9 +++++++ 5 files changed, 84 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 42a9379f7acb..2582984ffb4b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -136,6 +136,64 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped) entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ + SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + jmp *%rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 84558b611ad3..6a699474c2c7 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -61,6 +61,8 @@ struct cpu_entry_area { * of the TSS region. */ struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 55858b277cf6..61b1af88ac07 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -101,4 +101,5 @@ void common(void) { /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 57968880e39b..430f950b0b7f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -486,6 +486,8 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); static inline void setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + /* On 64-bit systems, we use a read-only fixmap GDT. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; #else @@ -532,6 +534,11 @@ static inline void setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_32 this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); #endif + +#ifdef CONFIG_X86_64 + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif } /* Load the original GDT from the per-cpu structure */ @@ -1395,10 +1402,16 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..d2a8b5a24a44 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -107,6 +107,15 @@ SECTIONS SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + /* End of text section */ _etext = .; } :text = 0x9090 -- cgit v1.2.3 From 40e7f949e0d9a33968ebde5d67f7e3a47c97742a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:26 +0100 Subject: x86/entry/64: Move the IST stacks into struct cpu_entry_area The IST stacks are needed when an IST exception occurs and are accessed before any kernel code at all runs. Move them into struct cpu_entry_area. The IST stacks are unlike the rest of cpu_entry_area: they're used even for entries from kernel mode. This means that they should be set up before we load the final IDT. Move cpu_entry_area setup to trap_init() for the boot CPU and set it up for all possible CPUs at once in native_smp_prepare_cpus(). Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.480598743@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 12 +++++++ arch/x86/kernel/cpu/common.c | 74 ++++++++++++++++++++++++------------------- arch/x86/kernel/traps.c | 3 ++ 3 files changed, 57 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6a699474c2c7..451da7d9a502 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -63,10 +63,22 @@ struct cpu_entry_area { struct tss_struct tss; char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 + /* + * Exception stacks used for IST entries. + * + * In the future, this should have a separate slot for each stack + * with guard pages between them. + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif }; #define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +extern void setup_cpu_entry_areas(void); + /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 430f950b0b7f..fb01a8e5e9b7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,24 +466,36 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -static void set_percpu_fixmap_pages(int fixmap_index, void *ptr, - int pages, pgprot_t prot) -{ - int i; - - for (i = 0; i < pages; i++) { - __set_fixmap(fixmap_index - i, - per_cpu_ptr_to_phys(ptr + i * PAGE_SIZE), prot); - } -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +static void __init +set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) + __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +} + /* Setup the fixmap mappings only once per-processor */ -static inline void setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 extern char _entry_trampoline[]; @@ -532,15 +544,31 @@ static inline void setup_cpu_entry_area(int cpu) PAGE_KERNEL); #ifdef CONFIG_X86_32 - this_cpu_write(cpu_entry_area, get_cpu_entry_area(cpu)); + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); #endif #ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, + PAGE_KERNEL); + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); #endif } +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); +} + /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1385,20 +1413,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1607,7 +1621,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1633,8 +1647,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - setup_cpu_entry_area(cpu); - /* * Initialize the TSS. sp0 points to the entry trampoline stack * regardless of what task is running. @@ -1694,8 +1706,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - setup_cpu_entry_area(cpu); - /* * Initialize the TSS. Don't bother initializing sp0, as the initial * task never enters user mode. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ee9ca0ad4388..3e29aad5c7cc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -947,6 +947,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + idt_setup_traps(); /* -- cgit v1.2.3 From 7fbbd5cbebf118a9e09f5453f686656a167c3d1c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:27 +0100 Subject: x86/entry/64: Remove the SYSENTER stack canary Now that the SYSENTER stack has a guard page, there's no need for a canary to detect overflow after the fact. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.572577316@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/dumpstack.c | 3 +-- arch/x86/kernel/process.c | 1 - arch/x86/kernel/traps.c | 7 ------- 4 files changed, 1 insertion(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b0cf0612a454..d34ac13c5866 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -341,7 +341,6 @@ struct tss_struct { * Space for the temporary SYSENTER stack, used for SYSENTER * and the entry trampoline as well. */ - unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; /* diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 60267850125e..ae1ce2e3f132 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -48,8 +48,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) int cpu = smp_processor_id(); struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; - /* Treat the canary as part of the stack for unwinding purposes. */ - void *begin = &tss->SYSENTER_stack_canary; + void *begin = &tss->SYSENTER_stack; void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); if ((void *)stack < begin || (void *)stack >= end) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 86e83762e3b3..6a04287f222b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif - .SYSENTER_stack_canary = STACK_END_MAGIC, }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3e29aad5c7cc..5ade4f89a6d1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -814,13 +814,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); - ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); -- cgit v1.2.3 From 0f9a48100fba3f189724ae88a450c2261bf91c80 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:28 +0100 Subject: x86/entry: Clean up the SYSENTER_stack code The existing code was a mess, mainly because C arrays are nasty. Turn SYSENTER_stack into a struct, add a helper to find it, and do all the obvious cleanups this enables. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.653244723@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/fixmap.h | 5 +++++ arch/x86/include/asm/processor.h | 6 +++++- arch/x86/kernel/asm-offsets.c | 6 ++---- arch/x86/kernel/cpu/common.c | 14 +++----------- arch/x86/kernel/dumpstack.c | 7 +++---- 7 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0ab316c46806..3629bcbf85a2 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2582984ffb4b..575b184f377f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + CPU_TSS_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 451da7d9a502..cc5d98bdca37 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -245,5 +245,10 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); } +static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d34ac13c5866..f933869470b8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -336,12 +336,16 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 +struct SYSENTER_stack { + unsigned long words[64]; +}; + struct tss_struct { /* * Space for the temporary SYSENTER stack, used for SYSENTER * and the entry trampoline as well. */ - unsigned long SYSENTER_stack[64]; + struct SYSENTER_stack SYSENTER_stack; /* * The fixed hardware portion. This must not cross a page boundary diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 61b1af88ac07..46c0995344aa 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,8 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb01a8e5e9b7..3de7480e4f32 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1314,12 +1314,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1436,9 +1431,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, - (unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1653,8 +1646,7 @@ void cpu_init(void) */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); - load_sp0((unsigned long)&get_cpu_entry_area(cpu)->tss + - offsetofend(struct tss_struct, SYSENTER_stack)); + load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae1ce2e3f132..bbd6d986e2d0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -45,11 +45,10 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) { - int cpu = smp_processor_id(); - struct tss_struct *tss = &get_cpu_entry_area(cpu)->tss; + struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); - void *begin = &tss->SYSENTER_stack; - void *end = (void *)&tss->SYSENTER_stack + sizeof(tss->SYSENTER_stack); + void *begin = ss; + void *end = ss + 1; if ((void *)stack < begin || (void *)stack >= end) return false; -- cgit v1.2.3 From c482feefe1aeb150156248ba0fd3e029bc886605 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:29 +0100 Subject: x86/entry/64: Make cpu_entry_area.tss read-only The TSS is a fairly juicy target for exploits, and, now that the TSS is in the cpu_entry_area, it's no longer protected by kASLR. Make it read-only on x86_64. On x86_32, it can't be RO because it's written by the CPU during task switches, and we use a task gate for double faults. I'd also be nervous about errata if we tried to make it RO even on configurations without double fault handling. [ tglx: AMD confirmed that there is no problem on 64-bit with TSS RO. So it's probably safe to assume that it's a non issue, though Intel might have been creative in that area. Still waiting for confirmation. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.733700132@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 4 ++-- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/fixmap.h | 13 +++++++++---- arch/x86/include/asm/processor.h | 17 ++++++++--------- arch/x86/include/asm/switch_to.h | 4 ++-- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/kernel/asm-offsets.c | 5 ++--- arch/x86/kernel/asm-offsets_32.c | 4 ++-- arch/x86/kernel/cpu/common.c | 29 +++++++++++++++++++---------- arch/x86/kernel/ioport.c | 2 +- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- arch/x86/lib/delay.c | 4 ++-- arch/x86/xen/enlighten_pv.c | 2 +- 16 files changed, 60 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3629bcbf85a2..bd8b57a5c874 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,7 +942,7 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Ldebug_from_sysenter_stack @@ -986,7 +986,7 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ cmpl $SIZEOF_SYSENTER_stack, %ecx jb .Lnmi_from_sysenter_stack diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 575b184f377f..2812ce043a7a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,7 +154,7 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_tss + TSS_STRUCT_SYSENTER_stack + \ +#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) @@ -390,7 +390,7 @@ syscall_return_via_sysret: * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -719,7 +719,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -934,7 +934,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) /* * Switch to the thread stack. This is called with the IRET frame and diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index cc5d98bdca37..94fc4fa14127 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -56,9 +56,14 @@ struct cpu_entry_area { char gdt[PAGE_SIZE]; /* - * The GDT is just below cpu_tss and thus serves (on x86_64) as a - * a read-only guard page for the SYSENTER stack at the bottom - * of the TSS region. + * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ + struct SYSENTER_stack_page SYSENTER_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. */ struct tss_struct tss; @@ -247,7 +252,7 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) { - return &get_cpu_entry_area(cpu)->tss.SYSENTER_stack; + return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f933869470b8..e8991d7f7034 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -340,13 +340,11 @@ struct SYSENTER_stack { unsigned long words[64]; }; -struct tss_struct { - /* - * Space for the temporary SYSENTER stack, used for SYSENTER - * and the entry trampoline as well. - */ - struct SYSENTER_stack SYSENTER_stack; +struct SYSENTER_stack_page { + struct SYSENTER_stack stack; +} __aligned(PAGE_SIZE); +struct tss_struct { /* * The fixed hardware portion. This must not cross a page boundary * at risk of violating the SDM's advice and potentially triggering @@ -363,7 +361,7 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; } __aligned(PAGE_SIZE); -DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* * sizeof(unsigned long) coming from an extra "long" at the end @@ -378,7 +376,8 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else -#define cpu_current_top_of_stack cpu_tss.x86_tss.sp1 +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif /* @@ -538,7 +537,7 @@ static inline void native_set_iopl_mask(unsigned mask) static inline void native_load_sp0(unsigned long sp0) { - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static inline void native_swapgs(void) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index cbc71e73bd32..9b6df68d8fd1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,10 +79,10 @@ do { \ static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) + if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) return; - this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); + this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 44a04999791e..00223333821a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack, #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp1) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) #endif #endif diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 46c0995344aa..cd360a5e0dca 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -94,10 +94,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); - OFFSET(TSS_STRUCT_SYSENTER_stack, tss_struct, SYSENTER_stack); - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); - /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); + DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 52ce4ea16e53..7d20d9c0b3d6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -47,8 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3de7480e4f32..c2eada1056de 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,6 +487,9 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif +static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, + SYSENTER_stack_storage); + static void __init set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) { @@ -500,23 +503,29 @@ static void __init setup_cpu_entry_area(int cpu) #ifdef CONFIG_X86_64 extern char _entry_trampoline[]; - /* On 64-bit systems, we use a read-only fixmap GDT. */ + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; #else /* * On native 32-bit systems, the GDT cannot be read-only because * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. */ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), + per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + PAGE_KERNEL); /* * The Intel SDM says (Volume 3, 7.2.1): @@ -539,9 +548,9 @@ static void __init setup_cpu_entry_area(int cpu) offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), - &per_cpu(cpu_tss, cpu), + &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, - PAGE_KERNEL); + tss_prot); #ifdef CONFIG_X86_32 per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); @@ -1305,7 +1314,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1575,7 +1584,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1667,7 +1676,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6a04287f222b..517415978409 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower @@ -82,7 +82,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -111,7 +111,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 45bf0c5f93e1..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 157f81816915..c75466232016 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -399,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5ade4f89a6d1..74136fd16f49 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -364,7 +364,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* * regs->sp points to the failing IRET frame on the @@ -649,7 +649,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss.x86_tss.sp0) - 1; + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..4846eff7e4c8 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops) delay = min_t(u64, MWAITX_MAX_LOOPS, loops); /* - * Use cpu_tss as a cacheline-aligned, seldomly + * Use cpu_tss_rw as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index fbd054d6ac97..ae3a071e1d0f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -818,7 +818,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); - this_cpu_write(cpu_tss.x86_tss.sp0, sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } void xen_set_iopl_mask(unsigned mask) -- cgit v1.2.3 From a035795499ca1c2bd1928808d1a156eda1420383 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:30 +0100 Subject: x86/paravirt: Dont patch flush_tlb_single native_flush_tlb_single() will be changed with the upcoming PAGE_TABLE_ISOLATION feature. This requires to have more code in there than INVLPG. Remove the paravirt patching for it. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Cc: michael.schwarz@iaik.tugraz.at Cc: moritz.lipp@iaik.tugraz.at Cc: richard.fellner@student.tugraz.at Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt_patch_64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): -- cgit v1.2.3 From 79cc74155218316b9a5d28577c7077b2adba8e58 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:31 +0100 Subject: x86/paravirt: Provide a way to check for hypervisors There is no generic way to test whether a kernel is running on a specific hypervisor. But that's required to prevent the upcoming user address space separation feature in certain guest modes. Make the hypervisor type enum unconditionally available and provide a helper function which allows to test for a specific type. Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.912938129@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hypervisor.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 1b0a5abcd8ae..96aa6b9884dc 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,16 +20,7 @@ #ifndef _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H -#ifdef CONFIG_HYPERVISOR_GUEST - -#include -#include -#include - -/* - * x86 hypervisor information - */ - +/* x86 hypervisor types */ enum x86_hypervisor_type { X86_HYPER_NATIVE = 0, X86_HYPER_VMWARE, @@ -39,6 +30,12 @@ enum x86_hypervisor_type { X86_HYPER_KVM, }; +#ifdef CONFIG_HYPERVISOR_GUEST + +#include +#include +#include + struct hypervisor_x86 { /* Hypervisor name */ const char *name; @@ -58,7 +55,15 @@ struct hypervisor_x86 { extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return x86_hyper_type == type; +} #else static inline void init_hypervisor_platform(void) { } +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return type == X86_HYPER_NATIVE; +} #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ -- cgit v1.2.3 From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:32 +0100 Subject: x86/cpufeatures: Make CPU bugs sticky There is currently no way to force CPU bug bits like CPU feature bits. That makes it impossible to set a bug bit once at boot and have it stick for all upcoming CPUs. Extend the force set/clear arrays to handle bug bits as well. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bf6a76202a77..ea9a7dde62e5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e8991d7f7034..da943411d3d8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -163,8 +163,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct x86_hw_tss doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c2eada1056de..034900623adf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -452,8 +452,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -812,7 +812,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } -- cgit v1.2.3 From 203c110b39a89b48156c7450504e454fedb7f7f6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 12 Dec 2017 21:32:16 +0100 Subject: parisc: Fix indenting in puts() Static analysis tools complain that we intended to have curly braces around this indent block. In this case this assumption is wrong, so fix the indenting. Fixes: 2f3c7b8137ef ("parisc: Add core code for self-extracting kernel") Reported-by: Dan Carpenter Signed-off-by: Helge Deller Cc: # v4.14+ --- arch/parisc/boot/compressed/misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 9345b44b86f0..f57118e1f6b4 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -123,8 +123,8 @@ int puts(const char *s) while ((nuline = strchr(s, '\n')) != NULL) { if (nuline != s) pdc_iodc_print(s, nuline - s); - pdc_iodc_print("\r\n", 2); - s = nuline + 1; + pdc_iodc_print("\r\n", 2); + s = nuline + 1; } if (*s != '\0') pdc_iodc_print(s, strlen(s)); -- cgit v1.2.3 From 0ed9d3de5f8f97e6efd5ca0e3377cab5f0451ead Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 12 Dec 2017 21:25:41 +0100 Subject: parisc: Align os_hpmc_size on word boundary The os_hpmc_size variable sometimes wasn't aligned at word boundary and thus triggered the unaligned fault handler at startup. Fix it by aligning it properly. Signed-off-by: Helge Deller Cc: # v4.14+ --- arch/parisc/kernel/hpmc.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S index e3a8e5e4d5de..8d072c44f300 100644 --- a/arch/parisc/kernel/hpmc.S +++ b/arch/parisc/kernel/hpmc.S @@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc) __INITRODATA + .align 4 .export os_hpmc_size os_hpmc_size: .word .os_hpmc_end-.os_hpmc -- cgit v1.2.3 From 6a16fc322085bb3163d7d6e44856adfda06a8001 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Sun, 10 Dec 2017 23:54:33 +0530 Subject: parisc: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Signed-off-by: Helge Deller --- arch/parisc/kernel/unwind.c | 1 - arch/parisc/lib/delay.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 5a657986ebbf..143f90e2f9f3 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/parisc/lib/delay.c b/arch/parisc/lib/delay.c index 7eab4bb8abe6..66e506520505 100644 --- a/arch/parisc/lib/delay.c +++ b/arch/parisc/lib/delay.c @@ -16,9 +16,7 @@ #include #include -#include #include - #include /* for mfctl() */ #include /* for boot_cpu_data */ -- cgit v1.2.3 From 9352aeada4d8d8753fc0e414fbfe8fdfcb68a12c Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Mon, 13 Nov 2017 19:35:33 -0500 Subject: Revert "parisc: Re-enable interrupts early" This reverts commit 5c38602d83e584047906b41b162ababd4db4106d. Interrupts can't be enabled early because the register saves are done on the thread stack prior to switching to the IRQ stack. This caused stack overflows and the thread stack needed increasing to 32k. Even then, stack overflows still occasionally occurred. Background: Even with a 32 kB thread stack, I have seen instances where the thread stack overflowed on the mx3210 buildd. Detection of stack overflow only occurs when we have an external interrupt. When an external interrupt occurs, we switch to the thread stack if we are not already on a kernel stack. Then, registers and specials are saved to the kernel stack. The bug occurs in intr_return where interrupts are reenabled prior to returning from the interrupt. This was done incase we need to schedule or deliver signals. However, it introduces the possibility that multiple external interrupts may occur on the thread stack and cause a stack overflow. These might not be detected and cause the kernel to misbehave in random ways. This patch changes the code back to only reenable interrupts when we are going to schedule or deliver signals. As a result, we generally return from an interrupt before reenabling interrupts. This minimizes the growth of the thread stack. Fixes: 5c38602d83e5 ("parisc: Re-enable interrupts early") Signed-off-by: John David Anglin Cc: # v4.10+ Signed-off-by: Helge Deller --- arch/parisc/kernel/entry.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index a4fd296c958e..f3cecf5117cf 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi) STREG %r19,PT_SR7(%r16) intr_return: - /* NOTE: Need to enable interrupts incase we schedule. */ - ssm PSW_SM_I, %r0 - /* check for reschedule */ mfctl %cr30,%r1 LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */ @@ -907,6 +904,11 @@ intr_check_sig: LDREG PT_IASQ1(%r16), %r20 cmpib,COND(=),n 0,%r20,intr_restore /* backward */ + /* NOTE: We need to enable interrupts if we have to deliver + * signals. We used to do this earlier but it caused kernel + * stack overflows. */ + ssm PSW_SM_I, %r0 + copy %r0, %r25 /* long in_syscall = 0 */ #ifdef CONFIG_64BIT ldo -16(%r30),%r29 /* Reference param save area */ @@ -958,6 +960,10 @@ intr_do_resched: cmpib,COND(=) 0, %r20, intr_do_preempt nop + /* NOTE: We need to enable interrupts if we schedule. We used + * to do this earlier but it caused kernel stack overflows. */ + ssm PSW_SM_I, %r0 + #ifdef CONFIG_64BIT ldo -16(%r30),%r29 /* Reference param save area */ #endif -- cgit v1.2.3 From da57c5414f49ef9e4bcb9ae0bbafd1d650b31411 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Mon, 13 Nov 2017 19:35:33 -0500 Subject: parisc: Reduce thread stack to 16 kb In testing, I found that the thread stack can be 16 kB when using an irq stack. Without it, the thread stack needs to be 32 kB. Currently, the irq stack is 32 kB. While it probably could be 16 kB, I would prefer to leave it as is for safety. Signed-off-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/include/asm/thread_info.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index c980a02a52bc..598c8d60fa5e 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -35,7 +35,12 @@ struct thread_info { /* thread information allocation */ +#ifdef CONFIG_IRQSTACKS +#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */ +#else #define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */ +#endif + /* Be sure to hunt all references to this down when you change the size of * the kernel stack */ #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -- cgit v1.2.3 From 36b0cb84ee858f02c256d26f0cb4229c78e3399e Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 1 Dec 2017 03:51:04 +0100 Subject: ARM: 8731/1: Fix csum_partial_copy_from_user() stack mismatch An additional 'ip' will be pushed to the stack, for restoring the DACR later, if CONFIG_CPU_SW_DOMAIN_PAN defined. However, the fixup still get the err_ptr by add #8*4 to sp, which results in the fact that the code area pointed by the LR will be overwritten, or the kernel will crash if CONFIG_DEBUG_RODATA is enabled. This patch fixes the stack mismatch. Fixes: a5e090acbf54 ("ARM: software-based priviledged-no-access support") Signed-off-by: Lvqiang Huang Signed-off-by: Chunyan Zhang Signed-off-by: Russell King --- arch/arm/lib/csumpartialcopyuser.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index 1712f132b80d..b83fdc06286a 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -85,7 +85,11 @@ .pushsection .text.fixup,"ax" .align 4 9001: mov r4, #-EFAULT +#ifdef CONFIG_CPU_SW_DOMAIN_PAN + ldr r5, [sp, #9*4] @ *err_ptr +#else ldr r5, [sp, #8*4] @ *err_ptr +#endif str r4, [r5] ldmia sp, {r1, r2} @ retrieve dst, len add r2, r2, r1 -- cgit v1.2.3 From 57358ba9564a0520f870dc14a0f91e7dacc18236 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 17 Dec 2017 14:43:15 -0800 Subject: xtensa: use generic strncpy_from_user with KASAN This enables KASAN check of the destination buffer. Signed-off-by: Max Filippov --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/uaccess.h | 7 +++++++ arch/xtensa/kernel/xtensa_ksyms.c | 2 ++ 3 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index f9f95d6e8da8..e2afffb71a6b 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,6 +15,7 @@ config XTENSA select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select GENERIC_STRNCPY_FROM_USER if KASAN select HAVE_ARCH_KASAN if MMU select HAVE_CC_STACKPROTECTOR select HAVE_DEBUG_KMEMLEAK diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index 18bbe1caad94..f1158b4c629c 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -44,6 +44,8 @@ #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) #define access_ok(type, addr, size) __access_ok((unsigned long)(addr), (size)) +#define user_addr_max() (uaccess_kernel() ? ~0UL : TASK_SIZE) + /* * These are the main single-value transfer routines. They * automatically use the right size if we just have the right pointer @@ -277,6 +279,8 @@ clear_user(void *addr, unsigned long size) #define __clear_user __xtensa_clear_user +#ifndef CONFIG_GENERIC_STRNCPY_FROM_USER + extern long __strncpy_user(char *, const char *, long); static inline long @@ -286,6 +290,9 @@ strncpy_from_user(char *dst, const char *src, long count) return __strncpy_user(dst, src, count); return -EFAULT; } +#else +long strncpy_from_user(char *dst, const char *src, long count); +#endif /* * Return the size of a string (including the ending 0!) diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 3a443f83ae87..04f19de46700 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -44,7 +44,9 @@ EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__memset); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(__memmove); +#ifndef CONFIG_GENERIC_STRNCPY_FROM_USER EXPORT_SYMBOL(__strncpy_user); +#endif EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(copy_page); -- cgit v1.2.3 From fed566ca44ce99ff4604e3af941049f9a6bba405 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 10 Dec 2017 20:15:37 -0800 Subject: xtensa: print kernel sections info in mem_init Output virtual addresses and sizes occupied by the main kernel sections: .text, .rodata, .data, .init and .bss. Signed-off-by: Max Filippov --- arch/xtensa/mm/init.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 0d980f05da82..d776ec0d7b22 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -110,7 +110,12 @@ void __init mem_init(void) " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" #endif - " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n", + " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n" + " .text : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .rodata : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .data : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .init : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .bss : 0x%08lx - 0x%08lx (%5lu kB)\n", #ifdef CONFIG_KASAN KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, KASAN_SHADOW_SIZE >> 20, @@ -129,7 +134,17 @@ void __init mem_init(void) #else min_low_pfn * PAGE_SIZE, max_low_pfn * PAGE_SIZE, #endif - ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20); + ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20, + (unsigned long)_text, (unsigned long)_etext, + (unsigned long)(_etext - _text) >> 10, + (unsigned long)__start_rodata, (unsigned long)_sdata, + (unsigned long)(_sdata - __start_rodata) >> 10, + (unsigned long)_sdata, (unsigned long)_edata, + (unsigned long)(_edata - _sdata) >> 10, + (unsigned long)__init_begin, (unsigned long)__init_end, + (unsigned long)(__init_end - __init_begin) >> 10, + (unsigned long)__bss_start, (unsigned long)__bss_stop, + (unsigned long)(__bss_stop - __bss_start) >> 10); } #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.3 From bfe766cf65fb65e68c4764f76158718560bdcee5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Wed, 6 Dec 2017 17:09:49 +0000 Subject: arm64: kvm: Prevent restoring stale PMSCR_EL1 for vcpu When VHE is not present, KVM needs to save and restores PMSCR_EL1 when possible. If SPE is used by the host, value of PMSCR_EL1 cannot be saved for the guest. If the host starts using SPE between two save+restore on the same vcpu, restore will write the value of PMSCR_EL1 read during the first save. Make sure __debug_save_spe_nvhe clears the value of the saved PMSCR_EL1 when the guest cannot use SPE. Signed-off-by: Julien Thierry Cc: Christoffer Dall Cc: Marc Zyngier Cc: Catalin Marinas Cc: Reviewed-by: Will Deacon Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/debug-sr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 321c9c05dd9e..f4363d40e2cd 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1) { u64 reg; + /* Clear pmscr in case of early return */ + *pmscr_el1 = 0; + /* SPE present on this CPU? */ if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1), ID_AA64DFR0_PMSVER_SHIFT)) -- cgit v1.2.3 From e39d200fa5bf5b94a0948db0dae44c1b73b84a56 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Dec 2017 17:40:50 -0800 Subject: KVM: Fix stack-out-of-bounds read in write_mmio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm] Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #18 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0xab/0xe1 print_address_description+0x6b/0x290 kasan_report+0x28a/0x370 write_mmio+0x11e/0x270 [kvm] emulator_read_write_onepage+0x311/0x600 [kvm] emulator_read_write+0xef/0x240 [kvm] emulator_fix_hypercall+0x105/0x150 [kvm] em_hypercall+0x2b/0x80 [kvm] x86_emulate_insn+0x2b1/0x1640 [kvm] x86_emulate_instruction+0x39a/0xb90 [kvm] handle_exception+0x1b4/0x4d0 [kvm_intel] vcpu_enter_guest+0x15a0/0x2640 [kvm] kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm] kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall) to the guest memory, however, write_mmio tracepoint always prints 8 bytes through *(u64 *)val since kvm splits the mmio access into 8 bytes. This leaks 5 bytes from the kernel stack (CVE-2017-17741). This patch fixes it by just accessing the bytes which we operate on. Before patch: syz-executor-5567 [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f After patch: syz-executor-13416 [002] .... 51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f Reported-by: Dmitry Vyukov Reviewed-by: Darren Kenny Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- include/trace/events/kvm.h | 7 +++++-- virt/kvm/arm/mmio.c | 6 +++--- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a82f2d4333b..1cec2c62a0b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; @@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, val); vcpu->mmio_read_completed = 0; return 1; } @@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); return X86EMUL_IO_NEEDED; } diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index e4b0b8e09932..2c735a3e6613 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq, { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, - TP_PROTO(int type, int len, u64 gpa, u64 val), + TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( @@ -225,7 +225,10 @@ TRACE_EVENT(kvm_mmio, __entry->type = type; __entry->len = len; __entry->gpa = gpa; - __entry->val = val; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index b6e715fd3c90..dac7ceb1a677 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -112,7 +112,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) } trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - data); + &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } @@ -182,14 +182,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, 0); + fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); -- cgit v1.2.3 From 11cf887728a3d1de77cc12ce247b64ef32608891 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 18 Dec 2017 12:37:12 +0100 Subject: x86/MCE/AMD: Define a function to get SMCA bank type Scalable MCA systems have various types of banks. The bank's type can determine how we handle errors from it. For example, if a bank represents a UMC (Unified Memory Controller) then we will need to convert its address from a normalized address to a system physical address before handling the error. [ bp: Verify m->bank is within range and use bank pointer. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20171207203955.118171-1-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a38ab1fa53a2..661c4738be27 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); +static enum smca_bank_types smca_get_bank_type(struct mce *m) +{ + struct smca_bank *b; + + if (m->bank >= N_SMCA_BANK_TYPES) + return N_SMCA_BANK_TYPES; + + b = &smca_banks[m->bank]; + if (!b->hwid) + return N_SMCA_BANK_TYPES; + + return b->hwid->bank_type; +} + static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ -- cgit v1.2.3 From c6708d50f166bea2d763c96485d31fdbc50204f1 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 18 Dec 2017 12:37:13 +0100 Subject: x86/MCE: Report only DRAM ECC as memory errors on AMD systems The MCA_STATUS[ErrorCodeExt] field is very bank type specific. We currently check if the ErrorCodeExt value is 0x0 or 0x8 in mce_is_memory_error(), but we don't check the bank number. This means that we could flag non-memory errors as memory errors. We know that we want to flag DRAM ECC errors as memory errors, so let's do those cases first. We can add more cases later when needed. Define a wrapper function in mce_amd.c so we can use SMCA enums. [ bp: Remove brackets around return statements. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20171207203955.118171-2-Yazen.Ghannam@amd.com --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/mce.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b1e8d8db921f..96ea4b5ba658 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -376,6 +376,7 @@ struct smca_bank { extern struct smca_bank smca_banks[MAX_NR_BANKS]; extern const char *smca_get_long_name(enum smca_bank_types t); +extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); @@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu); static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; +static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1d616d08eee..321c7a80be66 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m) bool mce_is_memory_error(struct mce *m) { if (m->cpuvendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + return amd_mce_is_memory_error(m); - return (xec == 0x0 || xec == 0x8); } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 661c4738be27..0f32ad242324 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -754,6 +754,17 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); +bool amd_mce_is_memory_error(struct mce *m) +{ + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + if (mce_flags.smca) + return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; + + return m->bank == 4 && xec == 0x8; +} + static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; -- cgit v1.2.3 From 179eb850ac57c06edaed67fc744ba9d902172f96 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 18 Dec 2017 12:37:14 +0100 Subject: x86/MCE: Make correctable error detection look at the Deferred bit AMD systems may log Deferred errors. These are errors that are uncorrected but which do not need immediate action. The MCA_STATUS[UC] bit may not be set for Deferred errors. Flag the error as not correctable when MCA_STATUS[Deferred] is set and do not feed it into the Correctable Errors Collector. [ bp: Massage commit message. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20171212165143.27475-1-Yazen.Ghannam@amd.com --- arch/x86/kernel/cpu/mcheck/mce.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 321c7a80be66..1b2c11473376 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -528,6 +528,17 @@ bool mce_is_memory_error(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_memory_error); +static bool mce_is_correctable(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + static bool cec_add_mce(struct mce *m) { if (!m) @@ -535,7 +546,7 @@ static bool cec_add_mce(struct mce *m) /* We eat only correctable DRAM errors with usable addresses. */ if (mce_is_memory_error(m) && - !(m->status & MCI_STATUS_UC) && + mce_is_correctable(m) && mce_usable_address(m)) if (!cec_add_elem(m->addr >> PAGE_SHIFT)) return true; -- cgit v1.2.3 From 9d5f38ba6c82359b7cec31fb27fb78ecc02f3946 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 15 Dec 2017 10:20:12 -0600 Subject: x86/mm: Unbreak modules that use the DMA API Commit d8aa7eea78a1 ("x86/mm: Add Secure Encrypted Virtualization (SEV) support") changed sme_active() from an inline function that referenced sme_me_mask to a non-inlined function in order to make the sev_enabled variable a static variable. This function was marked EXPORT_SYMBOL_GPL because at the time the patch was submitted, sme_me_mask was marked EXPORT_SYMBOL_GPL. Commit 87df26175e67 ("x86/mm: Unbreak modules that rely on external PAGE_KERNEL availability") changed sme_me_mask variable from EXPORT_SYMBOL_GPL to EXPORT_SYMBOL, allowing external modules the ability to build with CONFIG_AMD_MEM_ENCRYPT=y. Now, however, with sev_active() no longer an inline function and marked as EXPORT_SYMBOL_GPL, external modules that use the DMA API are once again broken in 4.15. Since the DMA API is meant to be used by external modules, this needs to be changed. Change the sme_active() and sev_active() functions from EXPORT_SYMBOL_GPL to EXPORT_SYMBOL. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brijesh Singh Link: https://lkml.kernel.org/r/20171215162011.14125.7113.stgit@tlendack-t1.amdoffice.net --- arch/x86/mm/mem_encrypt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index d9a9e9fc75dd..391b13402e40 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -405,13 +405,13 @@ bool sme_active(void) { return sme_me_mask && !sev_enabled; } -EXPORT_SYMBOL_GPL(sme_active); +EXPORT_SYMBOL(sme_active); bool sev_active(void) { return sme_me_mask && sev_enabled; } -EXPORT_SYMBOL_GPL(sev_active); +EXPORT_SYMBOL(sev_active); static const struct dma_map_ops sev_dma_ops = { .alloc = sev_alloc, -- cgit v1.2.3 From ca26cffa4e4aaeb09bb9e308f95c7835cb149248 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 4 Dec 2017 13:08:47 -0300 Subject: x86/asm: Allow again using asm.h when building for the 'bpf' clang target Up to f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") we were able to use x86 headers to build to the 'bpf' clang target, as done by the BPF code in tools/perf/. With that commit, we ended up with following failure for 'perf test LLVM', this is because "clang ... -target bpf ..." fails since 4.0 does not have bpf inline asm support and 6.0 does not recognize the register 'esp', fix it by guarding that part with an #ifndef __BPF__, that is defined by clang when building to the "bpf" target. # perf test -v LLVM 37: LLVM search and compile : 37.1: Basic BPF llvm compile : --- start --- test child forked, pid 25526 Kernel build dir is set to /lib/modules/4.14.0+/build set env: KBUILD_DIR=/lib/modules/4.14.0+/build unset env: KBUILD_OPTS include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: NR_CPUS=4 set env: LINUX_VERSION_CODE=0x40e00 set env: CLANG_EXEC=/usr/local/bin/clang set env: CLANG_OPTIONS=-xc set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: WORKING_DIR=/lib/modules/4.14.0+/build set env: CLANG_SOURCE=- llvm compiling command template: echo '/* * bpf-script-example.c * Test basic LLVM building */ #ifndef LINUX_VERSION_CODE # error Need LINUX_VERSION_CODE # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' #endif #define BPF_ANY 0 #define BPF_MAP_TYPE_ARRAY 2 #define BPF_FUNC_map_lookup_elem 1 #define BPF_FUNC_map_update_elem 2 static void *(*bpf_map_lookup_elem)(void *map, void *key) = (void *) BPF_FUNC_map_lookup_elem; static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) = (void *) BPF_FUNC_map_update_elem; struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def SEC("maps") flip_table = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; SEC("func=SyS_epoll_wait") int bpf_func__SyS_epoll_wait(void *ctx) { int ind =0; int *flag = bpf_map_lookup_elem(&flip_table, &ind); int new_flag; if (!flag) return 0; /* flip flag and store back */ new_flag = !*flag; bpf_map_update_elem(&flip_table, &ind, &new_flag, BPF_ANY); return new_flag; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o - test child finished with 0 ---- end ---- LLVM search and compile subtest 0: Ok 37.2: kbuild searching : --- start --- test child forked, pid 25950 Kernel build dir is set to /lib/modules/4.14.0+/build set env: KBUILD_DIR=/lib/modules/4.14.0+/build unset env: KBUILD_OPTS include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: NR_CPUS=4 set env: LINUX_VERSION_CODE=0x40e00 set env: CLANG_EXEC=/usr/local/bin/clang set env: CLANG_OPTIONS=-xc set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/7/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h set env: WORKING_DIR=/lib/modules/4.14.0+/build set env: CLANG_SOURCE=- llvm compiling command template: echo '/* * bpf-script-test-kbuild.c * Test include from kernel header */ #ifndef LINUX_VERSION_CODE # error Need LINUX_VERSION_CODE # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' #endif #define SEC(NAME) __attribute__((section(NAME), used)) #include #include SEC("func=vfs_llseek") int bpf_func__vfs_llseek(void *ctx) { return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o - In file included from :12: In file included from /home/acme/git/linux/arch/x86/include/uapi/asm/ptrace.h:5: In file included from /home/acme/git/linux/include/linux/compiler.h:242: In file included from /home/acme/git/linux/arch/x86/include/asm/barrier.h:5: In file included from /home/acme/git/linux/arch/x86/include/asm/alternative.h:10: /home/acme/git/linux/arch/x86/include/asm/asm.h:145:50: error: unknown register name 'esp' in asm register unsigned long current_stack_pointer asm(_ASM_SP); ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:44:18: note: expanded from macro '_ASM_SP' #define _ASM_SP __ASM_REG(sp) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:27:32: note: expanded from macro '__ASM_REG' #define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:18:29: note: expanded from macro '__ASM_SEL_RAW' # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) ^ /home/acme/git/linux/arch/x86/include/asm/asm.h:11:32: note: expanded from macro '__ASM_FORM_RAW' # define __ASM_FORM_RAW(x) #x ^ :4:1: note: expanded from here "esp" ^ 1 error generated. ERROR: unable to compile - Hint: Check error message shown above. Hint: You can also pre-compile it into .o using: clang -target bpf -O2 -c - with proper -I and -D options. Failed to compile test case: 'kbuild searching' test child finished with -1 ---- end ---- LLVM search and compile subtest 1: FAILED! Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Daniel Borkmann Cc: David Ahern Cc: Dmitriy Vyukov Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wang Nan Cc: Yonghong Song Link: https://lkml.kernel.org/r/20171128175948.GL3298@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/asm/asm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..386a6900e206 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,6 +136,7 @@ #endif #ifndef __ASSEMBLY__ +#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -145,5 +146,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#endif #endif /* _ASM_X86_ASM_H */ -- cgit v1.2.3 From 182dc9c7f217146d69d9c0b75c150c0314b9b170 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 18 Dec 2017 16:33:36 +1100 Subject: powerpc/kernel: Print actual address of regs when oopsing When we oops or otherwise call show_regs() we print the address of the regs structure. Being able to see the address is fairly useful, firstly to verify that the regs pointer is not completely bogus, and secondly it allows you to dump the regs and surrounding memory with a debugger if you have one. In the normal case the regs will be located somewhere on the stack, so printing their location discloses no further information than printing the stack pointer does already. So switch to %px and print the actual address, not the hashed value. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5acb5a176dbe..72be0c32e902 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs) printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); - printk("REGS: %p TRAP: %04lx %s (%s)\n", + printk("REGS: %px TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", regs->msr); print_msr_bits(regs->msr); -- cgit v1.2.3 From 6454b3bdd138dfc640deb5e7b9a0668fca2d55dd Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Dec 2017 15:13:44 -0600 Subject: x86/stacktrace: Make zombie stack traces reliable Commit: 1959a60182f4 ("x86/dumpstack: Pin the target stack when dumping it") changed the behavior of stack traces for zombies. Before that commit, /proc//stack reported the last execution path of the zombie before it died: [] do_exit+0x6f7/0xa80 [] do_group_exit+0x39/0xa0 [] __wake_up_parent+0x0/0x30 [] system_call_fastpath+0x16/0x1b [<00007fd128f9c4f9>] 0x7fd128f9c4f9 [] 0xffffffffffffffff After the commit, it just reports an empty stack trace. The new behavior is actually probably more correct. If the stack refcount has gone down to zero, then the task has already gone through do_exit() and isn't going to run anymore. The stack could be freed at any time and is basically gone, so reporting an empty stack makes sense. However, save_stack_trace_tsk_reliable() treats such a missing stack condition as an error. That can cause livepatch transition stalls if there are any unreaped zombies. Instead, just treat it as a reliable, empty stack. Reported-and-tested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Fixes: af085d9084b4 ("stacktrace/x86: add function for detecting reliable stack traces") Link: http://lkml.kernel.org/r/e4b09e630e99d0c1080528f0821fc9d9dbaeea82.1513631620.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 77835bc021c7..20161ef53537 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -164,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk, { int ret; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ if (!try_get_task_stack(tsk)) - return -EINVAL; + return 0; ret = __save_stack_trace_reliable(trace, tsk); -- cgit v1.2.3 From eac6a3639decefcc8eb0941dd3cebe79993670ad Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 7 Dec 2017 16:58:59 +0100 Subject: ARM: dts: sun8i: a711: Reinstate the PMIC compatible When we added the regulator support in commit 90c5d7cdae64 ("ARM: dts: sun8i: a711: Add regulator support"), we also dropped the PMIC's compatible. Since it's not in the PMIC DTSI, unlike most other PMIC DTSI, it obviously wasn't probing anymore. Re-add it so that everything works again. Fixes: 90c5d7cdae64 ("ARM: dts: sun8i: a711: Add regulator support") Reviewed-by: Chen-Yu Tsai Signed-off-by: Maxime Ripard --- arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts index 98715538932f..a021ee6da396 100644 --- a/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts +++ b/arch/arm/boot/dts/sun8i-a83t-tbs-a711.dts @@ -146,6 +146,7 @@ status = "okay"; axp81x: pmic@3a3 { + compatible = "x-powers,axp813"; reg = <0x3a3>; interrupt-parent = <&r_intc>; interrupts = <0 IRQ_TYPE_LEVEL_LOW>; -- cgit v1.2.3 From 2cc42bac1c795f75fcc062b95c6ca7ac1b84d5d8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 18 Dec 2017 09:37:45 -0700 Subject: x86-64/Xen: eliminate W+X mappings A few thousand such pages are usually left around due to the re-use of L1 tables having been provided by the hypervisor (Dom0) or tool stack (DomU). Set NX in the direct map variant, which needs to be done in L2 due to the dual use of the re-used L1s. For x86_configure_nx() to actually do what it is supposed to do, call get_cpu_cap() first. This was broken by commit 4763ed4d45 ("x86, mm: Clean up and simplify NX enablement") when switching away from the direct EFER read. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/enlighten_pv.c | 3 +++ arch/x86/xen/mmu_pv.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 69b9deff7e5c..86f26ea99324 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -87,6 +87,8 @@ #include "multicalls.h" #include "pmu.h" +#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ + void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); @@ -1249,6 +1251,7 @@ asmlinkage __visible void __init xen_start_kernel(void) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); x86_configure_nx(); /* Get mfn list */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 9d9cc3870722..7118f776cd49 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1916,6 +1916,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* + * Zap execute permission from the ident map. Due to the sharing of + * L1 entries we need to do this in the L2. + */ + if (__supported_pte_mask & _PAGE_NX) { + for (i = 0; i < PTRS_PER_PMD; ++i) { + if (pmd_none(level2_ident_pgt[i])) + continue; + level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX); + } + } + /* Copy the initial P->M table mappings if necessary. */ i = pgd_index(xen_start_info->mfn_list); if (i && i < pgd_index(__START_KERNEL_map)) -- cgit v1.2.3 From 10a7e9d849150a2879efc0b04d8a51068c9dd0c5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 19 Dec 2017 13:52:23 -0800 Subject: Do not hash userspace addresses in fault handlers The hashing of %p was designed to restrict kernel addresses. There is no reason to hash the userspace values seen during a segfault report, so switch these to %px. (Some architectures already use %lx.) Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- arch/sparc/mm/fault_32.c | 2 +- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index be3136f142a9..a8103a84b4ac 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->pc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 815c03d7a765..41363f46797b 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->tpc, (void *)regs->u_regs[UREG_I7], diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 4e6fcb32620f..428644175956 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs) if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi), (void *)UPT_IP(regs), (void *)UPT_SP(regs), diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index febf6980e653..06fe3d51d385 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); -- cgit v1.2.3 From b67336eee3fcb8ecedc6c13e2bf88aacfa3151e2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Nov 2017 09:33:03 +0000 Subject: MIPS: Validate PR_SET_FP_MODE prctl(2) requests against the ABI of the task Fix an API loophole introduced with commit 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS"), where the caller of prctl(2) is incorrectly allowed to make a change to CP0.Status.FR or CP0.Config5.FRE register bits even if CONFIG_MIPS_O32_FP64_SUPPORT has not been enabled, despite that an executable requesting the mode requested via ELF file annotation would not be allowed to run in the first place, or for n64 and n64 ABI tasks which do not have non-default modes defined at all. Add suitable checks to `mips_set_process_fp_mode' and bail out if an invalid mode change has been requested for the ABI in effect, even if the FPU hardware or emulation would otherwise allow it. Always succeed however without taking any further action if the mode requested is the same as one already in effect, regardless of whether any mode change, should it be requested, would actually be allowed for the task concerned. Signed-off-by: Maciej W. Rozycki Fixes: 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS") Reviewed-by: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # 4.0+ Patchwork: https://patchwork.linux-mips.org/patch/17800/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 45d0b6b037ee..57028d49c202 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) struct task_struct *t; int max_users; + /* If nothing to change, return right away, successfully. */ + if (value == mips_get_process_fp_mode(task)) + return 0; + + /* Only accept a mode change if 64-bit FP enabled for o32. */ + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) + return -EOPNOTSUPP; + + /* And only for o32 tasks. */ + if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS)) + return -EOPNOTSUPP; + /* Check the value is valid */ if (value & ~known_bits) return -EOPNOTSUPP; -- cgit v1.2.3 From b3cf8528bb21febb650a7ecbf080d0647be40b9f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 12 Dec 2017 15:08:21 -0500 Subject: xen/balloon: Mark unallocated host memory as UNUSABLE Commit f5775e0b6116 ("x86/xen: discard RAM regions above the maximum reservation") left host memory not assigned to dom0 as available for memory hotplug. Unfortunately this also meant that those regions could be used by others. Specifically, commit fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") may try to map those addresses as MMIO. To prevent this mark unallocated host memory as E820_TYPE_UNUSABLE (thus effectively reverting f5775e0b6116) and keep track of that region as a hostmem resource that can be used for the hotplug. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- arch/x86/xen/enlighten.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/setup.c | 6 ++-- drivers/xen/balloon.c | 65 ++++++++++++++++++++++++++++++++------ include/xen/balloon.h | 5 +++ 4 files changed, 144 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d669e9d89001..c9081c6671f0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,8 +1,12 @@ +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +#include +#endif #include #include #include #include +#include #include #include @@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +void __init arch_xen_balloon_init(struct resource *hostmem_resource) +{ + struct xen_memory_map memmap; + int rc; + unsigned int i, last_guest_ram; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + struct e820_table *xen_e820_table; + const struct e820_entry *entry; + struct resource *res; + + if (!xen_initial_domain()) + return; + + xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL); + if (!xen_e820_table) + return; + + memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table->entries); + rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap); + if (rc) { + pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc); + goto out; + } + + last_guest_ram = 0; + for (i = 0; i < memmap.nr_entries; i++) { + if (xen_e820_table->entries[i].addr >= max_addr) + break; + if (xen_e820_table->entries[i].type == E820_TYPE_RAM) + last_guest_ram = i; + } + + entry = &xen_e820_table->entries[last_guest_ram]; + if (max_addr >= entry->addr + entry->size) + goto out; /* No unallocated host RAM. */ + + hostmem_resource->start = max_addr; + hostmem_resource->end = entry->addr + entry->size; + + /* + * Mark non-RAM regions between the end of dom0 RAM and end of host RAM + * as unavailable. The rest of that region can be used for hotplug-based + * ballooning. + */ + for (; i < memmap.nr_entries; i++) { + entry = &xen_e820_table->entries[i]; + + if (entry->type == E820_TYPE_RAM) + continue; + + if (entry->addr >= hostmem_resource->end) + break; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + goto out; + + res->name = "Unavailable host RAM"; + res->start = entry->addr; + res->end = (entry->addr + entry->size < hostmem_resource->end) ? + entry->addr + entry->size : hostmem_resource->end; + rc = insert_resource(hostmem_resource, res); + if (rc) { + pr_warn("%s: Can't insert [%llx - %llx) (%d)\n", + __func__, res->start, res->end, rc); + kfree(res); + goto out; + } + } + + out: + kfree(xen_e820_table); +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ac55c02f98e9..e9011e1ee3de 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -807,7 +807,6 @@ char * __init xen_memory_setup(void) addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; while (i < xen_e820_table.nr_entries) { - bool discard = false; chunk_size = size; type = xen_e820_table.entries[i].type; @@ -823,11 +822,10 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - discard = true; + type = E820_TYPE_UNUSABLE; } - if (!discard) - xen_align_and_add_e820_region(addr, chunk_size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f77e499afddd..065f0b607373 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -257,10 +257,25 @@ static void release_memory_resource(struct resource *resource) kfree(resource); } +/* + * Host memory not allocated to dom0. We can use this range for hotplug-based + * ballooning. + * + * It's a type-less resource. Setting IORESOURCE_MEM will make resource + * management algorithms (arch_remove_reservations()) look into guest e820, + * which we don't want. + */ +static struct resource hostmem_resource = { + .name = "Host RAM", +}; + +void __attribute__((weak)) __init arch_xen_balloon_init(struct resource *res) +{} + static struct resource *additional_memory_resource(phys_addr_t size) { - struct resource *res; - int ret; + struct resource *res, *res_hostmem; + int ret = -ENOMEM; res = kzalloc(sizeof(*res), GFP_KERNEL); if (!res) @@ -269,13 +284,42 @@ static struct resource *additional_memory_resource(phys_addr_t size) res->name = "System RAM"; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - ret = allocate_resource(&iomem_resource, res, - size, 0, -1, - PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); - if (ret < 0) { - pr_err("Cannot allocate new System RAM resource\n"); - kfree(res); - return NULL; + res_hostmem = kzalloc(sizeof(*res), GFP_KERNEL); + if (res_hostmem) { + /* Try to grab a range from hostmem */ + res_hostmem->name = "Host memory"; + ret = allocate_resource(&hostmem_resource, res_hostmem, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + } + + if (!ret) { + /* + * Insert this resource into iomem. Because hostmem_resource + * tracks portion of guest e820 marked as UNUSABLE noone else + * should try to use it. + */ + res->start = res_hostmem->start; + res->end = res_hostmem->end; + ret = insert_resource(&iomem_resource, res); + if (ret < 0) { + pr_err("Can't insert iomem_resource [%llx - %llx]\n", + res->start, res->end); + release_memory_resource(res_hostmem); + res_hostmem = NULL; + res->start = res->end = 0; + } + } + + if (ret) { + ret = allocate_resource(&iomem_resource, res, + size, 0, -1, + PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); + if (ret < 0) { + pr_err("Cannot allocate new System RAM resource\n"); + kfree(res); + return NULL; + } } #ifdef CONFIG_SPARSEMEM @@ -287,6 +331,7 @@ static struct resource *additional_memory_resource(phys_addr_t size) pr_err("New System RAM resource outside addressable RAM (%lu > %lu)\n", pfn, limit); release_memory_resource(res); + release_memory_resource(res_hostmem); return NULL; } } @@ -765,6 +810,8 @@ static int __init balloon_init(void) set_online_page_callback(&xen_online_page); register_memory_notifier(&xen_memory_nb); register_sysctl_table(xen_root); + + arch_xen_balloon_init(&hostmem_resource); #endif #ifdef CONFIG_XEN_PV diff --git a/include/xen/balloon.h b/include/xen/balloon.h index 8906361bb50c..d0adfc78dcbd 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -43,3 +43,8 @@ static inline void xen_balloon_init(void) { } #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +struct resource; +void arch_xen_balloon_init(struct resource *hostmem_resource); +#endif -- cgit v1.2.3 From d0729bc6bee797fb4bcca87583af5adbfe79ecfb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 11 Dec 2017 21:50:25 +0900 Subject: arc: do not use __print_symbol() __print_symbol() uses extra stack space to sprintf() symbol information and then to feed that buffer to printk() char buffer[KSYM_SYMBOL_LEN]; sprint_symbol(buffer, address); printk(fmt, buffer); Replace __print_symbol() with a direct printk("%pS") call. Signed-off-by: Sergey Senozhatsky Signed-off-by: Vineet Gupta --- arch/arc/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 74315f302971..bf40e06f3fb8 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -163,7 +163,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, */ static int __print_sym(unsigned int address, void *unused) { - __print_symbol(" %s\n", address); + printk(" %pS\n", (void *)address); return 0; } -- cgit v1.2.3 From c18fc9071762769acb4040cabae45c817aefc537 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 5 Dec 2017 13:19:38 +0300 Subject: ARC: [plat-hsdk] Switch DisplayLink driver from fbdev to DRM Currently there're 2 different implementations of the driver for DisplayLink USB2.0-to-HDMI/DVI adapters: older FBDEV and modern true DRM. We initially decided to use FBDEV version just because with it /dev/fbX is usable from user-space while in DRM version with DRM_FBDEV_EMULATION user-space cannot draw anything on a real screen, for more info read [1]. But today /dev/fbX is not that important as more and more software projects switch to use of DRI (/dev/dri/cardX). But what's even more important DRM driver allows building of complicated graphics processing chains. The most important for us is rendering of 3D on a dedicated GPU while outputting video through a simpler bitstreamer like DisplayLink. So let's use much more future-proof driver from now on. [1] https://lists.freedesktop.org/archives/dri-devel/2017-December/159519.html Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/configs/hsdk_defconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 7b8f8faf8a24..ac6b0ed8341e 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -49,10 +49,11 @@ CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set +CONFIG_DRM=y +# CONFIG_DRM_FBDEV_EMULATION is not set +CONFIG_DRM_UDL=y CONFIG_FB=y -CONFIG_FB_UDL=y CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_USB=y CONFIG_USB_EHCI_HCD=y CONFIG_USB_EHCI_HCD_PLATFORM=y CONFIG_USB_OHCI_HCD=y -- cgit v1.2.3 From a08c832f277d7a6f9d3b341a5d5df2f5576220d8 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Sat, 9 Dec 2017 16:59:15 +0300 Subject: ARC: [plat-hsdk]: Set initial core pll output frequency Set initial core pll output frequency specified in device tree to 1GHz. It will be applied at the core pll driver probing. Acked-by: Stephen Boyd Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/hsdk.dts | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 8f627c200d60..006aa3de5348 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -114,6 +114,14 @@ reg = <0x00 0x10>, <0x14B8 0x4>; #clock-cells = <0>; clocks = <&input_clk>; + + /* + * Set initial core pll output frequency to 1GHz. + * It will be applied at the core pll driver probing + * on early boot. + */ + assigned-clocks = <&core_clk>; + assigned-clock-rates = <1000000000>; }; serial: serial@5000 { -- cgit v1.2.3 From 7bde846d0957fb81ac0bf8c4e2cab284a1da34e0 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Sat, 9 Dec 2017 16:59:16 +0300 Subject: ARC: [plat-hsdk]: Get rid of core pll frequency set in platform code Get rid of core pll frequency set in platform code as we set it via device tree using 'assigned-clock-rates' property. Acked-by: Stephen Boyd Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/plat-hsdk/platform.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'arch') diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index fd0ae5e38639..2958aedb649a 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -38,42 +38,6 @@ static void __init hsdk_init_per_cpu(unsigned int cpu) #define CREG_PAE (CREG_BASE + 0x180) #define CREG_PAE_UPDATE (CREG_BASE + 0x194) -#define CREG_CORE_IF_CLK_DIV (CREG_BASE + 0x4B8) -#define CREG_CORE_IF_CLK_DIV_2 0x1 -#define CGU_BASE ARC_PERIPHERAL_BASE -#define CGU_PLL_STATUS (ARC_PERIPHERAL_BASE + 0x4) -#define CGU_PLL_CTRL (ARC_PERIPHERAL_BASE + 0x0) -#define CGU_PLL_STATUS_LOCK BIT(0) -#define CGU_PLL_STATUS_ERR BIT(1) -#define CGU_PLL_CTRL_1GHZ 0x3A10 -#define HSDK_PLL_LOCK_TIMEOUT 500 - -#define HSDK_PLL_LOCKED() \ - !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK) - -#define HSDK_PLL_ERR() \ - !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR) - -static void __init hsdk_set_cpu_freq_1ghz(void) -{ - u32 timeout = HSDK_PLL_LOCK_TIMEOUT; - - /* - * As we set cpu clock which exceeds 500MHz, the divider for the interface - * clock must be programmed to div-by-2. - */ - iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV); - - /* Set cpu clock to 1GHz */ - iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL); - - while (!HSDK_PLL_LOCKED() && timeout--) - cpu_relax(); - - if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR()) - pr_err("Failed to setup CPU frequency to 1GHz!"); -} - #define SDIO_BASE (ARC_PERIPHERAL_BASE + 0xA000) #define SDIO_UHS_REG_EXT (SDIO_BASE + 0x108) #define SDIO_UHS_REG_EXT_DIV_2 (2 << 30) @@ -98,12 +62,6 @@ static void __init hsdk_init_early(void) * minimum possible div-by-2. */ iowrite32(SDIO_UHS_REG_EXT_DIV_2, (void __iomem *) SDIO_UHS_REG_EXT); - - /* - * Setup CPU frequency to 1GHz. - * TODO: remove it after smart hsdk pll driver will be introduced. - */ - hsdk_set_cpu_freq_1ghz(); } static const char *hsdk_compat[] __initconst = { -- cgit v1.2.3 From fbd1cec57064aa1380726ec899c49fcd84e702b9 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Sat, 9 Dec 2017 16:59:17 +0300 Subject: ARC: [plat-axs103]: Set initial core pll output frequency Set initial core pll output frequency specified in device tree to 100MHz for SMP configuration and 90MHz for UP configuration. It will be applied at the core pll driver probing. Update platform quirk for decreasing core frequency for quad core configuration. Acked-by: Stephen Boyd Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axc003.dtsi | 8 ++++++++ arch/arc/boot/dts/axc003_idu.dtsi | 8 ++++++++ arch/arc/plat-axs10x/axs10x.c | 8 ++------ 3 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 4e6e9f57e790..dc91c663bcc0 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -35,6 +35,14 @@ reg = <0x80 0x10>, <0x100 0x10>; #clock-cells = <0>; clocks = <&input_clk>; + + /* + * Set initial core pll output frequency to 90MHz. + * It will be applied at the core pll driver probing + * on early boot. + */ + assigned-clocks = <&core_clk>; + assigned-clock-rates = <90000000>; }; core_intc: archs-intc@cpu { diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 63954a8b0100..69ff4895f2ba 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -35,6 +35,14 @@ reg = <0x80 0x10>, <0x100 0x10>; #clock-cells = <0>; clocks = <&input_clk>; + + /* + * Set initial core pll output frequency to 100MHz. + * It will be applied at the core pll driver probing + * on early boot. + */ + assigned-clocks = <&core_clk>; + assigned-clock-rates = <100000000>; }; core_intc: archs-intc@cpu { diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index f1ac6790da5f..ac1a712f6f1f 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -320,22 +320,18 @@ static void __init axs103_early_init(void) unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F; if (num_cores > 2) { u32 freq = 50, orig; - /* - * TODO: use cpu node "cpu-freq" param instead of platform-specific - * "/cpu_card/core_clk" as it works only if we use fixed-clock for cpu. - */ int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk"); const struct fdt_property *prop; prop = fdt_get_property(initial_boot_params, off, - "clock-frequency", NULL); + "assigned-clock-rates", NULL); orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000; /* Patching .dtb in-place with new core clock value */ if (freq != orig ) { freq = cpu_to_be32(freq * 1000000); fdt_setprop_inplace(initial_boot_params, off, - "clock-frequency", &freq, sizeof(freq)); + "assigned-clock-rates", &freq, sizeof(freq)); } } #endif -- cgit v1.2.3 From d7de73b586b2db540187ff8a077330fa1a8efd64 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Sat, 9 Dec 2017 16:59:18 +0300 Subject: ARC: [plat-axs103] refactor the quad core DT quirk code Refactor the quad core DT quirk code: get rid of waste division and multiplication by 1000000 constant. Acked-by: Stephen Boyd Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/plat-axs10x/axs10x.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index ac1a712f6f1f..46544e88492d 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -317,19 +317,21 @@ static void __init axs103_early_init(void) * Instead of duplicating defconfig/DT for SMP/QUAD, add a small hack * of fudging the freq in DT */ +#define AXS103_QUAD_CORE_CPU_FREQ_HZ 50000000 + unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F; if (num_cores > 2) { - u32 freq = 50, orig; + u32 freq; int off = fdt_path_offset(initial_boot_params, "/cpu_card/core_clk"); const struct fdt_property *prop; prop = fdt_get_property(initial_boot_params, off, "assigned-clock-rates", NULL); - orig = be32_to_cpu(*(u32*)(prop->data)) / 1000000; + freq = be32_to_cpu(*(u32 *)(prop->data)); /* Patching .dtb in-place with new core clock value */ - if (freq != orig ) { - freq = cpu_to_be32(freq * 1000000); + if (freq != AXS103_QUAD_CORE_CPU_FREQ_HZ) { + freq = cpu_to_be32(AXS103_QUAD_CORE_CPU_FREQ_HZ); fdt_setprop_inplace(initial_boot_params, off, "assigned-clock-rates", &freq, sizeof(freq)); } -- cgit v1.2.3 From 79435ac78d160e4c245544d457850a56f805ac0d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 8 Dec 2017 08:26:58 -0800 Subject: ARC: uaccess: dont use "l" gcc inline asm constraint modifier This used to setup the LP_COUNT register automatically, but now has been removed. There was an earlier fix 3c7c7a2fc8811 which fixed instance in delay.h but somehow missed this one as gcc change had not made its way into production toolchains and was not pedantic as it is now ! Cc: stable@vger.kernel.org Signed-off-by: Vineet Gupta --- arch/arc/include/asm/uaccess.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index f35974ee7264..c9173c02081c 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) return 0; __asm__ __volatile__( + " mov lp_count, %5 \n" " lp 3f \n" "1: ldb.ab %3, [%2, 1] \n" " breq.d %3, 0, 3f \n" @@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) " .word 1b, 4b \n" " .previous \n" : "+r"(res), "+r"(dst), "+r"(src), "=r"(val) - : "g"(-EFAULT), "l"(count) - : "memory"); + : "g"(-EFAULT), "r"(count) + : "lp_count", "lp_start", "lp_end", "memory"); return res; } -- cgit v1.2.3 From f5a16b93e6291ba1f65f55647cb4cd8d75ed1b35 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 20 Dec 2017 12:37:54 -0800 Subject: ARC: handle gcc generated __builtin_trap() gcc toggle -fisolate-erroneous-paths-dereference (default at -O2 onwards) isolates faulty code paths such as null pointer access, divide by zero etc by emitting __builtin_trap() Newer ARC gcc generates TRAP_S 5 instruction which needs to be handled and treated like any other unexpected exception - user mode : task terminated with a SEGV - kernel mode: die() called after register and stack dump Signed-off-by: Vineet Gupta --- arch/arc/kernel/traps.c | 6 ++++++ arch/arc/kernel/troubleshoot.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index bcd7c9fc5d0f..004f4e4a4c10 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -83,6 +83,7 @@ DO_ERROR_INFO(SIGILL, "Illegal Insn (or Seq)", insterror_is_error, ILL_ILLOPC) DO_ERROR_INFO(SIGBUS, "Invalid Mem Access", __weak do_memory_error, BUS_ADRERR) DO_ERROR_INFO(SIGTRAP, "Breakpoint Set", trap_is_brkpt, TRAP_BRKPT) DO_ERROR_INFO(SIGBUS, "Misaligned Access", do_misaligned_error, BUS_ADRALN) +DO_ERROR_INFO(SIGSEGV, "gcc generated __builtin_trap", do_trap5_error, 0) /* * Entry Point for Misaligned Data access Exception, for emulating in software @@ -115,6 +116,8 @@ void do_machine_check_fault(unsigned long address, struct pt_regs *regs) * Thus TRAP_S can be used for specific purpose * -1 used for software breakpointing (gdb) * -2 used by kprobes + * -5 __builtin_trap() generated by gcc (2018.03 onwards) for toggle such as + * -fno-isolate-erroneous-paths-dereference */ void do_non_swi_trap(unsigned long address, struct pt_regs *regs) { @@ -134,6 +137,9 @@ void do_non_swi_trap(unsigned long address, struct pt_regs *regs) kgdb_trap(regs); break; + case 5: + do_trap5_error(address, regs); + break; default: break; } diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 7d8c1d6c2f60..6e9a0a9a6a04 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -163,6 +163,9 @@ static void show_ecr_verbose(struct pt_regs *regs) else pr_cont("Bus Error, check PRM\n"); #endif + } else if (vec == ECR_V_TRAP) { + if (regs->ecr_param == 5) + pr_cont("gcc generated __builtin_trap\n"); } else { pr_cont("Check Programmer's Manual\n"); } -- cgit v1.2.3 From fae1a3e775cca8c3a9e0eb34443b310871a15a92 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Dec 2017 00:49:14 +0100 Subject: kvm: x86: fix RSM when PCID is non-zero rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then CR4 & ~PCIDE, then CR0, then CR4. However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier in the long run to replace rsm_enter_protected_mode() with an emulator callback that sets all the special registers (like KVM_SET_SREGS would do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1. Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abe74f779f9d..b514b2b2845a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) } static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr4) + u64 cr0, u64 cr3, u64 cr4) { int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; /* * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, bad = ctxt->ops->set_cr(ctxt, 4, cr4); if (bad) return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + } return X86EMUL_CONTINUE; @@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) struct desc_struct desc; struct desc_ptr dt; u16 selector; - u32 val, cr0, cr4; + u32 val, cr0, cr3, cr4; int i; cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) { struct desc_struct desc; struct desc_ptr dt; - u64 val, cr0, cr4; + u64 val, cr0, cr3, cr4; u32 base3; u16 selector; int i, r; @@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr3 = GET_SMSTATE(u64, smbase, 0x7f50); cr4 = GET_SMSTATE(u64, smbase, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); val = GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr4); + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; -- cgit v1.2.3 From 976a9b35d77a9d297cb03154aa61a6214a213b5e Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 20 Dec 2017 18:17:29 +0100 Subject: ARM: dts: exynos: Enable Mixer node for Exynos5800 Peach Pi machine Commit 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x Mixer nodes") disabled the Mixer node by default in the DTSI and enabled for each Exynos 542x DTS. But unfortunately it missed to enable it for the Exynos5800 Peach Pi machine, since the 5800 is also an 542x SoC variant. Fixes: 1cb686c08d12 ("ARM: dts: exynos: Add status property to Exynos 542x Mixer nodes") Signed-off-by: Javier Martinez Canillas Acked-by: Marek Szyprowski Tested-by: Guillaume Tucker Signed-off-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/exynos5800-peach-pi.dts | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/exynos5800-peach-pi.dts b/arch/arm/boot/dts/exynos5800-peach-pi.dts index b2b95ff205e8..0029ec27819c 100644 --- a/arch/arm/boot/dts/exynos5800-peach-pi.dts +++ b/arch/arm/boot/dts/exynos5800-peach-pi.dts @@ -664,6 +664,10 @@ status = "okay"; }; +&mixer { + status = "okay"; +}; + /* eMMC flash */ &mmc_0 { status = "okay"; -- cgit v1.2.3 From d2271826e58b83f9a75634a3f4334082ecf0a02e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Fri, 15 Dec 2017 16:03:32 +1030 Subject: ARM: dts: aspeed-g4: Correct VUART IRQ number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should have always been 8. Fixes: db4d6d9d80fa ("ARM: dts: aspeed: Correctly order UART nodes") Cc: stable@vger.kernel.org Signed-off-by: Joel Stanley Reviewed-by: Cédric Le Goater Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/aspeed-g4.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/aspeed-g4.dtsi b/arch/arm/boot/dts/aspeed-g4.dtsi index 45d815a86d42..de08d9045cb8 100644 --- a/arch/arm/boot/dts/aspeed-g4.dtsi +++ b/arch/arm/boot/dts/aspeed-g4.dtsi @@ -219,7 +219,7 @@ compatible = "aspeed,ast2400-vuart"; reg = <0x1e787000 0x40>; reg-shift = <2>; - interrupts = <10>; + interrupts = <8>; clocks = <&clk_uart>; no-loopback-test; status = "disabled"; -- cgit v1.2.3 From dc1c4165d189350cb51bdd3057deb6ecd164beda Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 12 Dec 2017 12:02:04 +0000 Subject: KVM: PPC: Book3S: fix XIVE migration of pending interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When restoring a pending interrupt, we are setting the Q bit to force a retrigger in xive_finish_unmask(). But we also need to force an EOI in this case to reach the same initial state : P=1, Q=0. This can be done by not setting 'old_p' for pending interrupts which will inform xive_finish_unmask() that an EOI needs to be sent. Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Suggested-by: Benjamin Herrenschmidt Signed-off-by: Cédric Le Goater Reviewed-by: Laurent Vivier Tested-by: Laurent Vivier Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_xive.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e032..b5e6d227a034 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1558,7 +1558,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* * Restore P and Q. If the interrupt was pending, we - * force both P and Q, which will trigger a resend. + * force Q and !P, which will trigger a resend. * * That means that a guest that had both an interrupt * pending (queued) and Q set will restore with only @@ -1566,7 +1566,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) * is perfectly fine as coalescing interrupts that haven't * been presented yet is always allowed. */ - if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING)) state->old_p = true; if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) state->old_q = true; -- cgit v1.2.3 From 7333b5aca412d6ad02667b5a513485838a91b136 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 12 Dec 2017 18:23:56 +0100 Subject: KVM: PPC: Book3S HV: Fix pending_pri value in kvmppc_xive_get_icp() When we migrate a VM from a POWER8 host (XICS) to a POWER9 host (XICS-on-XIVE), we have an error: qemu-kvm: Unable to restore KVM interrupt controller state \ (0xff000000) for CPU 0: Invalid argument This is because kvmppc_xics_set_icp() checks the new state is internaly consistent, and especially: ... 1129 if (xisr == 0) { 1130 if (pending_pri != 0xff) 1131 return -EINVAL; ... On the other side, kvmppc_xive_get_icp() doesn't set neither the pending_pri value, nor the xisr value (set to 0) (and kvmppc_xive_set_icp() ignores the pending_pri value) As xisr is 0, pending_pri must be set to 0xff. Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Laurent Vivier Acked-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/book3s_xive.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index b5e6d227a034..0d750d274c4e 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) /* Return the per-cpu state for state saving/migration */ return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | - (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT | + (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT; } int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) -- cgit v1.2.3 From 506e8a912661c97b41adc8a286b875d01323ec45 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 21 Dec 2017 22:35:19 +0100 Subject: ARM: dts: ls1021a: fix incorrect clock references dtc warns about two 'clocks' properties that have an extraneous '1' at the end: arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a:clocks[1]) Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2190000/sgtl5000@a arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2190000/sgtl5000@a:clocks[1]) The clocks that get referenced here are fixed-rate, so they do not take any argument, and dtc interprets the next cell as a phandle, which is invalid. Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/ls1021a-qds.dts | 2 +- arch/arm/boot/dts/ls1021a-twr.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/ls1021a-qds.dts b/arch/arm/boot/dts/ls1021a-qds.dts index 940875316d0f..67b4de0e3439 100644 --- a/arch/arm/boot/dts/ls1021a-qds.dts +++ b/arch/arm/boot/dts/ls1021a-qds.dts @@ -215,7 +215,7 @@ reg = <0x2a>; VDDA-supply = <®_3p3v>; VDDIO-supply = <®_3p3v>; - clocks = <&sys_mclk 1>; + clocks = <&sys_mclk>; }; }; }; diff --git a/arch/arm/boot/dts/ls1021a-twr.dts b/arch/arm/boot/dts/ls1021a-twr.dts index a8b148ad1dd2..44715c8ef756 100644 --- a/arch/arm/boot/dts/ls1021a-twr.dts +++ b/arch/arm/boot/dts/ls1021a-twr.dts @@ -187,7 +187,7 @@ reg = <0x0a>; VDDA-supply = <®_3p3v>; VDDIO-supply = <®_3p3v>; - clocks = <&sys_mclk 1>; + clocks = <&sys_mclk>; }; }; -- cgit v1.2.3 From fbd90b4cae105fbd8364fa1ce3f41d0c06296f58 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 21 Dec 2017 22:45:24 +0100 Subject: ARM: dts: tango4: remove bogus interrupt-controller property dtc points out that the parent node of the interrupt controllers is not actually an interrupt controller itself, and lacks an #interrupt-cells property: arch/arm/boot/dts/tango4-vantage-1172.dtb: Warning (interrupts_property): Missing #interrupt-cells in interrupt-parent /soc/interrupt-controller@6e000 This removes the annotation. Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/tango4-common.dtsi | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/tango4-common.dtsi b/arch/arm/boot/dts/tango4-common.dtsi index 0ec1b0a317b4..ff72a8efb73d 100644 --- a/arch/arm/boot/dts/tango4-common.dtsi +++ b/arch/arm/boot/dts/tango4-common.dtsi @@ -156,7 +156,6 @@ reg = <0x6e000 0x400>; ranges = <0 0x6e000 0x400>; interrupt-parent = <&gic>; - interrupt-controller; #address-cells = <1>; #size-cells = <1>; -- cgit v1.2.3 From 87c059e9c39dae20b8b9bd19d9ec55a6d6c10468 Mon Sep 17 00:00:00 2001 From: Bogdan Mirea Date: Thu, 21 Dec 2017 17:18:58 +0200 Subject: arm64: dts: renesas: salvator-x: Remove renesas, no-ether-link property The present change is a bug fix for AVB link iteratively up/down. Steps to reproduce: - start AVB TX stream (Using aplay via MSE), - disconnect+reconnect the eth cable, - after a reconnection the eth connection goes iteratively up/down without user interaction, - this may heal after some seconds or even stay for minutes. As the documentation specifies, the "renesas,no-ether-link" option should be used when a board does not provide a proper AVB_LINK signal. There is no need for this option enabled on RCAR H3/M3 Salvator-X/XS and ULCB starter kits since the AVB_LINK is correctly handled by HW. Choosing to keep or remove the "renesas,no-ether-link" option will have impact on the code flow in the following ways: - keeping this option enabled may lead to unexpected behavior since the RX & TX are enabled/disabled directly from adjust_link function without any HW interrogation, - removing this option, the RX & TX will only be enabled/disabled after HW interrogation. The HW check is made through the LMON pin in PSR register which specifies AVB_LINK signal value (0 - at low level; 1 - at high level). In conclusion, the present change is also a safety improvement because it removes the "renesas,no-ether-link" option leading to a proper way of detecting the link state based on HW interrogation and not on software heuristic. Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB") Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X") Signed-off-by: Bogdan Mirea Signed-off-by: Vladimir Zapolskiy Signed-off-by: Simon Horman --- arch/arm64/boot/dts/renesas/salvator-common.dtsi | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi index a298df74ca6c..dbe2648649db 100644 --- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi +++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi @@ -255,7 +255,6 @@ &avb { pinctrl-0 = <&avb_pins>; pinctrl-names = "default"; - renesas,no-ether-link; phy-handle = <&phy0>; status = "okay"; -- cgit v1.2.3 From 7d2901f809c110bd9a261e879d59efe62e3bc758 Mon Sep 17 00:00:00 2001 From: Bogdan Mirea Date: Thu, 21 Dec 2017 17:18:59 +0200 Subject: arm64: dts: renesas: ulcb: Remove renesas, no-ether-link property The present change is a bug fix for AVB link iteratively up/down. Steps to reproduce: - start AVB TX stream (Using aplay via MSE), - disconnect+reconnect the eth cable, - after a reconnection the eth connection goes iteratively up/down without user interaction, - this may heal after some seconds or even stay for minutes. As the documentation specifies, the "renesas,no-ether-link" option should be used when a board does not provide a proper AVB_LINK signal. There is no need for this option enabled on RCAR H3/M3 Salvator-X/XS and ULCB starter kits since the AVB_LINK is correctly handled by HW. Choosing to keep or remove the "renesas,no-ether-link" option will have impact on the code flow in the following ways: - keeping this option enabled may lead to unexpected behavior since the RX & TX are enabled/disabled directly from adjust_link function without any HW interrogation, - removing this option, the RX & TX will only be enabled/disabled after HW interrogation. The HW check is made through the LMON pin in PSR register which specifies AVB_LINK signal value (0 - at low level; 1 - at high level). In conclusion, the present change is also a safety improvement because it removes the "renesas,no-ether-link" option leading to a proper way of detecting the link state based on HW interrogation and not on software heuristic. Fixes: dc36965a8905 ("arm64: dts: r8a7796: salvator-x: Enable EthernetAVB") Fixes: 6fa501c549aa ("arm64: dts: r8a7795: enable EthernetAVB on Salvator-X") Signed-off-by: Bogdan Mirea Signed-off-by: Vladimir Zapolskiy Signed-off-by: Simon Horman --- arch/arm64/boot/dts/renesas/ulcb.dtsi | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi index 0d85b315ce71..73439cf48659 100644 --- a/arch/arm64/boot/dts/renesas/ulcb.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi @@ -145,7 +145,6 @@ &avb { pinctrl-0 = <&avb_pins>; pinctrl-names = "default"; - renesas,no-ether-link; phy-handle = <&phy0>; status = "okay"; -- cgit v1.2.3 From 32aa144fc32abfcbf7140f473dfbd94c5b9b4105 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 15 Dec 2017 13:14:31 +0100 Subject: KVM: s390: fix cmma migration for multiple memory slots When multiple memory slots are present the cmma migration code does not allocate enough memory for the bitmap. The memory slots are sorted in reverse order, so we must use gfn and size of slot[0] instead of the last one. Signed-off-by: Christian Borntraeger Reviewed-by: Claudio Imbrenda Cc: stable@vger.kernel.org # 4.13+ Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode) Reviewed-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index efa439f6ffb3..abcd24fdde3f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -792,11 +792,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) if (kvm->arch.use_cmma) { /* - * Get the last slot. They should be sorted by base_gfn, so the - * last slot is also the one at the end of the address space. - * We have verified above that at least one slot is present. + * Get the first slot. They are reverse sorted by base_gfn, so + * the first slot is also the one at the end of the address + * space. We have verified above that at least one slot is + * present. */ - ms = slots->memslots + slots->used_slots - 1; + ms = slots->memslots; /* round up so we only use full longs */ ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); /* allocate enough bytes to store all the bits */ -- cgit v1.2.3 From c2cf265d860882b51a200e4a7553c17827f2b730 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 21 Dec 2017 09:18:22 +0100 Subject: KVM: s390: prevent buffer overrun on memory hotplug during migration We must not go beyond the pre-allocated buffer. This can happen when a new memory slot is added during migration. Reported-by: David Hildenbrand Signed-off-by: Christian Borntraeger Cc: stable@vger.kernel.org # 4.13+ Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode) Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 572496c688cc..0714bfa56da0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1006,7 +1006,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { + if (orc && gfn < ms->bitmap_size) { /* increment only if we are really flipping the bit to 1 */ if (!test_and_set_bit(gfn, ms->pgste_bitmap)) atomic64_inc(&ms->dirty_pages); -- cgit v1.2.3 From 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Dec 2017 18:02:34 +0100 Subject: x86/Kconfig: Limit NR_CPUS on 32-bit to a sane amount The recent cpu_entry_area changes fail to compile on 32-bit when BIGSMP=y and NR_CPUS=512, because the fixmap area becomes too big. Limit the number of CPUs with BIGSMP to 64, which is already way to big for 32-bit, but it's at least a working limitation. We performed a quick survey of 32-bit-only machines that might be affected by this change negatively, but found none. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 665eba1b6103..cd5199de231e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -925,7 +925,8 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP range 2 8 if SMP && X86_32 && !X86_BIGSMP - range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK + range 2 64 if SMP && X86_32 && X86_BIGSMP + range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 default "1" if !SMP default "8192" if MAXSMP -- cgit v1.2.3 From c05344947b37f7cda726e802457370bc6eac4d26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 16 Dec 2017 01:14:39 +0100 Subject: x86/mm/dump_pagetables: Check PAGE_PRESENT for real The check for a present page in printk_prot(): if (!pgprot_val(prot)) { /* Not present */ is bogus. If a PTE is set to PAGE_NONE then the pgprot_val is not zero and the entry is decoded in bogus ways, e.g. as RX GLB. That is confusing when analyzing mapping correctness. Check for the present bit to make an informed decision. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..1014cfb21c2c 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -140,7 +140,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) static const char * const level_name[] = { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; - if (!pgprot_val(prot)) { + if (!(pr & _PAGE_PRESENT)) { /* Not present */ pt_dump_cont_printf(m, dmsg, " "); } else { -- cgit v1.2.3 From 146122e24bdf208015d629babba673e28d090709 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Dec 2017 18:07:42 +0100 Subject: x86/mm/dump_pagetables: Make the address hints correct and readable The address hints are a trainwreck. The array entry numbers have to kept magically in sync with the actual hints, which is doomed as some of the array members are initialized at runtime via the entry numbers. Designated initializers have been around before this code was implemented.... Use the entry numbers to populate the address hints array and add the missing bits and pieces. Split 32 and 64 bit for readability sake. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 90 +++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 1014cfb21c2c..fdf09d8f98da 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,10 +44,12 @@ struct addr_marker { unsigned long max_lines; }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 + enum address_markers_idx { USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64 KERNEL_SPACE_NR, LOW_KERNEL_NR, VMALLOC_START_NR, @@ -56,56 +58,70 @@ enum address_markers_idx { KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif -# ifdef CONFIG_X86_ESPFIX64 +#ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI + EFI_END_NR, +#endif HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, -#else + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif +#ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, +#endif + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_END_NR] = { MODULES_END, "End Modules" }, + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { + USER_SPACE_NR = 0, KERNEL_SPACE_NR, VMALLOC_START_NR, VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, -# endif - FIXADDR_START_NR, #endif + FIXADDR_START_NR, + END_OF_SPACE_NR, }; -/* Address space markers hints */ static struct addr_marker address_markers[] = { - { 0, "User Space" }, -#ifdef CONFIG_X86_64 - { 0x8000000000000000UL, "Kernel Space" }, - { 0/* PAGE_OFFSET */, "Low Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN - { KASAN_SHADOW_START, "KASAN shadow" }, - { KASAN_SHADOW_END, "KASAN shadow end" }, + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, +#ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, #endif -# ifdef CONFIG_X86_ESPFIX64 - { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI - { EFI_VA_END, "EFI Runtime Services" }, -# endif - { __START_KERNEL_map, "High Kernel Mapping" }, - { MODULES_VADDR, "Modules" }, - { MODULES_END, "End Modules" }, -#else - { PAGE_OFFSET, "Kernel Mapping" }, - { 0/* VMALLOC_START */, "vmalloc() Area" }, - { 0/*VMALLOC_END*/, "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, -# endif - { 0/*FIXADDR_START*/, "Fixmap Area" }, -#endif - { -1, NULL } /* End of list */ + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } }; +#endif /* !CONFIG_X86_64 */ + /* Multipliers for offsets within the PTEs */ #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) -- cgit v1.2.3 From 49275fef986abfb8b476e4708aaecc07e7d3e087 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 10 Dec 2017 22:47:19 -0800 Subject: x86/vsyscall/64: Explicitly set _PAGE_USER in the pagetable hierarchy The kernel is very erratic as to which pagetables have _PAGE_USER set. The vsyscall page gets lucky: it seems that all of the relevant pagetables are among the apparently arbitrary ones that set _PAGE_USER. Rather than relying on chance, just explicitly set _PAGE_USER. This will let us clean up pagetable setup to stop setting _PAGE_USER. The added code can also be reused by pagetable isolation to manage the _PAGE_USER bit in the usermode tables. [ tglx: Folded paravirt fix from Juergen Gross ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index f279ba2643dc..daad57c76e42 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -37,6 +37,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" @@ -329,16 +330,47 @@ int in_gate_area_no_mm(unsigned long addr) return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } +/* + * The VSYSCALL page is the only user-accessible page in the kernel address + * range. Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. + * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present. If so, we skip calling + * this. + */ +static void __init set_vsyscall_pgtable_user_bits(void) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_k(VSYSCALL_ADDR); + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 + p4d->p4d |= _PAGE_USER; +#endif + pud = pud_offset(p4d, VSYSCALL_ADDR); + set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); + pmd = pmd_offset(pud, VSYSCALL_ADDR); + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} + void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - if (vsyscall_mode != NONE) + if (vsyscall_mode != NONE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); + set_vsyscall_pgtable_user_bits(); + } BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); -- cgit v1.2.3 From 4831b779403a836158917d59a7ca880483c67378 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 10 Dec 2017 22:47:20 -0800 Subject: x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode If something goes wrong with pagetable setup, vsyscall=native will accidentally fall back to emulation. Make it warn and fail so that we notice. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index daad57c76e42..1faf40f2dda9 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -139,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) WARN_ON_ONCE(address != regs->ip); + /* This should be unreachable in NATIVE mode. */ + if (WARN_ON(vsyscall_mode == NATIVE)) + return false; + if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); -- cgit v1.2.3 From c10e83f598d08046dd1ebc8360d4bb12d802d51b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Dec 2017 12:27:29 +0100 Subject: arch, mm: Allow arch_dup_mmap() to fail In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be allowed to fail. Fix up all instances. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: dan.j.williams@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmu_context.h | 5 +++-- arch/um/include/asm/mmu_context.h | 3 ++- arch/unicore32/include/asm/mmu_context.h | 5 +++-- arch/x86/include/asm/mmu_context.h | 4 ++-- include/asm-generic/mm_hooks.h | 5 +++-- kernel/fork.c | 3 +-- 6 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 492d8140a395..44fdf4786638 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -114,9 +114,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index b668e351fd6c..fca34b2177e2 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm); /* * Needed since we do not use the asm-generic/mm_hooks.h: */ -static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { uml_setup_stubs(mm); + return 0; } extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 59b06b48f27d..5c205a9cb5a6 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -81,9 +81,10 @@ do { \ } \ } while (0) -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_unmap(struct mm_struct *mm, diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6d16d15d09a0..c76162439c8a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -176,10 +176,10 @@ do { \ } while (0) #endif -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { paravirt_arch_dup_mmap(oldmm, mm); + return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index ea189d88a3cc..8ac4e68a12f0 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -7,9 +7,10 @@ #ifndef _ASM_GENERIC_MM_HOOKS_H #define _ASM_GENERIC_MM_HOOKS_H -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..500ce64517d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; } /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; + retval = arch_dup_mmap(oldmm, mm); out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); -- cgit v1.2.3 From c2b3496bb30bd159e9de42e5c952e1f1f33c9a77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Dec 2017 12:27:30 +0100 Subject: x86/ldt: Rework locking The LDT is duplicated on fork() and on exec(), which is wrong as exec() should start from a clean state, i.e. without LDT. To fix this the LDT duplication code will be moved into arch_dup_mmap() which is only called for fork(). This introduces a locking problem. arch_dup_mmap() holds mmap_sem of the parent process, but the LDT duplication code needs to acquire mm->context.lock to access the LDT data safely, which is the reverse lock order of write_ldt() where mmap_sem nests into context.lock. Solve this by introducing a new rw semaphore which serializes the read/write_ldt() syscall operations and use context.lock to protect the actual installment of the LDT descriptor. So context.lock stabilizes mm->context.ldt and can nest inside of the new semaphore or mmap_sem. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: dan.j.williams@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu.h | 4 +++- arch/x86/include/asm/mmu_context.h | 2 ++ arch/x86/kernel/ldt.c | 33 +++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9ea26f167497..5ff3e8af2c20 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,6 +3,7 @@ #define _ASM_X86_MMU_H #include +#include #include #include @@ -27,7 +28,8 @@ typedef struct { atomic64_t tlb_gen; #ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; + struct rw_semaphore ldt_usr_sem; + struct ldt_struct *ldt; #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index c76162439c8a..4fdbe5efe535 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -132,6 +132,8 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mutex_init(&mm->context.lock); + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1c1eae961340..1600aebc1ec7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -5,6 +5,11 @@ * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * contex.ldt_usr_sem + * mmap_sem + * context.lock */ #include @@ -42,7 +47,7 @@ static void refresh_ldt_segments(void) #endif } -/* context.lock is held for us, so we don't need any locking. */ +/* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; @@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, - struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { + mutex_lock(&mm->context.lock); + /* Synchronizes with READ_ONCE in load_mm_ldt. */ - smp_store_release(¤t_mm->context.ldt, ldt); + smp_store_release(&mm->context.ldt, ldt); - /* Activate the LDT for all CPUs using current_mm. */ - on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); + /* Activate the LDT for all CPUs using currents mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + + mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt) @@ -133,7 +140,8 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) struct mm_struct *old_mm; int retval = 0; - mutex_init(&mm->context.lock); + init_rwsem(&mm->context.ldt_usr_sem); + old_mm = current->mm; if (!old_mm) { mm->context.ldt = NULL; @@ -180,7 +188,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) unsigned long entries_size; int retval; - mutex_lock(&mm->context.lock); + down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; @@ -209,7 +217,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) retval = bytecount; out_unlock: - mutex_unlock(&mm->context.lock); + up_read(&mm->context.ldt_usr_sem); return retval; } @@ -269,7 +277,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ldt.avl = 0; } - mutex_lock(&mm->context.lock); + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; @@ -291,7 +300,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) error = 0; out_unlock: - mutex_unlock(&mm->context.lock); + up_write(&mm->context.ldt_usr_sem); out: return error; } -- cgit v1.2.3 From a4828f81037f491b2cc986595e3a969a6eeb2fb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Dec 2017 12:27:31 +0100 Subject: x86/ldt: Prevent LDT inheritance on exec The LDT is inherited across fork() or exec(), but that makes no sense at all because exec() is supposed to start the process clean. The reason why this happens is that init_new_context_ldt() is called from init_new_context() which obviously needs to be called for both fork() and exec(). It would be surprising if anything relies on that behaviour, so it seems to be safe to remove that misfeature. Split the context initialization into two parts. Clear the LDT pointer and initialize the mutex from the general context init and move the LDT duplication to arch_dup_mmap() which is only called on fork(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Will Deacon Cc: aliguori@amazon.com Cc: dan.j.williams@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 21 ++++++++++++++------- arch/x86/kernel/ldt.c | 18 +++++------------- tools/testing/selftests/x86/ldt_gdt.c | 9 +++------ 3 files changed, 22 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4fdbe5efe535..5e25423bf9bb 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -57,11 +57,17 @@ struct ldt_struct { /* * Used for LDT copy/destruction. */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +static inline void init_new_context_ldt(struct mm_struct *mm) +{ + mm->context.ldt = NULL; + init_rwsem(&mm->context.ldt_usr_sem); +} +int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context_ldt(struct task_struct *tsk, - struct mm_struct *mm) +static inline void init_new_context_ldt(struct mm_struct *mm) { } +static inline int ldt_dup_context(struct mm_struct *oldmm, + struct mm_struct *mm) { return 0; } @@ -137,15 +143,16 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } - #endif - return init_new_context_ldt(tsk, mm); +#endif + init_new_context_ldt(mm); + return 0; } static inline void destroy_context(struct mm_struct *mm) { @@ -181,7 +188,7 @@ do { \ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { paravirt_arch_dup_mmap(oldmm, mm); - return 0; + return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1600aebc1ec7..a6b5d62f45a7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -131,28 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) } /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed. */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; - struct mm_struct *old_mm; int retval = 0; - init_rwsem(&mm->context.ldt_usr_sem); - - old_mm = current->mm; - if (!old_mm) { - mm->context.ldt = NULL; + if (!old_mm) return 0; - } mutex_lock(&old_mm->context.lock); - if (!old_mm->context.ldt) { - mm->context.ldt = NULL; + if (!old_mm->context.ldt) goto out_unlock; - } new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 66e5ce5b91f0..0304ffb714f2 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -627,13 +627,10 @@ static void do_multicpu_tests(void) static int finish_exec_test(void) { /* - * In a sensible world, this would be check_invalid_segment(0, 1); - * For better or for worse, though, the LDT is inherited across exec. - * We can probably change this safely, but for now we test it. + * Older kernel versions did inherit the LDT on exec() which is + * wrong because exec() starts from a clean state. */ - check_valid_segment(0, 1, - AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, - 42, true); + check_invalid_segment(0, 1); return nerrs ? 1 : 0; } -- cgit v1.2.3 From 4fe2d8b11a370af286287a2661de9d4e6c9a145a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 17:25:07 -0800 Subject: x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack If the kernel oopses while on the trampoline stack, it will print "" even if SYSENTER is not involved. That is rather confusing. The "SYSENTER" stack is used for a lot more than SYSENTER now. Give it a better string to display in stack dumps, and rename the kernel code to match. Also move the 32-bit code over to the new naming even though it still uses the entry stack only for SYSENTER. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 12 ++++++------ arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/fixmap.h | 8 ++++---- arch/x86/include/asm/processor.h | 6 +++--- arch/x86/include/asm/stacktrace.h | 4 ++-- arch/x86/kernel/asm-offsets.c | 4 ++-- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/cpu/common.c | 14 +++++++------- arch/x86/kernel/dumpstack.c | 10 +++++----- arch/x86/kernel/dumpstack_32.c | 6 +++--- arch/x86/kernel/dumpstack_64.c | 12 +++++++++--- 11 files changed, 44 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bd8b57a5c874..ace8f321a5a1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,9 +942,9 @@ ENTRY(debug) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Ldebug_from_sysenter_stack TRACE_IRQS_OFF @@ -986,9 +986,9 @@ ENTRY(nmi) /* Are we currently on the SYSENTER stack? */ movl PER_CPU_VAR(cpu_entry_area), %ecx - addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2812ce043a7a..87cebe78bbef 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -154,8 +154,8 @@ END(native_usergs_sysret64) _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ - SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA ENTRY(entry_SYSCALL_64_trampoline) UNWIND_HINT_EMPTY diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 94fc4fa14127..8153b8d86a3c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -56,10 +56,10 @@ struct cpu_entry_area { char gdt[PAGE_SIZE]; /* - * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as + * The GDT is just below entry_stack and thus serves (on x86_64) as * a a read-only guard page. */ - struct SYSENTER_stack_page SYSENTER_stack_page; + struct entry_stack_page entry_stack_page; /* * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because @@ -250,9 +250,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); } -static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) +static inline struct entry_stack *cpu_entry_stack(int cpu) { - return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index da943411d3d8..9e482d8b0b97 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -336,12 +336,12 @@ struct x86_hw_tss { #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 -struct SYSENTER_stack { +struct entry_stack { unsigned long words[64]; }; -struct SYSENTER_stack_page { - struct SYSENTER_stack stack; +struct entry_stack_page { + struct entry_stack stack; } __aligned(PAGE_SIZE); struct tss_struct { diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f8062bfd43a0..f73706878772 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,7 +16,7 @@ enum stack_type { STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, - STACK_TYPE_SYSENTER, + STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -29,7 +29,7 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); +bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cd360a5e0dca..676b7cf4b62b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -97,6 +97,6 @@ void common(void) { /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); - OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); - DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 7d20d9c0b3d6..fa1261eefa16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -48,7 +48,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - - offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 034900623adf..ed4acbce37a8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif -static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, - SYSENTER_stack_storage); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, + entry_stack_storage); static void __init set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) @@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu) #endif __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), - per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), + per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); /* @@ -1323,7 +1323,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1440,7 +1440,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1655,7 +1655,7 @@ void cpu_init(void) */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); - load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index bbd6d986e2d0..1dd3f533d78c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +bool in_entry_stack(unsigned long *stack, struct stack_info *info) { - struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); void *begin = ss; void *end = ss + 1; @@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) if ((void *)stack < begin || (void *)stack >= end) return false; - info->type = STACK_TYPE_SYSENTER; + info->type = STACK_TYPE_ENTRY; info->begin = begin; info->end = end; info->next_sp = NULL; @@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) - * - SYSENTER stack + * - entry stack * * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack - * - SYSENTER stack + * - entry stack */ for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5ff13a6b3680..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; - if (type == STACK_TYPE_SYSENTER) - return "SYSENTER"; + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; return NULL; } @@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; - if (in_sysenter_stack(stack, info)) + if (in_entry_stack(stack, info)) goto recursion_check; if (in_hardirq_stack(stack, info)) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index abc828f8c297..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; - if (type == STACK_TYPE_SYSENTER) - return "SYSENTER"; + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; - if (in_sysenter_stack(stack, info)) + if (in_entry_stack(stack, info)) goto recursion_check; goto unknown; -- cgit v1.2.3 From 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:50 +0100 Subject: x86/uv: Use the right TLB-flush API Since uv_flush_tlb_others() implements flush_tlb_others() which is about flushing user mappings, we should use __flush_tlb_single(), which too is about flushing user mappings. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Andrew Banman Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index f44c0bc95aa2..8538a6723171 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, local_flush_tlb(); stat->d_alltlb++; } else { - __flush_tlb_one(msg->address); + __flush_tlb_single(msg->address); stat->d_onetlb++; } stat->d_requestee++; -- cgit v1.2.3 From 23cb7d46f371844c004784ad9552a57446f73e5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:51 +0100 Subject: x86/microcode: Dont abuse the TLB-flush interface Commit: ec400ddeff20 ("x86/microcode_intel_early.c: Early update ucode on Intel's CPU") ... grubbed into tlbflush internals without coherent explanation. Since it says its a precaution and the SDM doesn't mention anything like this, take it out back. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: fenghua.yu@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 19 ++++++------------- arch/x86/kernel/cpu/microcode/intel.c | 13 ------------- 2 files changed, 6 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 509046cfa5ce..c2e45da4e540 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -246,20 +246,9 @@ static inline void __native_flush_tlb(void) preempt_enable(); } -static inline void __native_flush_tlb_global_irq_disabled(void) -{ - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); -} - static inline void __native_flush_tlb_global(void) { - unsigned long flags; + unsigned long cr4, flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* @@ -277,7 +266,11 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); raw_local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7dbcb7adf797..8ccdca6d3f9e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci) } #else -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc; @@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (rev != mc->hdr.rev) return -1; -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif uci->cpu_sig.rev = rev; if (early) -- cgit v1.2.3 From a501686b2923ce6f2ff2b1d0d50682c6411baf72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:49 +0100 Subject: x86/mm: Use __flush_tlb_one() for kernel memory __flush_tlb_single() is for user mappings, __flush_tlb_one() for kernel mappings. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3118392cdf75..0569987f6da6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_single(addr); + __flush_tlb_one(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) -- cgit v1.2.3 From b5fc6d943808b570bdfbec80f40c6b3855f1c48b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:46 +0100 Subject: x86/mm: Remove superfluous barriers atomic64_inc_return() already implies smp_mb() before and after. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c2e45da4e540..3e2227386abe 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -60,19 +60,13 @@ static inline void invpcid_flush_all_nonglobals(void) static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { - u64 new_tlb_gen; - /* * Bump the generation count. This also serves as a full barrier * that synchronizes with switch_mm(): callers are required to order * their read of mm_cpumask after their writes to the paging * structures. */ - smp_mb__before_atomic(); - new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); - smp_mb__after_atomic(); - - return new_tlb_gen; + return atomic64_inc_return(&mm->context.tlb_gen); } #ifdef CONFIG_PARAVIRT -- cgit v1.2.3 From 3f67af51e56f291d7417d77c4f67cd774633c5e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:52 +0100 Subject: x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what Per popular request.. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 3e2227386abe..552d581c8f9f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -228,6 +228,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) extern void initialize_tlbstate_and_flush(void); +/* + * flush the entire current user mapping + */ static inline void __native_flush_tlb(void) { /* @@ -240,6 +243,9 @@ static inline void __native_flush_tlb(void) preempt_enable(); } +/* + * flush everything + */ static inline void __native_flush_tlb_global(void) { unsigned long cr4, flags; @@ -269,17 +275,27 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_restore(flags); } +/* + * flush one page in the user mapping + */ static inline void __native_flush_tlb_single(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } +/* + * flush everything + */ static inline void __flush_tlb_all(void) { - if (boot_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) { __flush_tlb_global(); - else + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ __flush_tlb(); + } /* * Note: if we somehow had PCID but not PGE, then this wouldn't work -- @@ -290,6 +306,9 @@ static inline void __flush_tlb_all(void) */ } +/* + * flush one page in the kernel mapping + */ static inline void __flush_tlb_one(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); -- cgit v1.2.3 From 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:54 +0100 Subject: x86/mm: Move the CR3 construction functions to tlbflush.h For flushing the TLB, the ASID which has been programmed into the hardware must be known. That differs from what is in 'cpu_tlbstate'. Add functions to transform the 'cpu_tlbstate' values into to the one programmed into the hardware (CR3). It's not easy to include mmu_context.h into tlbflush.h, so just move the CR3 building over to tlbflush.h. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 29 +---------------------------- arch/x86/include/asm/tlbflush.h | 26 ++++++++++++++++++++++++++ arch/x86/mm/tlb.c | 8 ++++---- 3 files changed, 31 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5e25423bf9bb..5ede7cae1d67 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -290,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -/* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID - * bits. This serves two purposes. It prevents a nasty situation in - * which PCID-unaware code saves CR3, loads some other value (with PCID - * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if - * the saved ASID was nonzero. It also means that any bugs involving - * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger - * deterministically. - */ - -static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) -{ - if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > 4094); - return __sme_pa(mm->pgd) | (asid + 1); - } else { - VM_WARN_ON_ONCE(asid != 0); - return __sme_pa(mm->pgd); - } -} - -static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) -{ - VM_WARN_ON_ONCE(asid > 4094); - return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; -} - /* * This can be used from process context to figure out what the value of * CR3 is without needing to do a (slow) __read_cr3(). @@ -326,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) */ static inline unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* For now, be very restrictive about when this can be called. */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 552d581c8f9f..ee7925adfb57 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -69,6 +69,32 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } +/* + * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. + * This serves two purposes. It prevents a nasty situation in which + * PCID-unaware code saves CR3, loads some other value (with PCID == 0), + * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved + * ASID was nonzero. It also means that any bugs involving loading a + * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. + */ +struct pgd_t; +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +{ + if (static_cpu_has(X86_FEATURE_PCID)) { + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(pgd) | (asid + 1); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(pgd); + } +} + +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +{ + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0569987f6da6..0a1be3adc97e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * isn't free. */ #ifdef CONFIG_DEBUG_VM - if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { /* * If we were to BUG here, we'd be very likely to kill * the system so hard that we don't see the call trace. @@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next, new_asid)); + write_cr3(build_cr3(next->pgd, new_asid)); /* * NB: This gets called via leave_mm() in the idle path @@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next, new_asid)); + write_cr3(build_cr3_noflush(next->pgd, new_asid)); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); @@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void) !(cr4_read_shadow() & X86_CR4_PCIDE)); /* Force ASID 0 and force a TLB flush. */ - write_cr3(build_cr3(mm, 0)); + write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); -- cgit v1.2.3 From cb0a9144a744e55207e24dcef812f05cd15a499a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:55 +0100 Subject: x86/mm: Remove hard-coded ASID limit checks First, it's nice to remove the magic numbers. Second, PAGE_TABLE_ISOLATION is going to consume half of the available ASID space. The space is currently unused, but add a comment to spell out this new restriction. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index ee7925adfb57..f88ccd3ae466 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -69,6 +69,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#define PTI_CONSUMED_ASID_BITS 0 + +#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because ASID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) + /* * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. * This serves two purposes. It prevents a nasty situation in which @@ -81,7 +97,7 @@ struct pgd_t; static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) { if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > 4094); + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); return __sme_pa(pgd) | (asid + 1); } else { VM_WARN_ON_ONCE(asid != 0); @@ -91,7 +107,7 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) { - VM_WARN_ON_ONCE(asid > 4094); + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; } -- cgit v1.2.3 From dd95f1a4b5ca904c78e6a097091eb21436478abb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:56 +0100 Subject: x86/mm: Put MMU to hardware ASID translation in one place There are effectively two ASID types: 1. The one stored in the mmu_context that goes from 0..5 2. The one programmed into the hardware that goes from 1..6 This consolidates the locations where converting between the two (by doing a +1) to a single place which gives us a nice place to comment. PAGE_TABLE_ISOLATION will also need to, given an ASID, know which hardware ASID to flush for the userspace mapping. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index f88ccd3ae466..8b27daff7a7f 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -85,20 +85,26 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) */ #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) -/* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. - * This serves two purposes. It prevents a nasty situation in which - * PCID-unaware code saves CR3, loads some other value (with PCID == 0), - * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved - * ASID was nonzero. It also means that any bugs involving loading a - * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. - */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + /* + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; +} + struct pgd_t; static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) { if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - return __sme_pa(pgd) | (asid + 1); + return __sme_pa(pgd) | kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); return __sme_pa(pgd); @@ -108,7 +114,8 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; + VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } #ifdef CONFIG_PARAVIRT -- cgit v1.2.3 From 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:47 +0100 Subject: x86/mm: Create asm/invpcid.h Unclutter tlbflush.h a little. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/invpcid.h | 53 +++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/tlbflush.h | 49 +------------------------------------ 2 files changed, 54 insertions(+), 48 deletions(-) create mode 100644 arch/x86/include/asm/invpcid.h (limited to 'arch') diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h new file mode 100644 index 000000000000..989cfa86de85 --- /dev/null +++ b/arch/x86/include/asm/invpcid.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVPCID +#define _ASM_X86_INVPCID + +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + struct { u64 d[2]; } desc = { { pcid, addr } }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + +#endif /* _ASM_X86_INVPCID */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8b27daff7a7f..171b429f43a2 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -9,54 +9,7 @@ #include #include #include - -static inline void __invpcid(unsigned long pcid, unsigned long addr, - unsigned long type) -{ - struct { u64 d[2]; } desc = { { pcid, addr } }; - - /* - * The memory clobber is because the whole point is to invalidate - * stale TLB entries and, especially if we're flushing global - * mappings, we don't want the compiler to reorder any subsequent - * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. - */ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); -} - -#define INVPCID_TYPE_INDIV_ADDR 0 -#define INVPCID_TYPE_SINGLE_CTXT 1 -#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 -#define INVPCID_TYPE_ALL_NON_GLOBAL 3 - -/* Flush all mappings for a given pcid and addr, not including globals. */ -static inline void invpcid_flush_one(unsigned long pcid, - unsigned long addr) -{ - __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); -} - -/* Flush all mappings for a given PCID, not including globals. */ -static inline void invpcid_flush_single_context(unsigned long pcid) -{ - __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); -} - -/* Flush all mappings, including globals, for all PCIDs. */ -static inline void invpcid_flush_all(void) -{ - __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); -} - -/* Flush all mappings for all PCIDs except globals. */ -static inline void invpcid_flush_all_nonglobals(void) -{ - __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); -} +#include static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { -- cgit v1.2.3 From ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Dec 2017 18:28:54 +0100 Subject: x86/cpu_entry_area: Move it to a separate unit Separate the cpu_entry_area code out of cpu/common.c and the fixmap. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++ arch/x86/include/asm/fixmap.h | 41 +------------- arch/x86/kernel/cpu/common.c | 94 ------------------------------ arch/x86/kernel/traps.c | 1 + arch/x86/mm/Makefile | 2 +- arch/x86/mm/cpu_entry_area.c | 104 ++++++++++++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 135 deletions(-) create mode 100644 arch/x86/include/asm/cpu_entry_area.h create mode 100644 arch/x86/mm/cpu_entry_area.c (limited to 'arch') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h new file mode 100644 index 000000000000..5471826803af --- /dev/null +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ASM_X86_CPU_ENTRY_AREA_H +#define _ASM_X86_CPU_ENTRY_AREA_H + +#include +#include + +/* + * cpu_entry_area is a percpu region that contains things needed by the CPU + * and early entry/exit code. Real types aren't used for all fields here + * to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* + * The GDT is just below entry_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ + struct entry_stack_page entry_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. + */ + struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 + /* + * Exception stacks used for IST entries. + * + * In the future, this should have a separate slot for each stack + * with guard pages between them. + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif +}; + +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) + +DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + +extern void setup_cpu_entry_areas(void); + +#endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8153b8d86a3c..fb801662a230 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -25,6 +25,7 @@ #else #include #endif +#include /* * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall @@ -44,46 +45,6 @@ extern unsigned long __FIXADDR_TOP; PAGE_SIZE) #endif -/* - * cpu_entry_area is a percpu region in the fixmap that contains things - * needed by the CPU and early entry/exit code. Real types aren't used - * for all fields here to avoid circular header dependencies. - * - * Every field is a virtual alias of some other allocated backing store. - * There is no direct allocation of a struct cpu_entry_area. - */ -struct cpu_entry_area { - char gdt[PAGE_SIZE]; - - /* - * The GDT is just below entry_stack and thus serves (on x86_64) as - * a a read-only guard page. - */ - struct entry_stack_page entry_stack_page; - - /* - * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because - * we need task switches to work, and task switches write to the TSS. - */ - struct tss_struct tss; - - char entry_trampoline[PAGE_SIZE]; - -#ifdef CONFIG_X86_64 - /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. - */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; -#endif -}; - -#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) - -extern void setup_cpu_entry_areas(void); - /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ed4acbce37a8..8ddcfa4d4165 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, [DEBUG_STACK - 1] = DEBUG_STKSZ }; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -#endif - -static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, - entry_stack_storage); - -static void __init -set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -{ - for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) - __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); -} - -/* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) -{ -#ifdef CONFIG_X86_64 - extern char _entry_trampoline[]; - - /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ - pgprot_t gdt_prot = PAGE_KERNEL_RO; - pgprot_t tss_prot = PAGE_KERNEL_RO; -#else - /* - * On native 32-bit systems, the GDT cannot be read-only because - * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the - * GDT is read-only, that will triple fault. The TSS cannot be - * read-only because the CPU writes to it on task switches. - * - * On Xen PV, the GDT must be read-only because the hypervisor - * requires it. - */ - pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; - pgprot_t tss_prot = PAGE_KERNEL; -#endif - - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), - per_cpu_ptr(&entry_stack_storage, cpu), 1, - PAGE_KERNEL); - - /* - * The Intel SDM says (Volume 3, 7.2.1): - * - * Avoid placing a page boundary in the part of the TSS that the - * processor reads during a task switch (the first 104 bytes). The - * processor may not correctly perform address translations if a - * boundary occurs in this area. During a task switch, the processor - * reads and writes into the first 104 bytes of each TSS (using - * contiguous physical addresses beginning with the physical address - * of the first byte of the TSS). So, after TSS access begins, if - * part of the 104 bytes is not physically contiguous, the processor - * will access incorrect information without generating a page-fault - * exception. - * - * There are also a lot of errata involving the TSS spanning a page - * boundary. Assert that we're not doing that. - */ - BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ - offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); - BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), - &per_cpu(cpu_tss_rw, cpu), - sizeof(struct tss_struct) / PAGE_SIZE, - tss_prot); - -#ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), - &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, - PAGE_KERNEL); - - __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); -#endif -} - -void __init setup_cpu_entry_areas(void) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) - setup_cpu_entry_area(cpu); -} - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 74136fd16f49..464daed6894f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7ba7f3d7f477..2e0017af8f9b 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..235ff9cfaaf4 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include +#include +#include +#include + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +static void __init +set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) + __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. + * + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; +#endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, + tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, + PAGE_KERNEL); + + __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); +} -- cgit v1.2.3 From 92a0f81d89571e3e8759366e050ee05cc545ef99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Dec 2017 18:51:31 +0100 Subject: x86/cpu_entry_area: Move it out of the fixmap Put the cpu_entry_area into a separate P4D entry. The fixmap gets too big and 0-day already hit a case where the fixmap PTEs were cleared by cleanup_highmap(). Aside of that the fixmap API is a pain as it's all backwards. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 2 + arch/x86/include/asm/cpu_entry_area.h | 18 ++++++++- arch/x86/include/asm/desc.h | 1 + arch/x86/include/asm/fixmap.h | 32 +--------------- arch/x86/include/asm/pgtable_32_types.h | 15 ++++++-- arch/x86/include/asm/pgtable_64_types.h | 47 +++++++++++++---------- arch/x86/kernel/dumpstack.c | 1 + arch/x86/kernel/traps.c | 5 ++- arch/x86/mm/cpu_entry_area.c | 66 +++++++++++++++++++++++++-------- arch/x86/mm/dump_pagetables.c | 6 ++- arch/x86/mm/init_32.c | 6 +++ arch/x86/mm/kasan_init_64.c | 29 ++++++++------- arch/x86/mm/pgtable_32.c | 1 + arch/x86/xen/mmu_pv.c | 2 - 14 files changed, 143 insertions(+), 88 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 63a41671d25b..51101708a03a 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... +fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space @@ -35,6 +36,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ... unused hole ... +fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 5471826803af..2fbc69a0916e 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -43,10 +43,26 @@ struct cpu_entry_area { }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) +#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); extern void setup_cpu_entry_areas(void); +extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); + +#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE +#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) + +#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) + +#define CPU_ENTRY_AREA_MAP_SIZE \ + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + +extern struct cpu_entry_area *get_cpu_entry_area(int cpu); + +static inline struct entry_stack *cpu_entry_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +} #endif diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 2ace1f90d138..bc359dd2f7f6 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index fb801662a230..64c4a30e0d39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -25,7 +25,6 @@ #else #include #endif -#include /* * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall @@ -84,7 +83,6 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif - FIX_RO_IDT, /* Virtual mapping for read-only IDT */ #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -100,9 +98,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif - /* Fixmap entries to remap the GDTs, one per processor. */ - FIX_CPU_ENTRY_AREA_TOP, - FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, #ifdef CONFIG_ACPI_APEI_GHES /* Used for GHES mapping from assorted contexts */ @@ -143,7 +138,7 @@ enum fixed_addresses { extern void reserve_top_address(unsigned long reserve); #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) extern int fixmaps_set; @@ -191,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, void __early_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); -static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) -{ - BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); - - return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; -} - -#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ - BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ - __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ - }) - -#define get_cpu_entry_area_index(cpu, field) \ - __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) - -static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) -{ - return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); -} - -static inline struct entry_stack *cpu_entry_stack(int cpu) -{ - return &get_cpu_entry_area(cpu)->entry_stack_page.stack; -} - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index f2ca9b28fd68..ce245b0cdfca 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ - & PMD_MASK) +/* + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c + * to avoid include recursion hell + */ +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) + +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) + +#define PKMAP_BASE \ + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) +# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6d5f45dcd4a1..3d27831bc58d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) + #ifdef CONFIG_X86_5LEVEL -#define VMALLOC_SIZE_TB _AC(16384, UL) -#define __VMALLOC_BASE _AC(0xff92000000000000, UL) -#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define VMALLOC_SIZE_TB _AC(16384, UL) +# define __VMALLOC_BASE _AC(0xff92000000000000, UL) +# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) #else -#define VMALLOC_SIZE_TB _AC(32, UL) -#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define VMALLOC_SIZE_TB _AC(32, UL) +# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) #endif + #ifdef CONFIG_RANDOMIZE_MEMORY -#define VMALLOC_START vmalloc_base -#define VMEMMAP_START vmemmap_base +# define VMALLOC_START vmalloc_base +# define VMEMMAP_START vmemmap_base #else -#define VMALLOC_START __VMALLOC_BASE -#define VMEMMAP_START __VMEMMAP_BASE +# define VMALLOC_START __VMALLOC_BASE +# define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ -#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + +#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) + +#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) -#define MODULES_LEN (MODULES_END - MODULES_VADDR) -#define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) -#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) +#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) + +#define CPU_ENTRY_AREA_PGD _AC(-3, UL) +#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) + +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1dd3f533d78c..36b17e0febe8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 464daed6894f..7c16fe0b60c2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -951,8 +951,9 @@ void __init trap_init(void) * "sidt" instruction will not leak the location of the kernel, and * to defend the IDT against arbitrary memory write vulnerabilities. * It will be reloaded in cpu_init() */ - __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); - idt_descr.address = fix_to_virt(FIX_RO_IDT); + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; /* * Should be a barrier for any external CPU state: diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 235ff9cfaaf4..21e8b595cbb1 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -15,11 +15,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); #endif +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ + unsigned long va = (unsigned long) cea_vaddr; + + set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + static void __init -set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) { - for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) - __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } /* Setup the fixmap mappings only once per-processor */ @@ -47,10 +63,12 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), - per_cpu_ptr(&entry_stack_storage, cpu), 1, - PAGE_KERNEL); + cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), + gdt_prot); + + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); /* * The Intel SDM says (Volume 3, 7.2.1): @@ -72,10 +90,9 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), - &per_cpu(cpu_tss_rw, cpu), - sizeof(struct tss_struct) / PAGE_SIZE, - tss_prot); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); @@ -85,20 +102,37 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), - &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, - PAGE_KERNEL); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); - __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); #endif } +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 + unsigned long start, end; + + BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + + for (; start < end; start += PMD_SIZE) + populate_extra_pte(start); +#endif +} + void __init setup_cpu_entry_areas(void) { unsigned int cpu; + setup_cpu_entry_area_ptes(); + for_each_possible_cpu(cpu) setup_cpu_entry_area(cpu); } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index fdf09d8f98da..43dedbfb7257 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -58,6 +58,7 @@ enum address_markers_idx { KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif + CPU_ENTRY_AREA_NR, #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif @@ -81,6 +82,7 @@ static struct addr_marker address_markers[] = { [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, #endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, #endif @@ -104,6 +106,7 @@ enum address_markers_idx { #ifdef CONFIG_HIGHMEM PKMAP_BASE_NR, #endif + CPU_ENTRY_AREA_NR, FIXADDR_START_NR, END_OF_SPACE_NR, }; @@ -116,6 +119,7 @@ static struct addr_marker address_markers[] = { #ifdef CONFIG_HIGHMEM [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, #endif + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, [END_OF_SPACE_NR] = { -1, NULL } }; @@ -541,8 +545,8 @@ static int __init pt_dump_init(void) address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; #endif - return 0; } __initcall(pt_dump_init); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void) mem_init_print_info(NULL); printk(KERN_INFO "virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif @@ -777,6 +779,10 @@ void __init mem_init(void) FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, + CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, + CPU_ENTRY_AREA_MAP_SIZE >> 10, + #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, (LAST_PKMAP*PAGE_SIZE) >> 10, diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9ec70d780f1f..47388f0c0e59 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -15,6 +15,7 @@ #include #include #include +#include extern struct range pfn_mapped[E820_MAX_ENTRIES]; @@ -322,31 +323,33 @@ void __init kasan_init(void) map_range(&pfn_mapped[i]); } - kasan_populate_zero_shadow( - kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); - - kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - early_pfn_to_nid(__pa(_stext))); - - shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); + shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, PAGE_SIZE); - shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); + shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, PAGE_SIZE); - kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), - shadow_cpu_entry_begin); + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + shadow_cpu_entry_begin); kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, (unsigned long)shadow_cpu_entry_end, 0); - kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); + kasan_populate_zero_shadow(shadow_cpu_entry_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); + + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); load_cr3(init_top_pgt); __flush_tlb_all(); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6b9bf023a700..c3c5274410a9 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index c2454237fa67..a0e2b8c6e5c7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) switch (idx) { case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: - case FIX_RO_IDT: #ifdef CONFIG_X86_32 case FIX_WP_TEST: # ifdef CONFIG_HIGHMEM @@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: - case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: /* All local page mappings */ pte = pfn_pte(phys, prot); break; -- cgit v1.2.3 From 613e396bc0d4c7604fba23256644e78454c68cf6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Dec 2017 10:56:29 +0100 Subject: init: Invoke init_espfix_bsp() from mm_init() init_espfix_bsp() needs to be invoked before the page table isolation initialization. Move it into mm_init() which is the place where pti_init() will be added. While at it get rid of the #ifdeffery and provide proper stub functions. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/espfix.h | 7 ++++--- arch/x86/kernel/smpboot.c | 6 +----- include/asm-generic/pgtable.h | 5 +++++ init/main.c | 6 ++---- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 0211029076ea..6777480d8a42 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64 #include @@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); extern void init_espfix_ap(int cpu); - -#endif /* CONFIG_X86_64 */ +#else +static inline void init_espfix_ap(int cpu) { } +#endif #endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d56c1d209283..33d6000265aa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -990,12 +990,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; - /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 + /* Enable the espfix hack for this CPU */ init_espfix_ap(cpu); -#endif /* So we see what's up */ announce_cpu(cpu, apicid); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 757dc6ffc7ba..231b35a76dd9 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1017,6 +1017,11 @@ static inline int pmd_clear_huge(pmd_t *pmd) struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); + +#ifndef CONFIG_X86_ESPFIX64 +static inline void init_espfix_bsp(void) { } +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/init/main.c b/init/main.c index 0ee9c6866ada..8a390f60ec81 100644 --- a/init/main.c +++ b/init/main.c @@ -504,6 +504,8 @@ static void __init mm_init(void) pgtable_init(); vmalloc_init(); ioremap_huge_init(); + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); } asmlinkage __visible void __init start_kernel(void) @@ -673,10 +675,6 @@ asmlinkage __visible void __init start_kernel(void) #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); -#endif -#ifdef CONFIG_X86_ESPFIX64 - /* Should be run before the first non-init thread is created */ - init_espfix_bsp(); #endif thread_stack_cache_init(); cred_init(); -- cgit v1.2.3 From f6c4fd506cb626e4346aa81688f255e593a7c5a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Dec 2017 19:45:11 +0100 Subject: x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit The loop which populates the CPU entry area PMDs can wrap around on 32bit machines when the number of CPUs is small. It worked wonderful for NR_CPUS=64 for whatever reason and the moron who wrote that code did not bother to test it with !SMP. Check for the wraparound to fix it. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Reported-by: kernel test robot Signed-off-by: Thomas "Feels stupid" Gleixner Tested-by: Borislav Petkov --- arch/x86/mm/cpu_entry_area.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 21e8b595cbb1..fe814fd5e014 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -122,7 +122,8 @@ static __init void setup_cpu_entry_area_ptes(void) start = CPU_ENTRY_AREA_BASE; end = start + CPU_ENTRY_AREA_MAP_SIZE; - for (; start < end; start += PMD_SIZE) + /* Careful here: start + PMD_SIZE might wrap around */ + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) populate_extra_pte(start); #endif } -- cgit v1.2.3 From a89f040fa34ec9cd682aed98b8f04e3c47d998bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:33 +0100 Subject: x86/cpufeatures: Add X86_BUG_CPU_INSECURE Many x86 CPUs leak information to user space due to missing isolation of user space and kernel space page tables. There are many well documented ways to exploit that. The upcoming software migitation of isolating the user and kernel space page tables needs a misfeature flag so code can be made runtime conditional. Add the BUG bits which indicates that the CPU is affected and add a feature bit which indicates that the software migitation is enabled. Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be made later. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 3 ++- arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/kernel/cpu/common.c | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 800104c8a3ed..d8ec834ea884 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,7 +201,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ - +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ @@ -340,5 +340,6 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c10c9128f54e..e428e16dd822 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,6 +44,12 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define DISABLE_PTI 0 +#else +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -54,7 +60,7 @@ #define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 -#define DISABLED_MASK7 0 +#define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_MPX) #define DISABLED_MASK10 0 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8ddcfa4d4165..a9210f9b7cf8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + /* Assume for now that ALL x86 CPUs are insecure */ + setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From c313ec66317d421fb5768d78c56abed2dc862264 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:34 +0100 Subject: x86/mm/pti: Disable global pages if PAGE_TABLE_ISOLATION=y Global pages stay in the TLB across context switches. Since all contexts share the same kernel mapping, these mappings are marked as global pages so kernel entries in the TLB are not flushed out on a context switch. But, even having these entries in the TLB opens up something that an attacker can use, such as the double-page-fault attack: http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf That means that even when PAGE_TABLE_ISOLATION switches page tables on return to user space the global pages would stay in the TLB cache. Disable global pages so that kernel TLB entries can be flushed before returning to user space. This way, all accesses to kernel addresses from userspace result in a TLB miss independent of the existence of a kernel mapping. Suppress global pages via the __supported_pte_mask. The user space mappings set PAGE_GLOBAL for the minimal kernel mappings which are required for entry/exit. These mappings are set up manually so the filtering does not take place. [ The __supported_pte_mask simplification was written by Thomas Gleixner. ] Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..020223420308 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -161,6 +161,12 @@ struct map_range { static int page_size_mask; +static void enable_global_pages(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + __supported_pte_mask |= _PAGE_GLOBAL; +} + static void __init probe_page_size_mask(void) { /* @@ -179,11 +185,11 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } else - __supported_pte_mask &= ~_PAGE_GLOBAL; + enable_global_pages(); + } /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { -- cgit v1.2.3 From 8a09317b895f073977346779df52f67c1056d81d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:35 +0100 Subject: x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching PAGE_TABLE_ISOLATION needs to switch to a different CR3 value when it enters the kernel and switch back when it exits. This essentially needs to be done before leaving assembly code. This is extra challenging because the switching context is tricky: the registers that can be clobbered can vary. It is also hard to store things on the stack because there is an established ABI (ptregs) or the stack is entirely unsafe to use. Establish a set of macros that allow changing to the user and kernel CR3 values. Interactions with SWAPGS: Previous versions of the PAGE_TABLE_ISOLATION code relied on having per-CPU scratch space to save/restore a register that can be used for the CR3 MOV. The %GS register is used to index into our per-CPU space, so SWAPGS *had* to be done before the CR3 switch. That scratch space is gone now, but the semantic that SWAPGS must be done before the CR3 MOV is retained. This is good to keep because it is not that hard to do and it allows to do things like add per-CPU debugging information. What this does in the NMI code is worth pointing out. NMIs can interrupt *any* context and they can also be nested with NMIs interrupting other NMIs. The comments below ".Lnmi_from_kernel" explain the format of the stack during this situation. Changing the format of this stack is hard. Instead of storing the old CR3 value on the stack, this depends on the *regular* register save/restore mechanism and then uses %r14 to keep CR3 during the NMI. It is callee-saved and will not be clobbered by the C NMI handlers that get called. [ PeterZ: ESPFIX optimization ] Based-on-code-from: Andy Lutomirski Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 66 ++++++++++++++++++++++++++++++++++++++++ arch/x86/entry/entry_64.S | 45 +++++++++++++++++++++++---- arch/x86/entry/entry_64_compat.S | 24 ++++++++++++++- 3 files changed, 128 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3fd8bc560fae..a9d17a7686ab 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include /* @@ -187,6 +189,70 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */ +#define PTI_SWITCH_MASK (1< in kernel */ SWAPGS xorl %ebx, %ebx -1: ret + +1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + + ret END(paranoid_entry) /* @@ -1266,6 +1287,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + RESTORE_CR3 save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: @@ -1293,6 +1315,8 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax .Lerror_entry_from_usermode_after_swapgs: /* Put us onto the real thread stack. */ @@ -1339,6 +1363,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1348,10 +1373,11 @@ ENTRY(error_entry) .Lerror_bad_iret: /* - * We came from an IRET to user mode, so we have user gsbase. - * Switch to kernel gsbase: + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* * Pretend that the exception came from user mode: set up pt_regs @@ -1383,6 +1409,10 @@ END(error_exit) /* * Runs on exception stack. Xen PV does not go through this path at all, * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ ENTRY(nmi) UNWIND_HINT_IRET_REGS @@ -1446,6 +1476,7 @@ ENTRY(nmi) swapgs cld + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -1698,6 +1729,8 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + RESTORE_CR3 save_reg=%r14 + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 95ad40eb7eff..05238b29895e 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -49,6 +49,10 @@ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ SWAPGS + + /* We are about to clobber %rsp anyway, clobbering here is OK */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq $0 /* pt_regs->r14 = 0 */ pushq $0 /* pt_regs->r15 = 0 */ + /* + * We just saved %rdi so it is safe to clobber. It is not + * preserved during the C calls inside TRACE_IRQS_OFF anyway. + */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. @@ -256,10 +266,22 @@ sysret32_from_system_call: * when the system call started, which is already known to user * code. We zero R8-R10 to avoid info leaks. */ + movq RSP-ORIG_RAX(%rsp), %rsp + + /* + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored + * on the process stack which is not mapped to userspace and + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 + * switch until after after the last reference to the process + * stack. + * + * %r8 is zeroed before the sysret, thus safe to clobber. + */ + SWITCH_TO_USER_CR3 scratch_reg=%r8 + xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 - movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl END(entry_SYSCALL_compat) -- cgit v1.2.3 From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:36 +0100 Subject: x86/mm/pti: Add infrastructure for page table isolation Add the initial files for kernel page table isolation, with a minimal init function and the boot time detection for this misfeature. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 2 + arch/x86/boot/compressed/pagetable.c | 3 + arch/x86/entry/calling.h | 7 +++ arch/x86/include/asm/pti.h | 14 +++++ arch/x86/mm/Makefile | 7 ++- arch/x86/mm/init.c | 2 + arch/x86/mm/pti.c | 84 +++++++++++++++++++++++++ include/linux/pti.h | 11 ++++ init/main.c | 3 + 9 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/pti.h create mode 100644 arch/x86/mm/pti.c create mode 100644 include/linux/pti.h (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 05496622b4ef..5dfd26265484 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2685,6 +2685,8 @@ steal time is computed, but won't influence scheduler behaviour + nopti [X86-64] Disable kernel page table isolation + nolapic [X86-32,APIC] Do not enable or use the local APIC. nolapic_timer [X86-32,APIC] Do not use the local APIC timer. diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 972319ff5b01..e691ff734cb5 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -23,6 +23,9 @@ */ #undef CONFIG_AMD_MEM_ENCRYPT +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a9d17a7686ab..3d3389a92c33 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -205,18 +205,23 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI mov %cr3, \scratch_reg ADJUST_KERNEL_CR3 \scratch_reg mov \scratch_reg, %cr3 +.Lend_\@: .endm .macro SWITCH_TO_USER_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI mov %cr3, \scratch_reg ADJUST_USER_CR3 \scratch_reg mov \scratch_reg, %cr3 +.Lend_\@: .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI movq %cr3, \scratch_reg movq \scratch_reg, \save_reg /* @@ -233,11 +238,13 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro RESTORE_CR3 save_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI /* * The CR3 write could be avoided when not changing its value, * but would require a CR3 read *and* a scratch register. */ movq \save_reg, %cr3 +.Lend_\@: .endm #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h new file mode 100644 index 000000000000..0b5ef05b2d2d --- /dev/null +++ b/arch/x86/include/asm/pti.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _ASM_X86_PTI_H +#define _ASM_X86_PTI_H +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern void pti_init(void); +extern void pti_check_boottime_disable(void); +#else +static inline void pti_check_boottime_disable(void) { } +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2e0017af8f9b..52906808e277 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 020223420308..af75069fb116 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -630,6 +631,7 @@ void __init init_mem_mapping(void) { unsigned long end; + pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..375f23a758bc --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,84 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner + * Signed-off-by: Moritz Lipp + * Signed-off-by: Daniel Gruss + * Signed-off-by: Michael Schwarz + * + * Major changes to the original code by: Dave Hansen + * Mostly rewritten by Thomas Gleixner and + * Andy Lutomirsky + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_print_if_insecure("disabled on command line."); + return; + } + + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + return; + + setup_force_cpu_cap(X86_FEATURE_PTI); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); +} diff --git a/include/linux/pti.h b/include/linux/pti.h new file mode 100644 index 000000000000..0174883a935a --- /dev/null +++ b/include/linux/pti.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _INCLUDE_PTI_H +#define _INCLUDE_PTI_H + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#include +#else +static inline void pti_init(void) { } +#endif + +#endif diff --git a/init/main.c b/init/main.c index 8a390f60ec81..b32ec72cdf3d 100644 --- a/init/main.c +++ b/init/main.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -506,6 +507,8 @@ static void __init mm_init(void) ioremap_huge_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); + /* Should be run after espfix64 is set up. */ + pti_init(); } asmlinkage __visible void __init start_kernel(void) -- cgit v1.2.3 From 41f4c20b57a4890ea7f56ff8717cc83fefb8d537 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Dec 2017 14:39:52 +0100 Subject: x86/pti: Add the pti= cmdline option and documentation Keep the "nopti" optional for traditional reasons. [ tglx: Don't allow force on when running on XEN PV and made 'on' printout conditional ] Requested-by: Linus Torvalds Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171212133952.10177-1-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/mm/pti.c | 26 ++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5dfd26265484..520fdec15bbb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3255,6 +3255,12 @@ pt. [PARIDE] See Documentation/blockdev/paride.txt. + pti= [X86_64] + Control user/kernel address space isolation: + on - enable + off - disable + auto - default setting + pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 375f23a758bc..a13f6b109865 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -54,21 +54,45 @@ static void __init pti_print_if_insecure(const char *reason) pr_info("%s\n", reason); } +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + pr_info("%s\n", reason); +} + void __init pti_check_boottime_disable(void) { + char arg[5]; + int ret; + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { pti_print_if_insecure("disabled on XEN PV."); return; } + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) + goto autosel; + } + if (cmdline_find_option_bool(boot_command_line, "nopti")) { pti_print_if_insecure("disabled on command line."); return; } +autosel: if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) return; - +enable: setup_force_cpu_cap(X86_FEATURE_PTI); } -- cgit v1.2.3 From 61e9b3671007a5da8127955a1a3bda7e0d5f42e8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:37 +0100 Subject: x86/mm/pti: Add mapping helper functions Add the pagetable helper functions do manage the separate user space page tables. [ tglx: Split out from the big combo kaiser patch. Folded Andys simplification and made it out of line as Boris suggested ] Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 6 ++- arch/x86/include/asm/pgtable_64.h | 92 +++++++++++++++++++++++++++++++++++++++ arch/x86/mm/pti.c | 41 +++++++++++++++++ 3 files changed, 138 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f735c3016325..af38d93c4fbb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -909,7 +909,11 @@ static inline int pgd_none(pgd_t pgd) * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +/* + * a shortcut to get a pgd_t in a given mm + */ +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e9f05331e732..81462e9a34f6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); +#else *p4dp = p4d; +#endif } static inline void native_p4d_clear(p4d_t *p4d) @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + *pgdp = pti_set_user_pgd(pgdp, pgd); +#else *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a13f6b109865..69a983365392 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -96,6 +96,47 @@ enable: setup_force_cpu_cap(X86_FEATURE_PTI); } +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables. + * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + /* * Initialize kernel page table isolation */ -- cgit v1.2.3 From 1c4de1ff4fe50453b968579ee86fac3da80dd783 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:38 +0100 Subject: x86/mm/pti: Allow NX poison to be set in p4d/pgd With PAGE_TABLE_ISOLATION the user portion of the kernel page tables is poisoned with the NX bit so if the entry code exits with the kernel page tables selected in CR3, userspace crashes. But doing so trips the p4d/pgd_bad() checks. Make sure it does not do that. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index af38d93c4fbb..2d2d07300b4a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -846,7 +846,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) static inline int p4d_bad(p4d_t p4d) { - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -880,7 +885,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + unsigned long ignore_flags = _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) -- cgit v1.2.3 From d9e9a6418065bb376e5de8d93ce346939b9a37a6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:39 +0100 Subject: x86/mm/pti: Allocate a separate user PGD Kernel page table isolation requires to have two PGDs. One for the kernel, which contains the full kernel mapping plus the user space mapping and one for user space which contains the user space mappings and the minimal set of kernel mappings which are required by the architecture to be able to transition from and to user space. Add the necessary preliminaries. [ tglx: Split out from the big kaiser dump. EFI fixup from Kirill ] Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgalloc.h | 11 +++++++++++ arch/x86/kernel/head_64.S | 30 +++++++++++++++++++++++++++--- arch/x86/mm/pgtable.c | 5 +++-- arch/x86/platform/efi/efi_64.c | 5 ++++- 4 files changed, 45 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 4b5e1eafada7..aff42e1da6ee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Instead of one PGD, we acquire two PGDs. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + /* * Allocate and free page tables. */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -350,13 +371,14 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..9b7bcbd33cc2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else + static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 20fb31579b69..39c4b35ac7a4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -195,6 +195,9 @@ static pgd_t *efi_pgd; * because we want to avoid inserting EFI region mappings (EFI_VA_END * to EFI_VA_START) into the standard kernel page tables. Everything * else can be shared, see efi_sync_low_kernel_mappings(). + * + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the + * allocation. */ int __init efi_alloc_page_tables(void) { @@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void) return 0; gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_page(gfp_mask); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); if (!efi_pgd) return -ENOMEM; -- cgit v1.2.3 From fc2fbc8512ed08d1de7720936fd7d2e4ce02c3a2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:40 +0100 Subject: x86/mm/pti: Populate user PGD In clone_pgd_range() copy the init user PGDs which cover the kernel half of the address space, so a process has all the required kernel mappings visible. [ tglx: Split out from the big kaiser dump and folded Andys simplification ] Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2d2d07300b4a..cc6fa75884e9 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1119,7 +1119,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + /* Clone the user space pgd as well */ + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), + count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) -- cgit v1.2.3 From 03f4424f348e8be95eb1bbeba09461cd7b867828 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:42 +0100 Subject: x86/mm/pti: Add functions to clone kernel PMDs Provide infrastructure to: - find a kernel PMD for a mapping which must be visible to user space for the entry/exit code to work. - walk an address range and share the kernel PMD with it. This reuses a small part of the original KAISER patches to populate the user space page table. [ tglx: Made it universally usable so it can be used for any kind of shared mapping. Add a mechanism to clear specific bits in the user space visible PMD entry. Folded Andys simplifactions ] Originally-by: Dave Hansen Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pti.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 69a983365392..d58bcee470fc 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -48,6 +48,11 @@ #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK 0 +#endif + static void __init pti_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) @@ -137,6 +142,128 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) return pgd; } +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (!new_p4d_page) + return NULL; + + if (pgd_none(*pgd)) { + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + new_p4d_page = 0; + } + if (new_p4d_page) + free_page(new_p4d_page); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + pud_t *pud; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (!new_pud_page) + return NULL; + + if (p4d_none(*p4d)) { + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + new_pud_page = 0; + } + if (new_pud_page) + free_page(new_pud_page); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + new_pmd_page = 0; + } + if (new_pmd_page) + free_page(new_pmd_page); + } + + return pmd_offset(pud, address); +} + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes. + */ + for (addr = start; addr < end; addr += PMD_SIZE) { + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = pmd_clear_flags(*pmd, clear); + } +} + /* * Initialize kernel page table isolation */ -- cgit v1.2.3 From 8d4b067895791ab9fdb1aadfc505f64d71239dd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:43 +0100 Subject: x86/mm/pti: Force entry through trampoline when PTI active Force the entry through the trampoline only when PTI is active. Otherwise go through the normal entry code. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a9210f9b7cf8..f2a94dfb434e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1339,7 +1339,10 @@ void syscall_init(void) (entry_SYSCALL_64_trampoline - _entry_trampoline); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); -- cgit v1.2.3 From f7cfbee91559ca7e3e961a00ffac921208a115ad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 4 Dec 2017 15:07:45 +0100 Subject: x86/mm/pti: Share cpu_entry_area with user space page tables Share the cpu entry area so the user space and kernel space page tables have the same P4D page. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pti.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index d58bcee470fc..59290356f19f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -264,6 +264,29 @@ pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) } } +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ + pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + /* * Initialize kernel page table isolation */ @@ -273,4 +296,6 @@ void __init pti_init(void) return; pr_info("enabled\n"); + + pti_clone_user_shared(); } -- cgit v1.2.3 From 2f7412ba9c6af5ab16bdbb4a3fdb1dcd2b4fd3c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:46 +0100 Subject: x86/entry: Align entry text section to PMD boundary The (irq)entry text must be visible in the user space page tables. To allow simple PMD based sharing, make the entry text PMD aligned. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d2a8b5a24a44..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies; . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + #else #define X64_ALIGN_RODATA_BEGIN #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END + #endif PHDRS { @@ -102,8 +108,10 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT IRQENTRY_TEXT + ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) -- cgit v1.2.3 From 6dc72c3cbca0580642808d677181cad4c6433893 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:47 +0100 Subject: x86/mm/pti: Share entry text PMD Share the entry text PMD of the kernel mapping with the user space mapping. If large pages are enabled this is a single PMD entry and at the point where it is copied into the user page table the RW bit has not been cleared yet. Clear it right away so the user space visible map becomes RX. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pti.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 59290356f19f..0e78797650a7 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -287,6 +287,15 @@ static void __init pti_clone_user_shared(void) pti_clone_p4d(CPU_ENTRY_AREA_BASE); } +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ + pti_clone_pmds((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, _PAGE_RW); +} + /* * Initialize kernel page table isolation */ @@ -298,4 +307,5 @@ void __init pti_init(void) pr_info("enabled\n"); pti_clone_user_shared(); + pti_clone_entry_text(); } -- cgit v1.2.3 From 4b6bbe95b87966ba08999574db65c93c5e925a36 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 15 Dec 2017 22:08:18 +0100 Subject: x86/mm/pti: Map ESPFIX into user space Map the ESPFIX pages into user space when PTI is enabled. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/pti.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 0e78797650a7..b1c38ef9fbbb 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -287,6 +287,16 @@ static void __init pti_clone_user_shared(void) pti_clone_p4d(CPU_ENTRY_AREA_BASE); } +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + /* * Clone the populated PMDs of the entry and irqentry text and force it RO. */ @@ -308,4 +318,5 @@ void __init pti_init(void) pti_clone_user_shared(); pti_clone_entry_text(); + pti_setup_espfix64(); } -- cgit v1.2.3 From 10043e02db7f8a4161f76434931051e7d797a5f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:49 +0100 Subject: x86/cpu_entry_area: Add debugstore entries to cpu_entry_area The Intel PEBS/BTS debug store is a design trainwreck as it expects virtual addresses which must be visible in any execution context. So it is required to make these mappings visible to user space when kernel page table isolation is active. Provide enough room for the buffer mappings in the cpu_entry_area so the buffers are available in the user space visible page tables. At the point where the kernel side entry area is populated there is no buffer available yet, but the kernel PMD must be populated. To achieve this set the entries for these buffers to non present. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 5 +++-- arch/x86/events/perf_event.h | 21 ++------------------ arch/x86/include/asm/cpu_entry_area.h | 13 +++++++++++++ arch/x86/include/asm/intel_ds.h | 36 +++++++++++++++++++++++++++++++++++ arch/x86/mm/cpu_entry_area.c | 27 ++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 21 deletions(-) create mode 100644 arch/x86/include/asm/intel_ds.h (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3674a4b6f8bd..6522f0279cb8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -8,11 +8,12 @@ #include "../perf_event.h" +/* Waste a full page so it can be mapped into the cpu_entry_area */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f7aaadf9331f..373f9eda80b1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,8 @@ #include +#include + /* To enable MSR tracing please use the generic trace points. */ /* @@ -77,8 +79,6 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* @@ -95,23 +95,6 @@ struct amd_nb { PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - #define PEBS_REGS \ (PERF_REG_X86_AX | \ PERF_REG_X86_BX | \ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 2fbc69a0916e..4a7884b8dca5 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -5,6 +5,7 @@ #include #include +#include /* * cpu_entry_area is a percpu region that contains things needed by the CPU @@ -40,6 +41,18 @@ struct cpu_entry_area { */ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; #endif +#ifdef CONFIG_CPU_SUP_INTEL + /* + * Per CPU debug store for Intel performance monitoring. Wastes a + * full page at the moment. + */ + struct debug_store cpu_debug_store; + /* + * The actual PEBS/BTS buffers must be mapped to user space + * Reserve enough fixmap PTEs. + */ + struct debug_store_buffers cpu_debug_buffers; +#endif }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h new file mode 100644 index 000000000000..62a9f4966b42 --- /dev/null +++ b/arch/x86/include/asm/intel_ds.h @@ -0,0 +1,36 @@ +#ifndef _ASM_INTEL_DS_H +#define _ASM_INTEL_DS_H + +#include + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +} __aligned(PAGE_SIZE); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + +struct debug_store_buffers { + char bts_buffer[BTS_BUFFER_SIZE]; + char pebs_buffer[PEBS_BUFFER_SIZE]; +}; + +#endif diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index fe814fd5e014..b9283cc27622 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + /* Setup the fixmap mappings only once per-processor */ static void __init setup_cpu_entry_area(int cpu) { @@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu) cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); #endif + percpu_setup_debug_store(cpu); } static __init void setup_cpu_entry_area_ptes(void) -- cgit v1.2.3 From c1961a4631daef4aeabee8e368b1b13e8f173c91 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 4 Dec 2017 15:07:50 +0100 Subject: x86/events/intel/ds: Map debug buffers in cpu_entry_area The BTS and PEBS buffers both have their virtual addresses programmed into the hardware. This means that any access to them is performed via the page tables. The times that the hardware accesses these are entirely dependent on how the performance monitoring hardware events are set up. In other words, there is no way for the kernel to tell when the hardware might access these buffers. To avoid perf crashes, place 'debug_store' allocate pages and map them into the cpu_entry_area. The PEBS fixup buffer does not need this treatment. [ tglx: Got rid of the kaiser_add_mapping() complication ] Signed-off-by: Hugh Dickins Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 125 +++++++++++++++++++++++++++---------------- arch/x86/events/perf_event.h | 2 + 2 files changed, 82 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6522f0279cb8..8f0aace08b87 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -280,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); -static int alloc_pebs_buffer(int cpu) +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + phys_addr_t pa; + size_t msz = 0; + + pa = virt_to_phys(addr); + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, pa, prot); +} + +static void ds_clear_cea(void *cea, size_t size) +{ + size_t msz = 0; + + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +} + +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +{ + unsigned int order = get_order(size); int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; + struct page *page; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + return page ? page_address(page) : NULL; +} + +static void dsfree_pages(const void *buffer, size_t size) +{ + if (buffer) + free_pages((unsigned long)buffer, get_order(size)); +} + +static int alloc_pebs_buffer(int cpu) +{ + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + size_t bsiz = x86_pmu.pebs_buffer_size; + int max, node = cpu_to_node(cpu); + void *buffer, *ibuffer, *cea; if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; @@ -301,25 +337,27 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree_pages(buffer, bsiz); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; } - - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_pebs_vaddr = buffer; + /* Update the cpu entry area mapping */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds->pebs_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; return 0; } static void release_pebs_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.pebs) return; @@ -327,73 +365,70 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->ds_pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *buffer, *cea; + int max; if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_bts_vaddr = buffer; + /* Update the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds->bts_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); + ds->bts_absolute_maximum = ds->bts_buffer_base + max; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); return 0; } static void release_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds_clear_cea(cea, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); + hwev->ds_bts_vaddr = NULL; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; - return 0; } static void release_ds_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 373f9eda80b1..8e4ea143ed96 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -199,6 +199,8 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; + void *ds_pebs_vaddr; + void *ds_bts_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; -- cgit v1.2.3 From 9f449772a3106bcdd4eb8fdeb281147b0e99fb30 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 12 Dec 2017 07:56:44 -0800 Subject: x86/mm/64: Make a full PGD-entry size hole in the memory map Shrink vmalloc space from 16384TiB to 12800TiB to enlarge the hole starting at 0xff90000000000000 to be a full PGD entry. A subsequent patch will use this hole for the pagetable isolation LDT alias. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 4 ++-- arch/x86/include/asm/pgtable_64_types.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 51101708a03a..496a1dbf139d 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -29,8 +29,8 @@ Virtual memory map with 5 level page tables: hole caused by [56:63] sign extension ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory -ff90000000000000 - ff91ffffffffffff (=49 bits) hole -ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space +ff90000000000000 - ff9fffffffffffff (=52 bits) hole +ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3d27831bc58d..83e9489ae944 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -79,8 +79,8 @@ typedef struct { pteval_t pte; } pte_t; #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #ifdef CONFIG_X86_5LEVEL -# define VMALLOC_SIZE_TB _AC(16384, UL) -# define __VMALLOC_BASE _AC(0xff92000000000000, UL) +# define VMALLOC_SIZE_TB _AC(12800, UL) +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) #else # define VMALLOC_SIZE_TB _AC(32, UL) -- cgit v1.2.3 From f55f0501cbf65ec41cca5058513031b711730b1d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 12 Dec 2017 07:56:45 -0800 Subject: x86/pti: Put the LDT in its own PGD if PTI is on With PTI enabled, the LDT must be mapped in the usermode tables somewhere. The LDT is per process, i.e. per mm. An earlier approach mapped the LDT on context switch into a fixmap area, but that's a big overhead and exhausted the fixmap space when NR_CPUS got big. Take advantage of the fact that there is an address space hole which provides a completely unused pgd. Use this pgd to manage per-mm LDT mappings. This has a down side: the LDT isn't (currently) randomized, and an attack that can write the LDT is instant root due to call gates (thanks, AMD, for leaving call gates in AMD64 but designing them wrong so they're only useful for exploits). This can be mitigated by making the LDT read-only or randomizing the mapping, either of which is strightforward on top of this patch. This will significantly slow down LDT users, but that shouldn't matter for important workloads -- the LDT is only used by DOSEMU(2), Wine, and very old libc implementations. [ tglx: Cleaned it up. ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 3 +- arch/x86/include/asm/mmu_context.h | 59 ++++++++++++-- arch/x86/include/asm/pgtable_64_types.h | 4 + arch/x86/include/asm/processor.h | 23 ++++-- arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++- arch/x86/mm/dump_pagetables.c | 9 +++ 6 files changed, 220 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 496a1dbf139d..ad41b3813f0a 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... +fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... @@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables: hole caused by [56:63] sign extension ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory -ff90000000000000 - ff9fffffffffffff (=52 bits) hole +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5ede7cae1d67..c931b88982a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -50,10 +50,33 @@ struct ldt_struct { * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ - struct desc_struct *entries; - unsigned int nr_entries; + struct desc_struct *entries; + unsigned int nr_entries; + + /* + * If PTI is in use, then the entries array is not mapped while we're + * in user mode. The whole array will be aliased at the addressed + * given by ldt_slot_va(slot). We use two slots so that we can allocate + * and map, and enable a new LDT without invalidating the mapping + * of an older, still-in-use LDT. + * + * slot will be -1 if this LDT doesn't have an alias mapping. + */ + int slot; }; +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ +#ifdef CONFIG_X86_64 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +#else + BUG(); +#endif +} + /* * Used for LDT copy/destruction. */ @@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm) } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); +void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, @@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm, { return 0; } -static inline void destroy_context_ldt(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) { } +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif static inline void load_mm_ldt(struct mm_struct *mm) @@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) * that we can see. */ - if (unlikely(ldt)) - set_ldt(ldt->entries, ldt->nr_entries); - else + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { clear_LDT(); + } #else clear_LDT(); #endif @@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); + ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 83e9489ae944..b97a539bcdee 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -82,10 +82,14 @@ typedef struct { pteval_t pte; } pte_t; # define VMALLOC_SIZE_TB _AC(12800, UL) # define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define LDT_PGD_ENTRY _AC(-112, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #else # define VMALLOC_SIZE_TB _AC(32, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define LDT_PGD_ENTRY _AC(-4, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif #ifdef CONFIG_RANDOMIZE_MEMORY diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9e482d8b0b97..9c18da64daa9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x) #else /* - * User space process size. 47bits minus one guard page. The guard - * page is necessary on Intel CPUs: if a SYSCALL instruction is at - * the highest possible canonical userspace address, then that - * syscall will enter the kernel with a non-canonical return - * address, and SYSRET will explode dangerously. We avoid this - * particular problem by preventing anything from being mapped - * at the maximum canonical address. + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] */ #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a6b5d62f45a7..9629c5d8267a 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -51,13 +52,11 @@ static void refresh_ldt_segments(void) static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); refresh_ldt_segments(); } @@ -94,10 +93,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doens't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { @@ -156,6 +266,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -174,6 +290,11 @@ void destroy_context_ldt(struct mm_struct *mm) mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; @@ -287,6 +408,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + free_ldt_struct(old_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 43dedbfb7257..690eaf31ca34 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -52,11 +52,17 @@ enum address_markers_idx { USER_SPACE_NR = 0, KERNEL_SPACE_NR, LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, +#endif +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) + LDT_NR, #endif CPU_ENTRY_AREA_NR, #ifdef CONFIG_X86_ESPFIX64 @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 -- cgit v1.2.3 From 85900ea51577e31b186e523c8f4e068c79ecc7d3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 12 Dec 2017 07:56:42 -0800 Subject: x86/pti: Map the vsyscall page if needed Make VSYSCALLs work fully in PTI mode by mapping them properly to the user space visible page tables. [ tglx: Hide unused functions (Patch by Arnd Bergmann) ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 6 ++-- arch/x86/include/asm/vsyscall.h | 1 + arch/x86/mm/pti.c | 65 +++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 1faf40f2dda9..577fa8adb785 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr) * vsyscalls but leave the page not present. If so, we skip calling * this. */ -static void __init set_vsyscall_pgtable_user_bits(void) +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; - pgd = pgd_offset_k(VSYSCALL_ADDR); + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 @@ -373,7 +373,7 @@ void __init map_vsyscall(void) vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); - set_vsyscall_pgtable_user_bits(); + set_vsyscall_pgtable_user_bits(swapper_pg_dir); } BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d9a7c659009c..b986b2ca688a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -7,6 +7,7 @@ #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); +extern void set_vsyscall_pgtable_user_bits(pgd_t *root); /* * Called on instruction fetch fault in vsyscall page. diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index b1c38ef9fbbb..bce8aea65606 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -223,6 +224,69 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) return pmd_offset(pud, address); } +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pte_t *pte; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + new_pte_page = 0; + } + if (new_pte_page) + free_page(new_pte_page); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + static void __init pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) { @@ -319,4 +383,5 @@ void __init pti_init(void) pti_clone_user_shared(); pti_clone_entry_text(); pti_setup_espfix64(); + pti_setup_vsyscall(); } -- cgit v1.2.3 From 2ea907c4fe7b78e5840c1dc07800eae93248cad1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:57 +0100 Subject: x86/mm: Allow flushing for future ASID switches If changing the page tables in such a way that an invalidation of all contexts (aka. PCIDs / ASIDs) is required, they can be actively invalidated by: 1. INVPCID for each PCID (works for single pages too). 2. Load CR3 with each PCID without the NOFLUSH bit set 3. Load CR3 with the NOFLUSH bit set for each and do INVLPG for each address. But, none of these are really feasible since there are ~6 ASIDs (12 with PAGE_TABLE_ISOLATION) at the time that invalidation is required. Instead of actively invalidating them, invalidate the *current* context and also mark the cpu_tlbstate _quickly_ to indicate future invalidation to be required. At the next context-switch, look for this indicator ('invalidate_other' being set) invalidate all of the cpu_tlbstate.ctxs[] entries. This ensures that any future context switches will do a full flush of the TLB, picking up the previous changes. [ tglx: Folded more fixups from Peter ] Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++++++-------- arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 171b429f43a2..490a706fdba8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -134,6 +134,17 @@ struct tlb_state { */ bool is_lazy; + /* + * If set we changed the page tables in such a way that we + * needed an invalidation of all contexts (aka. PCIDs / ASIDs). + * This tells us to go invalidate all the non-loaded ctxs[] + * on the next context switch. + * + * The current ctx was kept up-to-date as it ran and does not + * need to be invalidated. + */ + bool invalidate_other; + /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. @@ -211,6 +222,14 @@ static inline unsigned long cr4_read_shadow(void) return this_cpu_read(cpu_tlbstate.cr4); } +/* + * Mark all other ASIDs as invalid, preserves the current. + */ +static inline void invalidate_other_asid(void) +{ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + /* * Save some of cr4 feature set we're using (e.g. Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot @@ -298,14 +317,6 @@ static inline void __flush_tlb_all(void) */ __flush_tlb(); } - - /* - * Note: if we somehow had PCID but not PGE, then this wouldn't work -- - * we'd end up flushing kernel translations for the current ASID but - * we might fail to flush kernel translations for other cached ASIDs. - * - * To avoid this issue, we force PCID off if PGE is off. - */ } /* @@ -315,6 +326,16 @@ static inline void __flush_tlb_one(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * __flush_tlb_single() will have cleared the TLB entry for this ASID, + * but since kernel space is replicated across all, we must also + * invalidate all others. + */ + invalidate_other_asid(); } #define TLB_FLUSH_ALL -1UL diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0a1be3adc97e..254c9eb79fe5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages. + */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != next->context.ctx_id) -- cgit v1.2.3 From 48e111982cda033fec832c6b0592c2acedd85d04 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:07:58 +0100 Subject: x86/mm: Abstract switching CR3 In preparation to adding additional PCID flushing, abstract the loading of a new ASID into CR3. [ PeterZ: Split out from big combo patch ] Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 254c9eb79fe5..42a8875f73fe 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -100,6 +100,24 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -230,7 +248,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next->pgd, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, true); /* * NB: This gets called via leave_mm() in the idle path @@ -243,7 +261,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next->pgd, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, false); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); -- cgit v1.2.3 From 6fd166aae78c0ab738d49bda653cbd9e3b1491cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Dec 2017 15:07:59 +0100 Subject: x86/mm: Use/Fix PCID to optimize user/kernel switches We can use PCID to retain the TLBs across CR3 switches; including those now part of the user/kernel switch. This increases performance of kernel entry/exit at the cost of more expensive/complicated TLB flushing. Now that we have two address spaces, one for kernel and one for user space, we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID (just like we use the PFN LSB for the PGD). Since we do TLB invalidation from kernel space, the existing code will only invalidate the kernel PCID, we augment that by marking the corresponding user PCID invalid, and upon switching back to userspace, use a flushing CR3 write for the switch. In order to access the user_pcid_flush_mask we use PER_CPU storage, which means the previously established SWAPGS vs CR3 ordering is now mandatory and required. Having to do this memory access does require additional registers, most sites have a functioning stack and we can spill one (RAX), sites without functional stack need to otherwise provide the second scratch register. Note: PCID is generally available on Intel Sandybridge and later CPUs. Note: Up until this point TLB flushing was broken in this series. Based-on-code-from: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 72 ++++++++++++++++++----- arch/x86/entry/entry_64.S | 9 +-- arch/x86/entry/entry_64_compat.S | 4 +- arch/x86/include/asm/processor-flags.h | 5 ++ arch/x86/include/asm/tlbflush.h | 91 +++++++++++++++++++++++++---- arch/x86/include/uapi/asm/processor-flags.h | 7 ++- arch/x86/kernel/asm-offsets.c | 4 ++ arch/x86/mm/init.c | 2 +- arch/x86/mm/tlb.c | 1 + 9 files changed, 162 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3d3389a92c33..7894e5c0eef7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include /* @@ -191,17 +194,21 @@ For 32-bit we have the following conventions - kernel is built with #ifdef CONFIG_PAGE_TABLE_ISOLATION -/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */ -#define PTI_SWITCH_MASK (1< #include #include -#include "calling.h" #include #include #include @@ -40,6 +39,8 @@ #include #include +#include "calling.h" + .code64 .section .entry.text, "ax" @@ -406,7 +407,7 @@ syscall_return_via_sysret: * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ - SWITCH_TO_USER_CR3 scratch_reg=%rdi + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi popq %rsp @@ -744,7 +745,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * We can do future final exit work right here. */ - SWITCH_TO_USER_CR3 scratch_reg=%rdi + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi /* Restore RDI. */ popq %rdi @@ -857,7 +858,7 @@ native_irq_return_ldt: */ orq PER_CPU_VAR(espfix_stack), %rax - SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi SWAPGS /* to user GS */ popq %rdi /* Restore user RDI */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 05238b29895e..40f17009ec20 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -275,9 +275,9 @@ sysret32_from_system_call: * switch until after after the last reference to the process * stack. * - * %r8 is zeroed before the sysret, thus safe to clobber. + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. */ - SWITCH_TO_USER_CR3 scratch_reg=%r8 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 xorq %r8, %r8 xorq %r9, %r9 diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 43212a43ee69..6a60fea90b9d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -38,6 +38,11 @@ #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_SWITCH_BIT 11 +#endif + #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 490a706fdba8..5dcc38b16604 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { @@ -24,24 +26,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) /* There are 12 bits of space for ASIDS in CR3 */ #define CR3_HW_ASID_BITS 12 + /* * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#define PTI_CONSUMED_ASID_BITS 0 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) -#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) /* * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account * for them being zero-based. Another -1 is because ASID 0 is reserved for * use by non-PCID-aware users. */ -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * If current->mm == NULL then we borrow a mm which may change + * during a task switch and therefore we must not be preempted + * while we write CR3 back: */ preempt_disable(); native_write_cr3(__native_read_cr3()); @@ -301,7 +361,14 @@ static inline void __native_flush_tlb_global(void) */ static inline void __native_flush_tlb_single(unsigned long addr) { + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + invalidate_user_asid(loaded_mm_asid); } /* diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 53b4ca55ebb6..97abdaab9535 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -78,7 +78,12 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS 12 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 676b7cf4b62b..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -94,6 +95,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index af75069fb116..caeb8a7bf0a4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -855,7 +855,7 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 42a8875f73fe..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -105,6 +105,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) unsigned long new_mm_cr3; if (need_flush) { + invalidate_user_asid(new_asid); new_mm_cr3 = build_cr3(pgdir, new_asid); } else { new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); -- cgit v1.2.3 From 21e94459110252d41b45c0c8ba50fd72a664d50c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Dec 2017 15:08:00 +0100 Subject: x86/mm: Optimize RESTORE_CR3 Most NMI/paranoid exceptions will not in fact change pagetables and would thus not require TLB flushing, however RESTORE_CR3 uses flushing CR3 writes. Restores to kernel PCIDs can be NOFLUSH, because we explicitly flush the kernel mappings and now that we track which user PCIDs need flushing we can avoid those too when possible. This does mean RESTORE_CR3 needs an additional scratch_reg, luckily both sites have plenty available. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 30 ++++++++++++++++++++++++++++-- arch/x86/entry/entry_64.S | 4 ++-- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 7894e5c0eef7..45a63e00a6af 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -281,8 +281,34 @@ For 32-bit we have the following conventions - kernel is built with .Ldone_\@: .endm -.macro RESTORE_CR3 save_reg:req +.macro RESTORE_CR3 scratch_reg:req save_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * KERNEL pages can always resume with NOFLUSH as we do + * explicit flushes. + */ + bt $X86_CR3_PTI_SWITCH_BIT, \save_reg + jnc .Lnoflush_\@ + + /* + * Check if there's a pending flush for the user ASID we're + * about to set. + */ + movq \save_reg, \scratch_reg + andq $(0x7FF), \scratch_reg + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + jmp .Lwrcr3_\@ + +.Lnoflush_\@: + SET_NOFLUSH_BIT \save_reg + +.Lwrcr3_\@: /* * The CR3 write could be avoided when not changing its value, * but would require a CR3 read *and* a scratch register. @@ -301,7 +327,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req .endm -.macro RESTORE_CR3 save_reg:req +.macro RESTORE_CR3 scratch_reg:req save_reg:req .endm #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fd501844af1f..ed31d00dc5ee 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1288,7 +1288,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ - RESTORE_CR3 save_reg=%r14 + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: @@ -1730,7 +1730,7 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi - RESTORE_CR3 save_reg=%r14 + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore -- cgit v1.2.3 From 6cff64b86aaaa07f89f50498055a20e45754b0c1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 4 Dec 2017 15:08:01 +0100 Subject: x86/mm: Use INVPCID for __native_flush_tlb_single() This uses INVPCID to shoot down individual lines of the user mapping instead of marking the entire user map as invalid. This could/might/possibly be faster. This for sure needs tlb_single_page_flush_ceiling to be redetermined; esp. since INVPCID is _slow_. A detailed performance analysis is available here: https://lkml.kernel.org/r/3062e486-3539-8a1f-5724-16199420be71@intel.com [ Peterz: Split out from big combo patch ] Signed-off-by: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/tlbflush.h | 23 +++++++++++++- arch/x86/mm/init.c | 64 ++++++++++++++++++++++---------------- 3 files changed, 60 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d8ec834ea884..07cdd1715705 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -197,6 +197,7 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5dcc38b16604..57072a1052fe 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -85,6 +85,18 @@ static inline u16 kern_pcid(u16 asid) return asid + 1; } +/* + * The user PCID is just the kernel one, plus the "switch bit". + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_SWITCH_BIT; +#endif + return ret; +} + struct pgd_t; static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) { @@ -335,6 +347,8 @@ static inline void __native_flush_tlb_global(void) /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -368,7 +382,14 @@ static inline void __native_flush_tlb_single(unsigned long addr) if (!static_cpu_has(X86_FEATURE_PTI)) return; - invalidate_user_asid(loaded_mm_asid); + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index caeb8a7bf0a4..80259ad8c386 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -203,34 +203,44 @@ static void __init probe_page_size_mask(void) static void setup_pcid(void) { -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCID)) { - if (boot_cpu_has(X86_FEATURE_PGE)) { - /* - * This can't be cr4_set_bits_and_update_boot() -- - * the trampoline code can't handle CR4.PCIDE and - * it wouldn't do any good anyway. Despite the name, - * cr4_set_bits_and_update_boot() doesn't actually - * cause the bits in question to remain set all the - * way through the secondary boot asm. - * - * Instead, we brute-force it and set CR4.PCIDE - * manually in start_secondary(). - */ - cr4_set_bits(X86_CR4_PCIDE); - } else { - /* - * flush_tlb_all(), as currently implemented, won't - * work if PCID is on but PGE is not. Since that - * combination doesn't exist on real hardware, there's - * no reason to try to fully support it, but it's - * polite to avoid corrupting data if we're on - * an improperly configured VM. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); - } + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + + /* + * INVPCID's single-context modes (2/3) only work if we set + * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable + * on systems that have X86_CR4_PCIDE clear, or that have + * no INVPCID support at all. + */ + if (boot_cpu_has(X86_FEATURE_INVPCID)) + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); } -#endif } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 0a126abd576ebc6403f063dbe20cf7416c9d9393 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Dec 2017 13:34:53 +0100 Subject: x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming Ideally we'd also use sparse to enforce this separation so it becomes much more difficult to mess up. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 55 ++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 57072a1052fe..b519da4fc03c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,16 +13,33 @@ #include #include -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) -{ - /* - * Bump the generation count. This also serves as a full barrier - * that synchronizes with switch_mm(): callers are required to order - * their read of mm_cpumask after their writes to the paging - * structures. - */ - return atomic64_inc_return(&mm->context.tlb_gen); -} +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ /* There are 12 bits of space for ASIDS in CR3 */ #define CR3_HW_ASID_BITS 12 @@ -41,7 +58,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) /* * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account - * for them being zero-based. Another -1 is because ASID 0 is reserved for + * for them being zero-based. Another -1 is because PCID 0 is reserved for * use by non-PCID-aware users. */ #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) @@ -52,6 +69,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) */ #define TLB_NR_DYN_ASIDS 6 +/* + * Given @asid, compute kPCID + */ static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); @@ -86,7 +106,7 @@ static inline u16 kern_pcid(u16 asid) } /* - * The user PCID is just the kernel one, plus the "switch bit". + * Given @asid, compute uPCID */ static inline u16 user_pcid(u16 asid) { @@ -484,6 +504,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + return atomic64_inc_return(&mm->context.tlb_gen); +} + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { -- cgit v1.2.3 From 5f26d76c3fd67c48806415ef8b1116c97beff8ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 19 Dec 2017 22:33:46 +0100 Subject: x86/dumpstack: Indicate in Oops whether PTI is configured and enabled CONFIG_PAGE_TABLE_ISOLATION is relatively new and intrusive feature that may still have some corner cases which could take some time to manifest and be fixed. It would be useful to have Oops messages indicate whether it was enabled for building the kernel, and whether it was disabled during boot. Example of fully enabled: Oops: 0001 [#1] SMP PTI Example of enabled during build, but disabled during boot: Oops: 0001 [#1] SMP NOPTI We can decide to remove this after the feature has been tested in the field long enough. [ tglx: Made it use boot_cpu_has() as requested by Borislav ] Signed-off-by: Vlastimil Babka Signed-off-by: Thomas Gleixner Reviewed-by: Eduardo Valentin Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: bpetkov@suse.de Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: jkosina@suse.cz Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 36b17e0febe8..5fa110699ed2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) -- cgit v1.2.3 From 75298aa179d56cd64f54e58a19fffc8ab922b4c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 4 Dec 2017 15:08:04 +0100 Subject: x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy The upcoming support for dumping the kernel and the user space page tables of the current process would create more random files in the top level debugfs directory. Add a page table directory and move the existing file to it. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/debug_pagetables.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..d1449fb6dc7a 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -22,21 +22,26 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static struct dentry *pe; +static struct dentry *dir, *pe; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, - &ptdump_fops); - if (!pe) + dir = debugfs_create_dir("page_tables", NULL); + if (!dir) return -ENOMEM; + pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); + if (!pe) + goto err; return 0; +err: + debugfs_remove_recursive(dir); + return -ENOMEM; } static void __exit pt_dump_debug_exit(void) { - debugfs_remove_recursive(pe); + debugfs_remove_recursive(dir); } module_init(pt_dump_debug_init); -- cgit v1.2.3 From b4bf4f924b1d7bade38fd51b2e401d20d0956e4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:08:05 +0100 Subject: x86/mm/dump_pagetables: Check user space page table for WX pages ptdump_walk_pgd_level_checkwx() checks the kernel page table for WX pages, but does not check the PAGE_TABLE_ISOLATION user space page table. Restructure the code so that dmesg output is selected by an explicit argument and not implicit via checking the pgd argument for !NULL. Add the check for the user space page table. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 1 + arch/x86/mm/debug_pagetables.c | 2 +- arch/x86/mm/dump_pagetables.c | 30 +++++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cc6fa75884e9..03780d5c41c5 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index d1449fb6dc7a..8e70c1599e51 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level(m, NULL); + ptdump_walk_pgd_level_debugfs(m, NULL); return 0; } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 690eaf31ca34..17f5b417f95e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -476,7 +476,7 @@ static inline bool is_hypervisor_range(int idx) } static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - bool checkwx) + bool checkwx, bool dmesg) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_top_pgt; @@ -489,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, if (pgd) { start = pgd; - st.to_dmesg = true; + st.to_dmesg = dmesg; } st.check_wx = checkwx; @@ -527,13 +527,33 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { - ptdump_walk_pgd_level_core(m, pgd, false); + ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = (pgd_t *) &init_top_pgt; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, NULL, true); + ptdump_walk_pgd_level_core(NULL, NULL, true, false); + ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) -- cgit v1.2.3 From a4b51ef6552c704764684cef7e753162dc87c5fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:08:06 +0100 Subject: x86/mm/dump_pagetables: Allow dumping current pagetables Add two debugfs files which allow to dump the pagetable of the current task. current_kernel dumps the regular page table. This is the page table which is normally shared between kernel and user space. If kernel page table isolation is enabled this is the kernel space mapping. If kernel page table isolation is enabled the second file, current_user, dumps the user space page table. These files allow to verify the resulting page tables for page table isolation, but even in the normal case its useful to be able to inspect user space page tables of current for debugging purposes. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/debug_pagetables.c | 71 +++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/dump_pagetables.c | 6 +++- 3 files changed, 73 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 03780d5c41c5..6b43d677f8ca 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,7 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); -void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index 8e70c1599e51..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level_debugfs(m, NULL); + ptdump_walk_pgd_level_debugfs(m, NULL, false); return 0; } @@ -22,7 +22,57 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static struct dentry *dir, *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curknl, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curusr, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) { @@ -30,9 +80,22 @@ static int __init pt_dump_debug_init(void) if (!dir) return -ENOMEM; - pe = debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); - if (!pe) + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, + &ptdump_fops); + if (!pe_knl) + goto err; + + pe_curknl = debugfs_create_file("current_kernel", 0400, + dir, NULL, &ptdump_curknl_fops); + if (!pe_curknl) + goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pe_curusr = debugfs_create_file("current_user", 0400, + dir, NULL, &ptdump_curusr_fops); + if (!pe_curusr) goto err; +#endif return 0; err: debugfs_remove_recursive(dir); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 17f5b417f95e..f56902c1f04b 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -530,8 +530,12 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) ptdump_walk_pgd_level_core(m, pgd, false, true); } -void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd) +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && static_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif ptdump_walk_pgd_level_core(m, pgd, false, false); } EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); -- cgit v1.2.3 From 9f5cb6b32d9e0a3a7453222baaf15664d92adbf2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Dec 2017 20:35:11 +0100 Subject: x86/ldt: Make the LDT mapping RO Now that the LDT mapping is in a known area when PAGE_TABLE_ISOLATION is enabled its a primary target for attacks, if a user space interface fails to validate a write address correctly. That can never happen, right? The SDM states: If the segment descriptors in the GDT or an LDT are placed in ROM, the processor can enter an indefinite loop if software or the processor attempts to update (write to) the ROM-based segment descriptors. To prevent this problem, set the accessed bits for all segment descriptors placed in a ROM. Also, remove operating-system or executive code that attempts to modify segment descriptors located in ROM. So its a valid approach to set the ACCESS bit when setting up the LDT entry and to map the table RO. Fixup the selftest so it can handle that new mode. Remove the manual ACCESS bit setter in set_tls_desc() as this is now pointless. Folded the patch from Peter Ziljstra. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 2 ++ arch/x86/kernel/ldt.c | 7 ++++++- arch/x86/kernel/tls.c | 11 ++--------- tools/testing/selftests/x86/ldt_gdt.c | 3 +-- 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index bc359dd2f7f6..85e23bb7b34e 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->type = (info->read_exec_only ^ 1) << 1; desc->type |= info->contents << 2; + /* Set the ACCESS bit so it can be mapped RO */ + desc->type |= 1; desc->s = 1; desc->dpl = 0x3; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 9629c5d8267a..579cc4a66fdf 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -158,7 +158,12 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) ptep = get_locked_pte(mm, va, &ptl); if (!ptep) return -ENOMEM; - pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)); + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { + if (LDT_empty(info) || LDT_zero(info)) memset(desc, 0, sizeof(*desc)); - } else { + else fill_ldt(desc, info); - - /* - * Always set the accessed bit so that the CPU - * doesn't try to write to the (read-only) GDT. - */ - desc->type |= 1; - } ++info; ++desc; } diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 0304ffb714f2..1aef72df20a1 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt, * NB: Different Linux versions do different things with the * accessed bit in set_thread_area(). */ - if (ar != expected_ar && - (ldt || ar != (expected_ar | AR_ACCESSED))) { + if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", (ldt ? "LDT" : "GDT"), index, ar, expected_ar); nerrs++; -- cgit v1.2.3 From 7ad1437d6ace0e450a6c1167720608ad660b191d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 19:45:31 +0100 Subject: perf/x86/intel: Plug memory leak in intel_pmu_init() A recent commit introduced an extra merge_attr() call in the skylake branch, which causes a memory leak. Store the pointer to the extra allocated memory and free it at the end of the function. Fixes: a5df70c354c2 ("perf/x86: Only show format attributes when supported") Reported-by: Tommi Rantala Signed-off-by: Thomas Gleixner Cc: Andi Kleen --- arch/x86/events/intel/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 09c26a4f139c..731153a4681e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = { __init int intel_pmu_init(void) { + struct attribute **extra_attr = NULL; + struct attribute **to_free = NULL; union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; @@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; - struct attribute **extra_attr = NULL; char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void) extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; extra_attr = merge_attr(extra_attr, skl_format_attr); + to_free = extra_attr; x86_pmu.cpu_events = get_hsw_events_attrs(); intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); @@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + kfree(to_free); return 0; } -- cgit v1.2.3 From 7ac139eaa6bbdb07c547b6916a808eab3897e0e3 Mon Sep 17 00:00:00 2001 From: rodrigosiqueira Date: Fri, 15 Dec 2017 11:15:33 -0200 Subject: x86: Remove unused parameter of prepare_switch_to Commit e37e43a497d5 ("x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)") added prepare_switch_to with one extra parameter which is not used by the function, remove it. Signed-off-by: Rodrigo Siqueira Signed-off-by: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: https://lkml.kernel.org/r/20171215131533.hp6kqebw45o7uvsb@smtp.gmail.com --- arch/x86/include/asm/switch_to.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8c6bd6863db9..1008d4622709 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); /* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *prev, - struct task_struct *next) +static inline void prepare_switch_to(struct task_struct *next) { #ifdef CONFIG_VMAP_STACK /* @@ -70,7 +69,7 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(prev, next); \ + prepare_switch_to(next); \ \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) -- cgit v1.2.3 From ac461122c88a10b7d775de2f56467f097c9e627a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 27 Dec 2017 11:48:50 -0800 Subject: x86-32: Fix kexec with stack canary (CONFIG_CC_STACKPROTECTOR) Commit e802a51ede91 ("x86/idt: Consolidate IDT invalidation") cleaned up and unified the IDT invalidation that existed in a couple of places. It changed no actual real code. Despite not changing any actual real code, it _did_ change code generation: by implementing the common idt_invalidate() function in archx86/kernel/idt.c, it made the use of the function in arch/x86/kernel/machine_kexec_32.c be a real function call rather than an (accidental) inlining of the function. That, in turn, exposed two issues: - in load_segments(), we had incorrectly reset all the segment registers, which then made the stack canary load (which gcc does using offset of %gs) cause a trap. Instead of %gs pointing to the stack canary, it will be the normal zero-based kernel segment, and the stack canary load will take a page fault at address 0x14. - to make this even harder to debug, we had invalidated the GDT just before calling idt_invalidate(), which meant that the fault happened with an invalid GDT, which in turn causes a triple fault and immediate reboot. Fix this by (a) not reloading the special segments in load_segments(). We currently don't do any percpu accesses (which would require %fs on x86-32) in this area, but there's no reason to think that we might not want to do them, and like %gs, it's pointless to break it. (b) doing idt_invalidate() before invalidating the GDT, to keep things at least _slightly_ more debuggable for a bit longer. Without a IDT, traps will not work. Without a GDT, traps also will not work, but neither will any segment loads etc. So in a very real sense, the GDT is even more core than the IDT. Fixes: e802a51ede91 ("x86/idt: Consolidate IDT invalidation") Reported-and-tested-by: Alexandru Chirvasitu Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Brian Gerst Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.LFD.2.21.1712271143180.8572@i7.lan --- arch/x86/kernel/machine_kexec_32.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, -- cgit v1.2.3 From 59585b4be9ae4dc6506551709bdcd6f5210b8a01 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 25 Dec 2017 03:43:53 +0100 Subject: sparc64: repair calling incorrect hweight function from stubs Commit v4.12-rc4-1-g9289ea7f952b introduced a mistake that made the 64-bit hweight stub call the 16-bit hweight function. Fixes: 9289ea7f952b ("sparc64: Use indirect calls in hamming weight stubs") Signed-off-by: Jan Engelhardt Signed-off-by: David S. Miller --- arch/sparc/lib/hweight.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S index e5547b22cd18..0ddbbb031822 100644 --- a/arch/sparc/lib/hweight.S +++ b/arch/sparc/lib/hweight.S @@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32) .previous ENTRY(__arch_hweight64) - sethi %hi(__sw_hweight16), %g1 - jmpl %g1 + %lo(__sw_hweight16), %g0 + sethi %hi(__sw_hweight64), %g1 + jmpl %g1 + %lo(__sw_hweight64), %g0 nop ENDPROC(__arch_hweight64) EXPORT_SYMBOL(__arch_hweight64) -- cgit v1.2.3 From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Dec 2017 18:11:04 +0100 Subject: kernel/irq: Extend lockdep class for request mutex The IRQ code already has support for lockdep class for the lock mutex in an interrupt descriptor. Extend this to add a second class for the request mutex in the descriptor. Not having a class is resulting in false positive splats in some code paths. Signed-off-by: Andrew Lunn Signed-off-by: Thomas Gleixner Acked-by: linus.walleij@linaro.org Cc: grygorii.strashko@ti.com Cc: f.fainelli@gmail.com Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch --- arch/powerpc/sysdev/fsl_msi.c | 4 +++- drivers/gpio/gpio-bcm-kona.c | 3 ++- drivers/gpio/gpio-brcmstb.c | 4 +++- drivers/gpio/gpio-tegra.c | 4 +++- drivers/gpio/gpiolib.c | 27 ++++++++++++++++--------- drivers/irqchip/irq-renesas-intc-irqpin.c | 6 +++++- drivers/mfd/arizona-irq.c | 4 +++- drivers/pinctrl/pinctrl-single.c | 5 ++++- include/linux/gpio/driver.h | 33 ++++++++++++++++++++----------- include/linux/irqdesc.h | 9 ++++++--- kernel/irq/generic-chip.c | 11 +++++++---- 11 files changed, 75 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 44cbf4c12ea1..df95102e732c 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev) } static struct lock_class_key fsl_msi_irq_class; +static struct lock_class_key fsl_msi_irq_request_class; static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, int offset, int irq_index) @@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev, dev_err(&dev->dev, "No memory for MSI cascade data\n"); return -ENOMEM; } - irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class); + irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class, + &fsl_msi_irq_request_class); cascade_data->index = offset; cascade_data->msi_data = msi; cascade_data->virq = virt_msir; diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index dfcf56ee3c61..76861a00bb92 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -522,6 +522,7 @@ static struct of_device_id const bcm_kona_gpio_of_match[] = { * category than their parents, so it won't report false recursion. */ static struct lock_class_key gpio_lock_class; +static struct lock_class_key gpio_request_class; static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) @@ -531,7 +532,7 @@ static int bcm_kona_gpio_irq_map(struct irq_domain *d, unsigned int irq, ret = irq_set_chip_data(irq, d->host_data); if (ret < 0) return ret; - irq_set_lockdep_class(irq, &gpio_lock_class); + irq_set_lockdep_class(irq, &gpio_lock_class, &gpio_request_class); irq_set_chip_and_handler(irq, &bcm_gpio_irq_chip, handle_simple_irq); irq_set_noprobe(irq); diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index 545d43a587b7..5b24801bffef 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -327,6 +327,7 @@ static struct brcmstb_gpio_bank *brcmstb_gpio_hwirq_to_bank( * category than their parents, so it won't report false recursion. */ static struct lock_class_key brcmstb_gpio_irq_lock_class; +static struct lock_class_key brcmstb_gpio_irq_request_class; static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, @@ -346,7 +347,8 @@ static int brcmstb_gpio_irq_map(struct irq_domain *d, unsigned int irq, ret = irq_set_chip_data(irq, &bank->gc); if (ret < 0) return ret; - irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class); + irq_set_lockdep_class(irq, &brcmstb_gpio_irq_lock_class, + &brcmstb_gpio_irq_lock_class); irq_set_chip_and_handler(irq, &priv->irq_chip, handle_level_irq); irq_set_noprobe(irq); return 0; diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c index 8db47f671708..02fa8fe2292a 100644 --- a/drivers/gpio/gpio-tegra.c +++ b/drivers/gpio/gpio-tegra.c @@ -565,6 +565,7 @@ static const struct dev_pm_ops tegra_gpio_pm_ops = { * than their parents, so it won't report false recursion. */ static struct lock_class_key gpio_lock_class; +static struct lock_class_key gpio_request_class; static int tegra_gpio_probe(struct platform_device *pdev) { @@ -670,7 +671,8 @@ static int tegra_gpio_probe(struct platform_device *pdev) bank = &tgi->bank_info[GPIO_BANK(gpio)]; - irq_set_lockdep_class(irq, &gpio_lock_class); + irq_set_lockdep_class(irq, &gpio_lock_class, + &gpio_request_class); irq_set_chip_data(irq, bank); irq_set_chip_and_handler(irq, &tgi->ic, handle_simple_irq); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index aad84a6306c4..44332b793718 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -73,7 +73,8 @@ LIST_HEAD(gpio_devices); static void gpiochip_free_hogs(struct gpio_chip *chip); static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gpiochip); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gpiochip); @@ -1100,7 +1101,8 @@ static void gpiochip_setup_devs(void) } int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { unsigned long flags; int status = 0; @@ -1246,7 +1248,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, if (status) goto err_remove_from_list; - status = gpiochip_add_irqchip(chip, key); + status = gpiochip_add_irqchip(chip, lock_key, request_key); if (status) goto err_remove_chip; @@ -1632,7 +1634,7 @@ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, * This lock class tells lockdep that GPIO irqs are in a different * category than their parents, so it won't report false recursion. */ - irq_set_lockdep_class(irq, chip->irq.lock_key); + irq_set_lockdep_class(irq, chip->irq.lock_key, chip->irq.request_key); irq_set_chip_and_handler(irq, chip->irq.chip, chip->irq.handler); /* Chips that use nested thread handlers have them marked */ if (chip->irq.threaded) @@ -1712,10 +1714,12 @@ static int gpiochip_to_irq(struct gpio_chip *chip, unsigned offset) /** * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip * @gpiochip: the GPIO chip to add the IRQ chip to - * @lock_key: lockdep class + * @lock_key: lockdep class for IRQ lock + * @request_key: lockdep class for IRQ request */ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *lock_key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { struct irq_chip *irqchip = gpiochip->irq.chip; const struct irq_domain_ops *ops; @@ -1753,6 +1757,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gpiochip, gpiochip->to_irq = gpiochip_to_irq; gpiochip->irq.default_type = type; gpiochip->irq.lock_key = lock_key; + gpiochip->irq.request_key = request_key; if (gpiochip->irq.domain_ops) ops = gpiochip->irq.domain_ops; @@ -1850,7 +1855,8 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip) * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE * to have the core avoid setting up any default type in the hardware. * @threaded: whether this irqchip uses a nested thread handler - * @lock_key: lockdep class + * @lock_key: lockdep class for IRQ lock + * @request_key: lockdep class for IRQ request * * This function closely associates a certain irqchip with a certain * gpiochip, providing an irq domain to translate the local IRQs to @@ -1872,7 +1878,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { struct device_node *of_node; @@ -1913,6 +1920,7 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, gpiochip->irq.default_type = type; gpiochip->to_irq = gpiochip_to_irq; gpiochip->irq.lock_key = lock_key; + gpiochip->irq.request_key = request_key; gpiochip->irq.domain = irq_domain_add_simple(of_node, gpiochip->ngpio, first_irq, &gpiochip_domain_ops, gpiochip); @@ -1940,7 +1948,8 @@ EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); #else /* CONFIG_GPIOLIB_IRQCHIP */ static inline int gpiochip_add_irqchip(struct gpio_chip *gpiochip, - struct lock_class_key *key) + struct lock_class_key *lock_key, + struct lock_class_key *request_key) { return 0; } diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c index 06f29cf5018a..cee59fe1321c 100644 --- a/drivers/irqchip/irq-renesas-intc-irqpin.c +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -342,6 +342,9 @@ static irqreturn_t intc_irqpin_shared_irq_handler(int irq, void *dev_id) */ static struct lock_class_key intc_irqpin_irq_lock_class; +/* And this is for the request mutex */ +static struct lock_class_key intc_irqpin_irq_request_class; + static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { @@ -352,7 +355,8 @@ static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq, intc_irqpin_dbg(&p->irq[hw], "map"); irq_set_chip_data(virq, h->host_data); - irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class); + irq_set_lockdep_class(virq, &intc_irqpin_irq_lock_class, + &intc_irqpin_irq_request_class); irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq); return 0; } diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 09cf3699e354..a307832d7e45 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -184,6 +184,7 @@ static struct irq_chip arizona_irq_chip = { }; static struct lock_class_key arizona_irq_lock_class; +static struct lock_class_key arizona_irq_request_class; static int arizona_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) @@ -191,7 +192,8 @@ static int arizona_irq_map(struct irq_domain *h, unsigned int virq, struct arizona *data = h->host_data; irq_set_chip_data(virq, data); - irq_set_lockdep_class(virq, &arizona_irq_lock_class); + irq_set_lockdep_class(virq, &arizona_irq_lock_class, + &arizona_irq_request_class); irq_set_chip_and_handler(virq, &arizona_irq_chip, handle_simple_irq); irq_set_nested_thread(virq, 1); irq_set_noprobe(virq); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index e6cd8de793e2..3501491e5bfc 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -222,6 +222,9 @@ static enum pin_config_param pcs_bias[] = { */ static struct lock_class_key pcs_lock_class; +/* Class for the IRQ request mutex */ +static struct lock_class_key pcs_request_class; + /* * REVISIT: Reads and writes could eventually use regmap or something * generic. But at least on omaps, some mux registers are performance @@ -1486,7 +1489,7 @@ static int pcs_irqdomain_map(struct irq_domain *d, unsigned int irq, irq_set_chip_data(irq, pcs_soc); irq_set_chip_and_handler(irq, &pcs->chip, handle_level_irq); - irq_set_lockdep_class(irq, &pcs_lock_class); + irq_set_lockdep_class(irq, &pcs_lock_class, &pcs_request_class); irq_set_noprobe(irq); return 0; diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 55e672592fa9..7258cd676df4 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -66,9 +66,10 @@ struct gpio_irq_chip { /** * @lock_key: * - * Per GPIO IRQ chip lockdep class. + * Per GPIO IRQ chip lockdep classes. */ struct lock_class_key *lock_key; + struct lock_class_key *request_key; /** * @parent_handler: @@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, /* add/remove chips */ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); /** * gpiochip_add_data() - register a gpio_chip @@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, */ #ifdef CONFIG_LOCKDEP #define gpiochip_add_data(chip, data) ({ \ - static struct lock_class_key key; \ - gpiochip_add_data_with_key(chip, data, &key); \ + static struct lock_class_key lock_key; \ + static struct lock_class_key request_key; \ + gpiochip_add_data_with_key(chip, data, &lock_key, \ + &request_key); \ }) #else -#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL) +#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL) #endif static inline int gpiochip_add(struct gpio_chip *chip) @@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); #ifdef CONFIG_LOCKDEP @@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, &key); + handler, type, false, + &lock_key, &request_key); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, &key); + handler, type, true, + &lock_key, &request_key); } #else static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, @@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, NULL); + handler, type, false, NULL, NULL); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, NULL); + handler, type, true, NULL, NULL); } #endif /* CONFIG_LOCKDEP */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 39fb3700f7a9..25b33b664537 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq) } static inline void -irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) +irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) - lockdep_set_class(&desc->lock, class); + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } } #ifdef CONFIG_IRQ_PREFLOW_FASTEOI diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); -- cgit v1.2.3 From 4fcab6693445cfb84f2b65868c58043535090e52 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 4 Dec 2017 12:03:12 +0800 Subject: x86/apic: Avoid wrong warning when parsing 'apic=' in X86-32 case There are two consumers of apic=: apic_set_verbosity() for setting the APIC debug level; parse_apic() for registering APIC driver by hand. X86-32 supports both of them, but sometimes, kernel issues a weird warning. eg: when kernel was booted up with 'apic=bigsmp' in command line, early_param would warn like that: ... [ 0.000000] APIC Verbosity level bigsmp not recognised use apic=verbose or apic=debug [ 0.000000] Malformed early option 'apic' ... Wrap the warning code in CONFIG_X86_64 case to avoid this. Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: rdunlap@infradead.org Cc: corbet@lwn.net Link: https://lkml.kernel.org/r/20171204040313.24824-1-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/apic/apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6e272f3ea984..880441f24146 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 else { pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } +#endif return 0; } -- cgit v1.2.3 From 7d7fb91cb43aebdcadca6a0fce25c3174feab980 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 15 Dec 2017 23:25:08 +0200 Subject: ACPI / x86: boot: Swap variables in condition in acpi_register_gsi_ioapic() For better readability compare input to something considered settled down. Additionally move it to one line (while it's slightly longer 80 characters it makes readability better). No functional change intended. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f4c463df8b08..4bad714d6227 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -676,8 +676,7 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, mutex_lock(&acpi_ioapic_lock); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); /* Don't set up the ACPI SCI because it's already set up */ - if (irq >= 0 && enable_update_mptable && - acpi_gbl_FADT.sci_interrupt != gsi) + if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt) mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif -- cgit v1.2.3 From 220580fb0d0a71cff0f17c24463c787f43dd6626 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 15 Dec 2017 23:25:09 +0200 Subject: ACPI / x86: boot: Get rid of ACPI_INVALID_GSI Commit 49e4b84333f3 (ACPI: Use correct IRQ when uninstalling ACPI interrupt handler) brings a new definition for invalid ACPI IRQ, i.e. INVALID_ACPI_IRQ, which is defined to 0xffffffff (or -1 for unsigned value). Get rid of a former one, which was brought in by commit 2c0a6894df19 (x86, ACPI, irq: Enhance error handling in function acpi_register_gsi()), in favour of latter. To clarify the rationale of changing from INT_MIN to ((unsigned)-1) definition consider the following: - IRQ 0 is valid one in hardware, so, better not to use it everywhere (Linux uses 0 as NO IRQ, though it's another story) - INT_MIN splits the range into two, while 0xffffffff reserves only the last item - when type casting is done in most cases 0xff, 0xffff is naturally used as a marker of invalid HW IRQ: for example PCI INT line 0xff means no IRQ assigned by BIOS Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4bad714d6227..56752b48e480 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -112,8 +112,6 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; -#define ACPI_INVALID_GSI INT_MIN - /* * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. @@ -372,7 +370,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, * and acpi_isa_irq_to_gsi() may give wrong result. */ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) - isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI; + isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ; isa_irq_to_gsi[bus_irq] = gsi; } @@ -637,7 +635,7 @@ EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { if (isa_irq < nr_legacy_irqs() && - isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) { + isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) { *gsi = isa_irq_to_gsi[isa_irq]; return 0; } -- cgit v1.2.3 From 4565c4f6056967ee8844fa550e3cbbe1c0e65a11 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 15 Dec 2017 23:25:10 +0200 Subject: ACPI / x86: boot: Use INVALID_ACPI_IRQ instead of 0 for acpi_sci_override_gsi 0 is valid hardware interrupt which might be in some cases overridden. Due to this, switch to INVALID_ACPI_IRQ to mark SCI override not set. While here, change the type of the variable from int to u32 to match the GSI type used in the rest of the code. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/kernel/acpi/boot.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 8d0ec9df1cbe..44f5d79d5105 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -49,7 +49,7 @@ extern int acpi_fix_pin2_polarity; extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; -extern int acpi_sci_override_gsi; +extern u32 acpi_sci_override_gsi; void acpi_pic_sci_set_trigger(unsigned int, u16); struct device; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 56752b48e480..5a12cadbf019 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -68,8 +68,9 @@ int acpi_ioapic; int acpi_strict; int acpi_disable_cmcff; +/* ACPI SCI override configuration */ u8 acpi_sci_flags __initdata; -int acpi_sci_override_gsi __initdata; +u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ; int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; int acpi_fix_pin2_polarity __initdata; @@ -1209,7 +1210,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If BIOS did not supply an INT_SRC_OVR for the SCI * pretend we got one so we can set the SCI flags. */ - if (!acpi_sci_override_gsi) + if (acpi_sci_override_gsi == INVALID_ACPI_IRQ) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, acpi_gbl_FADT.sci_interrupt); -- cgit v1.2.3 From 7c7bcfeae2d8e59066bd273b7d70392574e14c15 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 15 Dec 2017 23:25:11 +0200 Subject: ACPI / x86: boot: Don't setup SCI on HW-reduced platforms As per note in 5.2.9 Fixed ACPI Description Table (FADT) chapter of ACPI specification, on HW-reduced platforma OSPM should ignore fields related to the ACPI HW register interface, one of which is SCI_INT. Follow the spec and ignore any configuration done for interrupt line defined by SCI_INT if FADT specifies a HW-reduced platform. HW-reduced platforms will still be able to use SCI in case it provides an override record in MADT table. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5a12cadbf019..4bf004bab4b2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1209,8 +1209,9 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* * If BIOS did not supply an INT_SRC_OVR for the SCI * pretend we got one so we can set the SCI flags. + * But ignore setting up SCI on hardware reduced platforms. */ - if (acpi_sci_override_gsi == INVALID_ACPI_IRQ) + if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, acpi_gbl_FADT.sci_interrupt); -- cgit v1.2.3 From a31e58e129f73ab5b04016330b13ed51fde7a961 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Dec 2017 11:33:33 +0100 Subject: x86/apic: Switch all APICs to Fixed delivery mode Some of the APIC incarnations are operating in lowest priority delivery mode. This worked as long as the vector management code allocated the same vector on all possible CPUs for each interrupt. Lowest priority delivery mode does not necessarily respect the affinity setting and may redirect to some other online CPU. This was documented somewhere in the old code and the conversion to single target delivery missed to update the delivery mode of the affected APIC drivers which results in spurious interrupts on some of the affected CPU/Chipset combinations. Switch the APIC drivers over to Fixed delivery mode and remove all leftovers of lowest priority delivery mode. Switching to Fixed delivery mode is not a problem on these CPUs because the kernel already uses Fixed delivery mode for IPIs. The reason for this is that th SDM explicitely forbids lowest prio mode for IPIs. The reason is obvious: If the irq routing does not honor destination targets in lowest prio mode then an IPI targeted at CPU1 might end up on CPU0, which would be a fatal problem in many cases. As a consequence of this change, the apic::irq_delivery_mode field is now pointless, but this needs to be cleaned up in a separate patch. Fixes: fdba46ffb4c2 ("x86/apic: Get rid of multi CPU affinity") Reported-by: vcaputo@pengaru.com Signed-off-by: Thomas Gleixner Tested-by: vcaputo@pengaru.com Cc: Pavel Machek Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712281140440.1688@nanos --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/msi.c | 8 ++------ arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- drivers/pci/host/pci-hyperv.c | 8 ++------ 6 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index aa85690e9b64..25a87028cb3f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 7b659c4480c9..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fa22017de806..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 622f13ca8a94..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c index 0fe3ea164ee5..e7d94473aedd 100644 --- a/drivers/pci/host/pci-hyperv.c +++ b/drivers/pci/host/pci-hyperv.c @@ -985,9 +985,7 @@ static u32 hv_compose_msi_req_v1( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = - (apic->irq_delivery_mode == dest_LowestPrio) ? - dest_LowestPrio : dest_Fixed; + int_pkt->int_desc.delivery_mode = dest_Fixed; /* * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in @@ -1008,9 +1006,7 @@ static u32 hv_compose_msi_req_v2( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = - (apic->irq_delivery_mode == dest_LowestPrio) ? - dest_LowestPrio : dest_Fixed; + int_pkt->int_desc.delivery_mode = dest_Fixed; /* * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten -- cgit v1.2.3 From 945f50a591783ac6e9bd59694f34d1ba03b778a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:57:00 +0100 Subject: x86/vector: Use IRQD_CAN_RESERVE flag Set the new CAN_RESERVE flag when the initial reservation for an interrupt happens. The flag is used in a subsequent patch to disable reservation mode for a certain class of MSI devices. Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- arch/x86/kernel/apic/vector.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 750449152b04..1e969dba0476 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd) irq_matrix_reserve(vector_matrix); apicd->can_reserve = true; apicd->has_reserved = true; + irqd_set_can_reserve(irqd); trace_vector_reserve(irqd->irq, 0); vector_assign_managed_shutdown(irqd); } @@ -478,6 +479,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, } else { /* Release the vector */ apicd->can_reserve = true; + irqd_set_can_reserve(irqd); clear_irq_vector(irqd); realloc = true; } -- cgit v1.2.3 From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:59:06 +0100 Subject: genirq/irqdomain: Rename early argument of irq_domain_activate_irq() The 'early' argument of irq_domain_activate_irq() is actually used to denote reservation mode. To avoid confusion, rename it before abuse happens. No functional change. Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature") Signed-off-by: Thomas Gleixner Cc: Alexandru Chirvasitu Cc: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- arch/x86/include/asm/irqdomain.h | 2 +- arch/x86/include/asm/trace/irq_vectors.h | 16 ++++++++-------- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/vector.c | 6 +++--- arch/x86/platform/uv/uv_irq.c | 2 +- drivers/gpio/gpio-xgene-sb.c | 2 +- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel_irq_remapping.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 ++-- drivers/pinctrl/stm32/pinctrl-stm32.c | 2 +- include/linux/irqdomain.h | 2 +- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 13 +++++++------ 13 files changed, 29 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 139feef467f7..c066ffae222b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); extern int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early); + struct irq_data *irq_data, bool reserve); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 84b9ec0c1bc0..22647a642e98 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed, DECLARE_EVENT_CLASS(vector_activate, TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, - bool early), + bool reserve), - TP_ARGS(irq, is_managed, can_reserve, early), + TP_ARGS(irq, is_managed, can_reserve, reserve), TP_STRUCT__entry( __field( unsigned int, irq ) __field( bool, is_managed ) __field( bool, can_reserve ) - __field( bool, early ) + __field( bool, reserve ) ), TP_fast_assign( __entry->irq = irq; __entry->is_managed = is_managed; __entry->can_reserve = can_reserve; - __entry->early = early; + __entry->reserve = reserve; ), - TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d", + TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d", __entry->irq, __entry->is_managed, __entry->can_reserve, - __entry->early) + __entry->reserve) ); #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \ DEFINE_EVENT_FN(vector_activate, name, \ TP_PROTO(unsigned int irq, bool is_managed, \ - bool can_reserve, bool early), \ - TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL); \ + bool can_reserve, bool reserve), \ + TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \ DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate); DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 201579dc5242..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, } int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { unsigned long flags; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1e969dba0476..52c85c8147e9 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -399,21 +399,21 @@ static int activate_managed(struct irq_data *irqd) } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, - bool early) + bool reserve) { struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; int ret = 0; trace_vector_activate(irqd->irq, apicd->is_managed, - apicd->can_reserve, early); + apicd->can_reserve, reserve); /* Nothing to do for fixed assigned vectors */ if (!apicd->can_reserve && !apicd->is_managed) return 0; raw_spin_lock_irqsave(&vector_lock, flags); - if (early || irqd_is_managed_and_shutdown(irqd)) + if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 5f6fd860820a..e4cb9f4cde8a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -128,7 +128,7 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * on the specified blade to allow the sending of MSIs to the specified CPU. */ static int uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); return 0; diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c index 2313af82fad3..acd59113e08b 100644 --- a/drivers/gpio/gpio-xgene-sb.c +++ b/drivers/gpio/gpio-xgene-sb.c @@ -139,7 +139,7 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio) static int xgene_gpio_sb_domain_activate(struct irq_domain *d, struct irq_data *irq_data, - bool early) + bool reserve) { struct xgene_gpio_sb *priv = d->host_data; u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7d5eb004091d..97baf88d9505 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4184,7 +4184,7 @@ static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu, struct irq_cfg *cfg); static int irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 76a193c7fcfc..66f69af2c219 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -1397,7 +1397,7 @@ static void intel_irq_remapping_free(struct irq_domain *domain, } static int intel_irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { intel_ir_reconfigure_irte(irq_data, true); return 0; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 4039e64cd342..06f025fd5726 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2303,7 +2303,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, } static int its_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d, bool early) + struct irq_data *d, bool reserve) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); u32 event = its_get_event_id(d); @@ -2818,7 +2818,7 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq } static int its_vpe_irq_domain_activate(struct irq_domain *domain, - struct irq_data *d, bool early) + struct irq_data *d, bool reserve) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); struct its_node *its; diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index a276c61be217..e62ab087bfd8 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -290,7 +290,7 @@ static int stm32_gpio_domain_translate(struct irq_domain *d, } static int stm32_gpio_domain_activate(struct irq_domain *d, - struct irq_data *irq_data, bool early) + struct irq_data *irq_data, bool reserve) { struct stm32_gpio_bank *bank = d->host_data; struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a34355d19546..48c7e86bb556 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -113,7 +113,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ab19371eab9b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..62068ad46930 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; -- cgit v1.2.3 From bc976233a872c0f20f018fb1e89264a541584e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 10:47:22 +0100 Subject: genirq/msi, x86/vector: Prevent reservation mode for non maskable MSI The new reservation mode for interrupts assigns a dummy vector when the interrupt is allocated and assigns a real vector when the interrupt is requested. The reservation mode prevents vector pressure when devices with a large amount of queues/interrupts are initialized, but only a minimal subset of those queues/interrupts is actually used. This mode has an issue with MSI interrupts which cannot be masked. If the driver is not careful or the hardware emits an interrupt before the device irq is requestd by the driver then the interrupt ends up on the dummy vector as a spurious interrupt which can cause malfunction of the device or in the worst case a lockup of the machine. Change the logic for the reservation mode so that the early activation of MSI interrupts checks whether: - the device is a PCI/MSI device - the reservation mode of the underlying irqdomain is activated - PCI/MSI masking is globally enabled - the PCI/MSI device uses either MSI-X, which supports masking, or MSI with the maskbit supported. If one of those conditions is false, then clear the reservation mode flag in the irq data of the interrupt and invoke irq_domain_activate_irq() with the reserve argument cleared. In the x86 vector code, clear the can_reserve flag in the vector allocation data so a subsequent free_irq() won't create the same situation again. The interrupt stays assigned to a real vector until pci_disable_msi() is invoked and all allocations are undone. Fixes: 4900be83602b ("x86/vector/msi: Switch to global reservation mode") Reported-by: Alexandru Chirvasitu Reported-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291406420.1899@nanos Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712291409460.1899@nanos --- arch/x86/kernel/apic/vector.c | 12 +++++++++++- kernel/irq/msi.c | 37 +++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 52c85c8147e9..f8b03bb8e725 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -369,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd) int ret; ret = assign_irq_vector_any_locked(irqd); - if (!ret) + if (!ret) { apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; + } return ret; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9ba954331171..2f3c4f5382cc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,11 +339,38 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } -static bool msi_check_reservation_mode(struct msi_domain_info *info) +/* + * Carefully check whether the device can use reservation mode. If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and render the device unusable. If the entry can be masked then the core + * logic will prevent the spurious interrupt and reservation mode can be + * used. For now reservation mode is restricted to PCI/MSI. + */ +static bool msi_check_reservation_mode(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) { + struct msi_desc *desc; + + if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + return false; + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - return true; + + if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + return false; + + /* + * Checking the first MSI descriptor is sufficient. MSIX supports + * masking and MSI does so when the maskbit is set. + */ + desc = first_msi_entry(dev); + return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; } /** @@ -394,7 +421,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); - can_reserve = msi_check_reservation_mode(info); + can_reserve = msi_check_reservation_mode(domain, info, dev); for_each_msi_entry(desc, dev) { virq = desc->irq; @@ -412,7 +439,9 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, continue; irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); + if (!can_reserve) + irqd_clr_can_reserve(irq_data); + ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) goto cleanup; } -- cgit v1.2.3 From 322f8b8b340c824aef891342b0f5795d15e11562 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 30 Dec 2017 22:13:53 +0100 Subject: x86/smpboot: Remove stale TLB flush invocations smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector() invoke local_flush_tlb() for no obvious reason. Digging in history revealed that the original code in the 2.1 era added those because the code manipulated a swapper_pg_dir pagetable entry. The pagetable manipulation was removed long ago in the 2.3 timeframe, but the TLB flush invocations stayed around forever. Remove them along with the pointless pr_debug()s which come from the same 2.1 change. Reported-by: Dominik Brodowski Signed-off-by: Thomas Gleixner Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33d6000265aa..c3402fc30865 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) { unsigned long flags; - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - /* * Paranoid: Set warm reset code and vector here back * to default values. -- cgit v1.2.3 From decab0888e6e14e11d53cefa85f8b3d3b45ce73c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 30 Dec 2017 22:13:54 +0100 Subject: x86/mm: Remove preempt_disable/enable() from __native_flush_tlb() The preempt_disable/enable() pair in __native_flush_tlb() was added in commit: 5cf0791da5c1 ("x86/mm: Disable preemption during CR3 read+write") ... to protect the UP variant of flush_tlb_mm_range(). That preempt_disable/enable() pair should have been added to the UP variant of flush_tlb_mm_range() instead. The UP variant was removed with commit: ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code") ... but the preempt_disable/enable() pair stayed around. The latest change to __native_flush_tlb() in commit: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") ... added an access to a per CPU variable outside the preempt disabled regions, which makes no sense at all. __native_flush_tlb() must always be called with at least preemption disabled. Remove the preempt_disable/enable() pair and add a WARN_ON_ONCE() to catch bad callers independent of the smp_processor_id() debugging. Signed-off-by: Thomas Gleixner Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dominik Brodowski Cc: Linus Torvalds Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171230211829.679325424@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b519da4fc03c..f9b48ce152eb 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -345,15 +345,17 @@ static inline void invalidate_user_asid(u16 asid) */ static inline void __native_flush_tlb(void) { - invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* - * If current->mm == NULL then we borrow a mm which may change - * during a task switch and therefore we must not be preempted - * while we write CR3 back: + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). */ - preempt_disable(); + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ native_write_cr3(__native_read_cr3()); - preempt_enable(); } /* -- cgit v1.2.3 From a62d69857aab4caa43049e72fe0ed5c4a60518dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 31 Dec 2017 11:24:34 +0100 Subject: x86/ldt: Plug memory leak in error path The error path in write_ldt() tries to free 'old_ldt' instead of the newly allocated 'new_ldt', resulting in a memory leak. It also misses to clean up a half populated LDT pagetable, which is not a leak as it gets cleaned up when the process exits. Free both the potentially half populated LDT pagetable and the newly allocated LDT struct. This can be done unconditionally because once an LDT is mapped subsequent maps will succeed, because the PTE page is already populated and the two LDTs fit into that single page. Reported-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Dominik Brodowski Cc: Linus Torvalds Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1712311121340.1899@nanos Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 579cc4a66fdf..500e90e44f86 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -421,7 +421,13 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) */ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); if (error) { - free_ldt_struct(old_ldt); + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); goto out_unlock; } -- cgit v1.2.3 From 7f414195b0c3612acd12b4611a5fe75995cf10c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 31 Dec 2017 16:52:15 +0100 Subject: x86/ldt: Make LDT pgtable free conditional Andy prefers to be paranoid about the pagetable free in the error path of write_ldt(). Make it conditional and warn whenever the installment of a secondary LDT fails. Requested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ldt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 500e90e44f86..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -426,7 +426,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) * already installed then the PTE page is already * populated. Mop up a half populated page table. */ - free_ldt_pgtables(mm); + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } -- cgit v1.2.3 From ecb101aed86156ec7cd71e5dca668e09146e6994 Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Sun, 31 Dec 2017 21:24:58 -0800 Subject: powerpc/mm: Fix SEGV on mapped region to return SEGV_ACCERR The recent refactoring of the powerpc page fault handler in commit c3350602e876 ("powerpc/mm: Make bad_area* helper functions") caused access to protected memory regions to indicate SEGV_MAPERR instead of the traditional SEGV_ACCERR in the si_code field of a user-space signal handler. This can confuse debug libraries that temporarily change the protection of memory regions, and expect to use SEGV_ACCERR as an indication to restore access to a region. This commit restores the previous behavior. The following program exhibits the issue: $ ./repro read || echo "FAILED" $ ./repro write || echo "FAILED" $ ./repro exec || echo "FAILED" #include #include #include #include #include #include #include static void segv_handler(int n, siginfo_t *info, void *arg) { _exit(info->si_code == SEGV_ACCERR ? 0 : 1); } int main(int argc, char **argv) { void *p = NULL; struct sigaction act = { .sa_sigaction = segv_handler, .sa_flags = SA_SIGINFO, }; assert(argc == 2); p = mmap(NULL, getpagesize(), (strcmp(argv[1], "write") == 0) ? PROT_READ : 0, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); assert(p != MAP_FAILED); assert(sigaction(SIGSEGV, &act, NULL) == 0); if (strcmp(argv[1], "read") == 0) printf("%c", *(unsigned char *)p); else if (strcmp(argv[1], "write") == 0) *(unsigned char *)p = 0; else if (strcmp(argv[1], "exec") == 0) ((void (*)(void))p)(); return 1; /* failed to generate SEGV */ } Fixes: c3350602e876 ("powerpc/mm: Make bad_area* helper functions") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: John Sperbeck Acked-by: Benjamin Herrenschmidt [mpe: Add commit references in change log] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4797d08581ce..6e1e39035380 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address) return __bad_area(regs, address, SEGV_MAPERR); } +static noinline int bad_access(struct pt_regs *regs, unsigned long address) +{ + return __bad_area(regs, address, SEGV_ACCERR); +} + static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) { @@ -490,7 +495,7 @@ retry: good_area: if (unlikely(access_error(is_write, is_exec, vma))) - return bad_area(regs, address); + return bad_access(regs, address); /* * If for any reason at all we couldn't handle the fault, -- cgit v1.2.3 From 32bb954dbf6db98562cb4477608dc546421caaf6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 2 Jan 2018 12:00:22 +0100 Subject: xtensa: shut up gcc-8 warnings Many uses of strncpy() on xtensa causes a warning like arch/xtensa/include/asm/string.h:56:42: warning: array subscript is above array bounds [-Warray-bounds] : "0" (__dest), "1" (__src), "r" (__src+__n) This avoids the warning by turning the pointer arithmetic into an integer operation that does not get checked the same way. Signed-off-by: Arnd Bergmann Signed-off-by: Max Filippov --- arch/xtensa/include/asm/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/string.h b/arch/xtensa/include/asm/string.h index 586bad9fe187..89b51a0c752f 100644 --- a/arch/xtensa/include/asm/string.h +++ b/arch/xtensa/include/asm/string.h @@ -53,7 +53,7 @@ static inline char *strncpy(char *__dest, const char *__src, size_t __n) "bne %1, %5, 1b\n" "2:" : "=r" (__dest), "=r" (__src), "=&r" (__dummy) - : "0" (__dest), "1" (__src), "r" (__src+__n) + : "0" (__dest), "1" (__src), "r" ((uintptr_t)__src+__n) : "memory"); return __xdest; @@ -101,7 +101,7 @@ static inline int strncmp(const char *__cs, const char *__ct, size_t __n) "2:\n\t" "sub %2, %2, %3" : "=r" (__cs), "=r" (__ct), "=&r" (__res), "=&r" (__dummy) - : "0" (__cs), "1" (__ct), "r" (__cs+__n)); + : "0" (__cs), "1" (__ct), "r" ((uintptr_t)__cs+__n)); return __res; } -- cgit v1.2.3 From 0ae60d0c4f191c4241377cc3fc5931dc90ca3bbd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 2 Jan 2018 20:40:21 +0100 Subject: parisc: Show unhashed hardware inventory Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Helge Deller --- arch/parisc/kernel/drivers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index d8f77358e2ba..29b99b8964aa 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -870,7 +870,7 @@ static void print_parisc_device(struct parisc_device *dev) static int count; print_pa_hwpath(dev, hw_path); - printk(KERN_INFO "%d. %s at 0x%p [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", + printk(KERN_INFO "%d. %s at 0x%px [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", ++count, dev->name, (void*) dev->hpa.start, hw_path, dev->id.hw_type, dev->id.hversion_rev, dev->id.hversion, dev->id.sversion); -- cgit v1.2.3 From 63b2c373137b16d948b08cffacc6abfcf4cffea6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 2 Jan 2018 20:42:59 +0100 Subject: parisc: Show initial kernel memory layout unhashed Fixes: ad67b74d2469d9b8 ("printk: hash addresses printed with %p") Signed-off-by: Helge Deller --- arch/parisc/mm/init.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 13f7854e0d49..48f41399fc0b 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -631,11 +631,11 @@ void __init mem_init(void) mem_init_print_info(NULL); #ifdef CONFIG_DEBUG_KERNEL /* double-sanity-check paranoia */ printk("virtual kernel memory layout:\n" - " vmalloc : 0x%p - 0x%p (%4ld MB)\n" - " memory : 0x%p - 0x%p (%4ld MB)\n" - " .init : 0x%p - 0x%p (%4ld kB)\n" - " .data : 0x%p - 0x%p (%4ld kB)\n" - " .text : 0x%p - 0x%p (%4ld kB)\n", + " vmalloc : 0x%px - 0x%px (%4ld MB)\n" + " memory : 0x%px - 0x%px (%4ld MB)\n" + " .init : 0x%px - 0x%px (%4ld kB)\n" + " .data : 0x%px - 0x%px (%4ld kB)\n" + " .text : 0x%px - 0x%px (%4ld kB)\n", (void*)VMALLOC_START, (void*)VMALLOC_END, (VMALLOC_END - VMALLOC_START) >> 20, -- cgit v1.2.3 From 88776c0e70be0290f8357019d844aae15edaa967 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 2 Jan 2018 20:36:44 +0100 Subject: parisc: Fix alignment of pa_tlb_lock in assembly on 32-bit SMP kernel Qemu for PARISC reported on a 32bit SMP parisc kernel strange failures about "Not-handled unaligned insn 0x0e8011d6 and 0x0c2011c9." Those opcodes evaluate to the ldcw() assembly instruction which requires (on 32bit) an alignment of 16 bytes to ensure atomicity. As it turns out, qemu is correct and in our assembly code in entry.S and pacache.S we don't pay attention to the required alignment. This patch fixes the problem by aligning the lock offset in assembly code in the same manner as we do in our C-code. Signed-off-by: Helge Deller Cc: # v4.0+ --- arch/parisc/include/asm/ldcw.h | 2 ++ arch/parisc/kernel/entry.S | 13 +++++++++++-- arch/parisc/kernel/pacache.S | 9 +++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index dd5a08aaa4da..3eb4bfc1fb36 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -12,6 +12,7 @@ for the semaphore. */ #define __PA_LDCW_ALIGNMENT 16 +#define __PA_LDCW_ALIGN_ORDER 4 #define __ldcw_align(a) ({ \ unsigned long __ret = (unsigned long) &(a)->lock[0]; \ __ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \ @@ -29,6 +30,7 @@ ldcd). */ #define __PA_LDCW_ALIGNMENT 4 +#define __PA_LDCW_ALIGN_ORDER 2 #define __ldcw_align(a) (&(a)->slock) #define __LDCW "ldcw,co" diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index f3cecf5117cf..e95207c0565e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,14 @@ #endif .import pa_tlb_lock,data + .macro load_pa_tlb_lock reg +#if __PA_LDCW_ALIGNMENT > 4 + load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg + depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg +#else + load32 PA(pa_tlb_lock), \reg +#endif + .endm /* space_to_prot macro creates a prot id from a space id */ @@ -457,7 +466,7 @@ .macro tlb_lock spc,ptp,pte,tmp,tmp1,fault #ifdef CONFIG_SMP cmpib,COND(=),n 0,\spc,2f - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp 1: LDCW 0(\tmp),\tmp1 cmpib,COND(=) 0,\tmp1,1b nop @@ -480,7 +489,7 @@ /* Release pa_tlb_lock lock. */ .macro tlb_unlock1 spc,tmp #ifdef CONFIG_SMP - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp tlb_unlock0 \spc,\tmp #endif .endm diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index adf7187f8951..2d40c4ff3f69 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -36,6 +36,7 @@ #include #include #include +#include #include .text @@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local) .macro tlb_lock la,flags,tmp #ifdef CONFIG_SMP - ldil L%pa_tlb_lock,%r1 - ldo R%pa_tlb_lock(%r1),\la +#if __PA_LDCW_ALIGNMENT > 4 + load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la + depi 0,31,__PA_LDCW_ALIGN_ORDER, \la +#else + load32 pa_tlb_lock, \la +#endif rsm PSW_SM_I,\flags 1: LDCW 0(\la),\tmp cmpib,<>,n 0,\tmp,3f -- cgit v1.2.3 From af1be2e21203867cb958aaceed5366e2e24b88e8 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 8 Dec 2017 08:45:57 -0800 Subject: ARC: handle gcc generated __builtin_trap for older compiler ARC gcc prior to GNU 2018.03 release didn't have a target specific __builtin_trap() implementation, generating default abort() call. Implement the abort() call - emulating what newer gcc does for the same, as suggested by Arnd. Acked-by: Arnd Bergmann Signed-off-by: Vineet Gupta --- arch/arc/kernel/traps.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index 004f4e4a4c10..133a4dae41fe 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -161,3 +161,11 @@ void do_insterror_or_kprobe(unsigned long address, struct pt_regs *regs) insterror_is_error(address, regs); } + +/* + * abort() call generated by older gcc for __builtin_trap() + */ +void abort(void) +{ + __asm__ __volatile__("trap_s 5\n"); +} -- cgit v1.2.3 From 835bcec5fdf3f9e880111b482177e7e70e3596da Mon Sep 17 00:00:00 2001 From: Dave Young Date: Tue, 2 Jan 2018 17:21:09 +0000 Subject: x86/efi: Fix kernel param add_efi_memmap regression 'add_efi_memmap' is an early param, but do_add_efi_memmap() has no chance to run because the code path is before parse_early_param(). I believe it worked when the param was introduced but probably later some other changes caused the wrong order and nobody noticed it. Move efi_memblock_x86_reserve_range() after parse_early_param() to fix it. Signed-off-by: Dave Young Signed-off-by: Matt Fleming Signed-off-by: Ard Biesheuvel Cc: Bryan O'Donoghue Cc: Ge Song Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180102172110.17018-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8af2e8d0c0a1..145810b0edf6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -906,9 +906,6 @@ void __init setup_arch(char **cmdline_p) set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } - - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -962,6 +959,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux -- cgit v1.2.3 From f24c4d478013d82bd1b943df566fff3561d52864 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jan 2018 17:21:10 +0000 Subject: efi/capsule-loader: Reinstate virtual capsule mapping Commit: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header") ... refactored the capsule loading code that maps the capsule header, to avoid having to map it several times. However, as it turns out, the vmap() call we ended up removing did not just map the header, but the entire capsule image, and dropping this virtual mapping breaks capsules that are processed by the firmware immediately (i.e., without a reboot). Unfortunately, that change was part of a larger refactor that allowed a quirk to be implemented for Quark, which has a non-standard memory layout for capsules, and we have slightly painted ourselves into a corner by allowing quirk code to mangle the capsule header and memory layout. So we need to fix this without breaking Quark. Fortunately, Quark does not appear to care about the virtual mapping, and so we can simply do a partial revert of commit: 2a457fb31df6 ("efi/capsule-loader: Use page addresses rather than struct page pointers") ... and create a vmap() mapping of the entire capsule (including header) based on the reinstated struct page array, unless running on Quark, in which case we pass the capsule header copy as before. Reported-by: Ge Song Tested-by: Bryan O'Donoghue Tested-by: Ge Song Signed-off-by: Ard Biesheuvel Cc: Cc: Dave Young Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 82c3768b8d68 ("efi/capsule-loader: Use a cached copy of the capsule header") Link: http://lkml.kernel.org/r/20180102172110.17018-3-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/quirks.c | 13 +++++++++- drivers/firmware/efi/capsule-loader.c | 45 ++++++++++++++++++++++++++++------- include/linux/efi.h | 4 +++- 3 files changed, 52 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 8a99a2e96537..5b513ccffde4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, /* * Update the first page pointer to skip over the CSH header. */ - cap_info->pages[0] += csh->headersize; + cap_info->phys[0] += csh->headersize; + + /* + * cap_info->capsule should point at a virtual mapping of the entire + * capsule, starting at the capsule header. Our image has the Quark + * security header prepended, so we cannot rely on the default vmap() + * mapping created by the generic capsule code. + * Given that the Quark firmware does not appear to care about the + * virtual mapping, let's just point cap_info->capsule at our copy + * of the capsule header. + */ + cap_info->capsule = &cap_info->header; return 1; } diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index ec8ac5c4dd84..055e2e8f985a 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -20,10 +20,6 @@ #define NO_FURTHER_WRITE_ACTION -1 -#ifndef phys_to_page -#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT) -#endif - /** * efi_free_all_buff_pages - free all previous allocated buffer pages * @cap_info: pointer to current instance of capsule_info structure @@ -35,7 +31,7 @@ static void efi_free_all_buff_pages(struct capsule_info *cap_info) { while (cap_info->index > 0) - __free_page(phys_to_page(cap_info->pages[--cap_info->index])); + __free_page(cap_info->pages[--cap_info->index]); cap_info->index = NO_FURTHER_WRITE_ACTION; } @@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info) cap_info->pages = temp_page; + temp_page = krealloc(cap_info->phys, + pages_needed * sizeof(phys_addr_t *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) + return -ENOMEM; + + cap_info->phys = temp_page; + return 0; } @@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, **/ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) { + bool do_vunmap = false; int ret; - ret = efi_capsule_update(&cap_info->header, cap_info->pages); + /* + * cap_info->capsule may have been assigned already by a quirk + * handler, so only overwrite it if it is NULL + */ + if (!cap_info->capsule) { + cap_info->capsule = vmap(cap_info->pages, cap_info->index, + VM_MAP, PAGE_KERNEL); + if (!cap_info->capsule) + return -ENOMEM; + do_vunmap = true; + } + + ret = efi_capsule_update(cap_info->capsule, cap_info->phys); + if (do_vunmap) + vunmap(cap_info->capsule); if (ret) { pr_err("capsule update failed\n"); return ret; @@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, goto failed; } - cap_info->pages[cap_info->index++] = page_to_phys(page); + cap_info->pages[cap_info->index] = page; + cap_info->phys[cap_info->index] = page_to_phys(page); cap_info->page_bytes_remain = PAGE_SIZE; + cap_info->index++; } else { - page = phys_to_page(cap_info->pages[cap_info->index - 1]); + page = cap_info->pages[cap_info->index - 1]; } kbuff = kmap(page); @@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file) struct capsule_info *cap_info = file->private_data; kfree(cap_info->pages); + kfree(cap_info->phys); kfree(file->private_data); file->private_data = NULL; return 0; @@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file) return -ENOMEM; } + cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->phys) { + kfree(cap_info->pages); + kfree(cap_info); + return -ENOMEM; + } + file->private_data = cap_info; return 0; diff --git a/include/linux/efi.h b/include/linux/efi.h index d813f7b04da7..29fdf8029cf6 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -140,11 +140,13 @@ struct efi_boot_memmap { struct capsule_info { efi_capsule_header_t header; + efi_capsule_header_t *capsule; int reset_type; long index; size_t count; size_t total_size; - phys_addr_t *pages; + struct page **pages; + phys_addr_t *phys; size_t page_bytes_remain; }; -- cgit v1.2.3 From 1e9de1d2207d67b97bb0b62e38454b663d6542fa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 2 Jan 2018 18:10:39 +0000 Subject: arm64/efi: Ignore EFI_MEMORY_XP attribute if RP and/or WP are set The UEFI memory map is a bit vague about how to interpret the EFI_MEMORY_XP attribute when it is combined with EFI_MEMORY_RP and/or EFI_MEMORY_WP, which have retroactively been redefined as cacheability attributes rather than permission attributes. So let's ignore EFI_MEMORY_XP if _RP and/or _WP are also set. In this case, it is likely that they are being used to describe the capability of the region (i.e., whether it has the controls to reconfigure it as non-executable) rather than the nature of the contents of the region (i.e., whether it contains data that we will never attempt to execute) Reported-by: Stephen Boyd Tested-by: Stephen Boyd Signed-off-by: Ard Biesheuvel Cc: Arvind Yadav Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tyler Baicar Cc: Vasyl Gomonovych Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180102181042.19074-3-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/efi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 82cd07592519..f85ac58d08a3 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -48,7 +48,9 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) return pgprot_val(PAGE_KERNEL_ROX); /* RW- */ - if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + if (((attr & (EFI_MEMORY_RP | EFI_MEMORY_WP | EFI_MEMORY_XP)) == + EFI_MEMORY_XP) || + type != EFI_RUNTIME_SERVICES_CODE) return pgprot_val(PAGE_KERNEL); /* RWX */ -- cgit v1.2.3 From 694d99d40972f12e59a3696effee8a376b79d7c8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Dec 2017 23:43:54 -0600 Subject: x86/cpu, x86/pti: Do not enable PTI on AMD processors AMD processors are not subject to the types of attacks that the kernel page table isolation feature protects against. The AMD microarchitecture does not allow memory references, including speculative references, that access higher privileged data when running in a lesser privileged mode when that access would result in a page fault. Disable page table isolation by default on AMD processors by not setting the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI is set. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Dave Hansen Cc: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f2a94dfb434e..b1be494ab4e8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -899,8 +899,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - /* Assume for now that ALL x86 CPUs are insecure */ - setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_INSECURE); fpu__init_system(c); -- cgit v1.2.3 From 52994c256df36fda9a715697431cba9daecb6b11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Jan 2018 15:57:59 +0100 Subject: x86/pti: Make sure the user/kernel PTEs match Meelis reported that his K8 Athlon64 emits MCE warnings when PTI is enabled: [Hardware Error]: Error Addr: 0x0000ffff81e000e0 [Hardware Error]: MC1 Error: L1 TLB multimatch. [Hardware Error]: cache level: L1, tx: INSN The address is in the entry area, which is mapped into kernel _AND_ user space. That's special because we switch CR3 while we are executing there. User mapping: 0xffffffff81e00000-0xffffffff82000000 2M ro PSE GLB x pmd Kernel mapping: 0xffffffff81000000-0xffffffff82000000 16M ro PSE x pmd So the K8 is complaining that the TLB entries differ. They differ in the GLB bit. Drop the GLB bit when installing the user shared mapping. Fixes: 6dc72c3cbca0 ("x86/mm/pti: Share entry text PMD") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: Borislav Petkov Cc: Tom Lendacky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031407180.1957@nanos --- arch/x86/mm/pti.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index bce8aea65606..2da28ba97508 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -367,7 +367,8 @@ static void __init pti_setup_espfix64(void) static void __init pti_clone_entry_text(void) { pti_clone_pmds((unsigned long) __entry_text_start, - (unsigned long) __irqentry_text_end, _PAGE_RW); + (unsigned long) __irqentry_text_end, + _PAGE_RW | _PAGE_GLOBAL); } /* -- cgit v1.2.3 From a9cdbe72c4e8bf3b38781c317a79326e2e1a230d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 31 Dec 2017 10:18:06 -0600 Subject: x86/dumpstack: Fix partial register dumps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The show_regs_safe() logic is wrong. When there's an iret stack frame, it prints the entire pt_regs -- most of which is random stack data -- instead of just the five registers at the end. show_regs_safe() is also poorly named: the on_stack() checks aren't for safety. Rename the function to show_regs_if_on_stack() and add a comment to explain why the checks are needed. These issues were introduced with the "partial register dump" feature of the following commit: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") That patch had gone through a few iterations of development, and the above issues were artifacts from a previous iteration of the patch where 'regs' pointed directly to the iret frame rather than to the (partially empty) pt_regs. Tested-by: Alexander Tsoy Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toralf Förster Cc: stable@vger.kernel.org Fixes: b02fcf9ba121 ("x86/unwinder: Handle stack overflows more gracefully") Link: http://lkml.kernel.org/r/5b05b8b344f59db2d3d50dbdeba92d60f2304c54.1514736742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/unwind.h | 17 +++++++++++++---- arch/x86/kernel/dumpstack.c | 28 ++++++++++++++++++++-------- arch/x86/kernel/stacktrace.c | 2 +- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index c1688c2d0a12..1f86e1b0a5cd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* - * WARNING: The entire pt_regs may not be safe to dereference. In some cases, - * only the iret frame registers are accessible. Use with caution! + * If 'partial' returns true, only the iret frame registers are valid. */ -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { if (unwind_done(state)) return NULL; + if (partial) { +#ifdef CONFIG_UNWINDER_ORC + *partial = !state->full_regs; +#else + *partial = false; +#endif + } + return state->regs; } #else -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { return NULL; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5fa110699ed2..d0bb176a7261 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs) regs->sp, regs->flags); } -static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial) { - if (on_stack(info, regs, sizeof(*regs))) + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { __show_regs(regs, 0); - else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, - IRET_FRAME_SIZE)) { + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { /* * When an interrupt or exception occurs in entry code, the * full pt_regs might not have been saved yet. In that case @@ -98,6 +109,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct stack_info stack_info = {0}; unsigned long visit_mask = 0; int graph_idx = 0; + bool partial; printk("%sCall Trace:\n", log_lvl); @@ -140,7 +152,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); /* * Scan the stack, printing any text addresses we find. At the @@ -164,7 +176,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by show_regs_safe() below. + * by show_regs_if_on_stack(). */ if (regs && stack == ®s->ip) goto next; @@ -199,9 +211,9 @@ next: unwind_next_frame(&state); /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); } if (stack_name) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8dabd7bf1673..60244bfaf88f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace, for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, NULL); if (regs) { /* * Kernel mode registers on the stack indicate an -- cgit v1.2.3 From 3ffdeb1a02be3086f1411a15c5b9c481fa28e21f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 31 Dec 2017 10:18:07 -0600 Subject: x86/dumpstack: Print registers for first stack frame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the stack dump code, if the frame after the starting pt_regs is also a regs frame, the registers don't get printed. Fix that. Reported-by: Andy Lutomirski Tested-by: Alexander Tsoy Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toralf Förster Cc: stable@vger.kernel.org Fixes: 3b3fa11bc700 ("x86/dumpstack: Print any pt_regs found on the stack") Link: http://lkml.kernel.org/r/396f84491d2f0ef64eda4217a2165f5712f6a115.1514736742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index d0bb176a7261..afbecff161d1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -115,6 +115,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unwind_start(&state, task, regs, stack); stack = stack ? : get_stack_pointer(task, regs); + regs = unwind_get_entry_regs(&state, &partial); /* * Iterate through the stacks, starting with the current stack pointer. @@ -132,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; if (get_stack_info(stack, task, &stack_info, &visit_mask)) { -- cgit v1.2.3 From d7732ba55c4b6a2da339bb12589c515830cfac2c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Jan 2018 19:52:04 +0100 Subject: x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat() The preparation for PTI which added CR3 switching to the entry code misplaced the CR3 switch in entry_SYSCALL_compat(). With PTI enabled the entry code tries to access a per cpu variable after switching to kernel GS. This fails because that variable is not mapped to user space. This results in a double fault and in the worst case a kernel crash. Move the switch ahead of the access and clobber RSP which has been saved already. Fixes: 8a09317b895f ("x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching") Reported-by: Lars Wendler Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Borislav Betkov Cc: Andy Lutomirski , Cc: Dave Hansen , Cc: Peter Zijlstra , Cc: Greg KH , , Cc: Boris Ostrovsky , Cc: Juergen Gross Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801031949200.1957@nanos --- arch/x86/entry/entry_64_compat.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 40f17009ec20..98d5358e4041 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -190,8 +190,13 @@ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ swapgs - /* Stash user ESP and switch to the kernel stack. */ + /* Stash user ESP */ movl %esp, %r8d + + /* Use %rsp as scratch reg. User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ @@ -219,12 +224,6 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) pushq $0 /* pt_regs->r14 = 0 */ pushq $0 /* pt_regs->r15 = 0 */ - /* - * We just saved %rdi so it is safe to clobber. It is not - * preserved during the C calls inside TRACE_IRQS_OFF anyway. - */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi - /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. -- cgit v1.2.3 From 2fd9c41aea47f4ad071accf94b94f94f2c4d31eb Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 3 Jan 2018 12:39:52 -0800 Subject: x86/process: Define cpu_tss_rw in same section as declaration cpu_tss_rw is declared with DECLARE_PER_CPU_PAGE_ALIGNED but then defined with DEFINE_PER_CPU_SHARED_ALIGNED leading to section mismatch warnings. Use DEFINE_PER_CPU_PAGE_ALIGNED consistently. This is necessary because it's mapped to the cpu entry area and must be page aligned. [ tglx: Massaged changelog a bit ] Fixes: 1a935bc3d4ea ("x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct") Suggested-by: Thomas Gleixner Signed-off-by: Nick Desaulniers Signed-off-by: Thomas Gleixner Cc: thomas.lendacky@amd.com Cc: Borislav Petkov Cc: tklauser@distanz.ch Cc: minipli@googlemail.com Cc: me@kylehuey.com Cc: namit@vmware.com Cc: luto@kernel.org Cc: jpoimboe@redhat.com Cc: tj@kernel.org Cc: cl@linux.com Cc: bp@suse.de Cc: thgarnie@google.com Cc: kirill.shutemov@linux.intel.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180103203954.183360-1-ndesaulniers@google.com --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 517415978409..3cb2486c47e4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower -- cgit v1.2.3 From b12f5d0ffcdcd1dc9c732a5be72afdc6a7d627cf Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Mon, 11 Dec 2017 21:39:12 +0100 Subject: dt-bindings/bcm283x: Define polarity of per-cpu interrupts This patch define the polarity of the per-cpu interrupts on BCM2836 and BCM2837 in order to avoid the warnings from ARM arch timer code: arch_timer: WARNING: Invalid trigger for IRQ19, assuming level low arch_timer: WARNING: Please fix your firmware arch_timer: cp15 timer(s) running at 19.20MHz (virt). Signed-off-by: Stefan Wahren Signed-off-by: Marc Zyngier --- arch/arm/boot/dts/bcm2836.dtsi | 14 +++++++------- arch/arm/boot/dts/bcm2837.dtsi | 12 ++++++------ arch/arm/boot/dts/bcm283x.dtsi | 1 + 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/bcm2836.dtsi b/arch/arm/boot/dts/bcm2836.dtsi index 61e158003509..1dfd76442777 100644 --- a/arch/arm/boot/dts/bcm2836.dtsi +++ b/arch/arm/boot/dts/bcm2836.dtsi @@ -13,24 +13,24 @@ compatible = "brcm,bcm2836-l1-intc"; reg = <0x40000000 0x100>; interrupt-controller; - #interrupt-cells = <1>; + #interrupt-cells = <2>; interrupt-parent = <&local_intc>; }; arm-pmu { compatible = "arm,cortex-a7-pmu"; interrupt-parent = <&local_intc>; - interrupts = <9>; + interrupts = <9 IRQ_TYPE_LEVEL_HIGH>; }; }; timer { compatible = "arm,armv7-timer"; interrupt-parent = <&local_intc>; - interrupts = <0>, // PHYS_SECURE_PPI - <1>, // PHYS_NONSECURE_PPI - <3>, // VIRT_PPI - <2>; // HYP_PPI + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>, // PHYS_SECURE_PPI + <1 IRQ_TYPE_LEVEL_HIGH>, // PHYS_NONSECURE_PPI + <3 IRQ_TYPE_LEVEL_HIGH>, // VIRT_PPI + <2 IRQ_TYPE_LEVEL_HIGH>; // HYP_PPI always-on; }; @@ -76,7 +76,7 @@ compatible = "brcm,bcm2836-armctrl-ic"; reg = <0x7e00b200 0x200>; interrupt-parent = <&local_intc>; - interrupts = <8>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; }; &cpu_thermal { diff --git a/arch/arm/boot/dts/bcm2837.dtsi b/arch/arm/boot/dts/bcm2837.dtsi index bc1cca5cf43c..efa7d3387ab2 100644 --- a/arch/arm/boot/dts/bcm2837.dtsi +++ b/arch/arm/boot/dts/bcm2837.dtsi @@ -12,7 +12,7 @@ compatible = "brcm,bcm2836-l1-intc"; reg = <0x40000000 0x100>; interrupt-controller; - #interrupt-cells = <1>; + #interrupt-cells = <2>; interrupt-parent = <&local_intc>; }; }; @@ -20,10 +20,10 @@ timer { compatible = "arm,armv7-timer"; interrupt-parent = <&local_intc>; - interrupts = <0>, // PHYS_SECURE_PPI - <1>, // PHYS_NONSECURE_PPI - <3>, // VIRT_PPI - <2>; // HYP_PPI + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>, // PHYS_SECURE_PPI + <1 IRQ_TYPE_LEVEL_HIGH>, // PHYS_NONSECURE_PPI + <3 IRQ_TYPE_LEVEL_HIGH>, // VIRT_PPI + <2 IRQ_TYPE_LEVEL_HIGH>; // HYP_PPI always-on; }; @@ -73,7 +73,7 @@ compatible = "brcm,bcm2836-armctrl-ic"; reg = <0x7e00b200 0x200>; interrupt-parent = <&local_intc>; - interrupts = <8>; + interrupts = <8 IRQ_TYPE_LEVEL_HIGH>; }; &cpu_thermal { diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi index dcde93c85c2d..18db25a5a66e 100644 --- a/arch/arm/boot/dts/bcm283x.dtsi +++ b/arch/arm/boot/dts/bcm283x.dtsi @@ -2,6 +2,7 @@ #include #include #include +#include /* firmware-provided startup stubs live here, where the secondary CPUs are * spinning. -- cgit v1.2.3 From abb62c46d4949d44979fa647740feff3f7538799 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 29 Dec 2017 21:15:54 +0900 Subject: arm64: dts: uniphier: fix gpio-ranges property of PXs3 SoC This is probably a copy-paste mistake. The gpio-ranges of PXs3 is different from that of LD20. Fixes: 277b51e7050f ("arm64: dts: uniphier: add GPIO controller nodes") Signed-off-by: Masahiro Yamada Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi index 48e733136db4..0ac2ace82435 100644 --- a/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi +++ b/arch/arm64/boot/dts/socionext/uniphier-pxs3.dtsi @@ -198,8 +198,8 @@ gpio-controller; #gpio-cells = <2>; gpio-ranges = <&pinctrl 0 0 0>, - <&pinctrl 96 0 0>, - <&pinctrl 160 0 0>; + <&pinctrl 104 0 0>, + <&pinctrl 168 0 0>; gpio-ranges-group-names = "gpio_range0", "gpio_range1", "gpio_range2"; -- cgit v1.2.3 From f5a40711fa58f1c109165a4fec6078bf2dfd2bdc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 28 Dec 2017 19:06:20 +0300 Subject: x86/mm: Set MODULES_END to 0xffffffffff000000 Since f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") kasan_mem_to_shadow(MODULES_END) could be not aligned to a page boundary. So passing page unaligned address to kasan_populate_zero_shadow() have two possible effects: 1) It may leave one page hole in supposed to be populated area. After commit 21506525fb8d ("x86/kasan/64: Teach KASAN about the cpu_entry_area") that hole happens to be in the shadow covering fixmap area and leads to crash: BUG: unable to handle kernel paging request at fffffbffffe8ee04 RIP: 0010:check_memory_region+0x5c/0x190 Call Trace: memcpy+0x1f/0x50 ghes_copy_tofrom_phys+0xab/0x180 ghes_read_estatus+0xfb/0x280 ghes_notify_nmi+0x2b2/0x410 nmi_handle+0x115/0x2c0 default_do_nmi+0x57/0x110 do_nmi+0xf8/0x150 end_repeat_nmi+0x1a/0x1e Note, the crash likely disappeared after commit 92a0f81d8957, which changed kasan_populate_zero_shadow() call the way it was before commit 21506525fb8d. 2) Attempt to load module near MODULES_END will fail, because __vmalloc_node_range() called from kasan_module_alloc() will hit the WARN_ON(!pte_none(*pte)) in the vmap_pte_range() and bail out with error. To fix this we need to make kasan_mem_to_shadow(MODULES_END) page aligned which means that MODULES_END should be 8*PAGE_SIZE aligned. The whole point of commit f06bdd4001c2 was to move MODULES_END down if NR_CPUS is big, so the cpu_entry_area takes a lot of space. But since 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") the cpu_entry_area is no longer in fixmap, so we could just set MODULES_END to a fixed 8*PAGE_SIZE aligned address. Fixes: f06bdd4001c2 ("x86/mm: Adapt MODULES_END based on fixmap section size") Reported-by: Jakub Kicinski Signed-off-by: Andrey Ryabinin Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Thomas Garnier Link: https://lkml.kernel.org/r/20171228160620.23818-1-aryabinin@virtuozzo.com --- Documentation/x86/x86_64/mm.txt | 5 +---- arch/x86/include/asm/pgtable_64_types.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index ad41b3813f0a..ddd5ffd31bd0 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -43,7 +43,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space +ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space [fixmap start] - ffffffffff5fffff kernel-internal fixmap range ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole @@ -67,9 +67,6 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. -The module mapping space size changes based on the CONFIG requirements for the -following fixmap section. - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index b97a539bcdee..6233e5595389 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -104,7 +104,7 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +#define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) -- cgit v1.2.3 From f2078904810373211fb15f91888fba14c01a4acc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jan 2018 13:01:40 +0100 Subject: x86/mm: Map cpu_entry_area at the same place on 4/5 level There is no reason for 4 and 5 level pagetables to have a different layout. It just makes determining vaddr_end for KASLR harder than necessary. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Benjamin Gilbert Cc: Greg Kroah-Hartman Cc: stable Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Garnier , Cc: Alexander Kuleshov Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos --- Documentation/x86/x86_64/mm.txt | 7 ++++--- arch/x86/include/asm/pgtable_64_types.h | 4 ++-- arch/x86/mm/dump_pagetables.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index ddd5ffd31bd0..f7dabe1f01e9 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,8 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... -fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI -fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping +fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space @@ -37,7 +37,8 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ... unused hole ... -fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping +... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6233e5595389..61b4b60bdc13 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -88,7 +88,7 @@ typedef struct { pteval_t pte; } pte_t; # define VMALLOC_SIZE_TB _AC(32, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) -# define LDT_PGD_ENTRY _AC(-4, UL) +# define LDT_PGD_ENTRY _AC(-3, UL) # define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif @@ -110,7 +110,7 @@ typedef struct { pteval_t pte; } pte_t; #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define CPU_ENTRY_AREA_PGD _AC(-3, UL) +#define CPU_ENTRY_AREA_PGD _AC(-4, UL) #define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index f56902c1f04b..2a4849e92831 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -61,10 +61,10 @@ enum address_markers_idx { KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif + CPU_ENTRY_AREA_NR, #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) LDT_NR, #endif - CPU_ENTRY_AREA_NR, #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif -- cgit v1.2.3 From 1dddd25125112ba49706518ac9077a1026a18f37 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jan 2018 12:32:03 +0100 Subject: x86/kaslr: Fix the vaddr_end mess vaddr_end for KASLR is only documented in the KASLR code itself and is adjusted depending on config options. So it's not surprising that a change of the memory layout causes KASLR to have the wrong vaddr_end. This can map arbitrary stuff into other areas causing hard to understand problems. Remove the whole ifdef magic and define the start of the cpu_entry_area to be the end of the KASLR vaddr range. Add documentation to that effect. Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") Reported-by: Benjamin Gilbert Signed-off-by: Thomas Gleixner Tested-by: Benjamin Gilbert Cc: Andy Lutomirski Cc: Greg Kroah-Hartman Cc: stable Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Garnier , Cc: Alexander Kuleshov Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801041320360.1771@nanos --- Documentation/x86/x86_64/mm.txt | 6 ++++++ arch/x86/include/asm/pgtable_64_types.h | 8 +++++++- arch/x86/mm/kaslr.c | 32 +++++++++----------------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index f7dabe1f01e9..ea91cb61a602 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... + vaddr_end for KASLR fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks @@ -37,6 +38,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ... unused hole ... + vaddr_end for KASLR fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping ... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks @@ -71,3 +73,7 @@ during EFI runtime calls. Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. + +Be very careful vs. KASLR when changing anything here. The KASLR address +range must not overlap with anything except the KASAN shadow area, which is +correct as KASAN disables KASLR. diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 61b4b60bdc13..6b8f73dcbc2c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -75,7 +75,13 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. + */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef930e2c2..aedebd2ebf1e 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization. * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; - -#if defined(CONFIG_X86_ESPFIX64) -static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; -#elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_END; -#else -static const unsigned long vaddr_end = __START_KERNEL_map; -#endif +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; @@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void) unsigned long remain_entropy; /* - * All these BUILD_BUG_ON checks ensures the memory layout is - * consistent with the vaddr_start/vaddr_end variables. + * These BUILD_BUG_ON checks ensure the memory layout is consistent + * with the vaddr_start/vaddr_end variables. These checks are very + * limited.... */ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_END); - BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || - IS_ENABLED(CONFIG_EFI)) && - vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); if (!kaslr_memory_enabled()) -- cgit v1.2.3 From 42f3bdc5dd962a5958bc024c1e1444248a6b8b4a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jan 2018 18:07:12 +0100 Subject: x86/events/intel/ds: Use the proper cache flush method for mapping ds buffers Thomas reported the following warning: BUG: using smp_processor_id() in preemptible [00000000] code: ovsdb-server/4498 caller is native_flush_tlb_single+0x57/0xc0 native_flush_tlb_single+0x57/0xc0 __set_pte_vaddr+0x2d/0x40 set_pte_vaddr+0x2f/0x40 cea_set_pte+0x30/0x40 ds_update_cea.constprop.4+0x4d/0x70 reserve_ds_buffers+0x159/0x410 x86_reserve_hardware+0x150/0x160 x86_pmu_event_init+0x3e/0x1f0 perf_try_init_event+0x69/0x80 perf_event_alloc+0x652/0x740 SyS_perf_event_open+0x3f6/0xd60 do_syscall_64+0x5c/0x190 set_pte_vaddr is used to map the ds buffers into the cpu entry area, but there are two problems with that: 1) The resulting flush is not supposed to be called in preemptible context 2) The cpu entry area is supposed to be per CPU, but the debug store buffers are mapped for all CPUs so these mappings need to be flushed globally. Add the necessary preemption protection across the mapping code and flush TLBs globally. Fixes: c1961a4631da ("x86/events/intel/ds: Map debug buffers in cpu_entry_area") Reported-by: Thomas Zeitlhofer Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Tested-by: Thomas Zeitlhofer Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180104170712.GB3040@hirez.programming.kicks-ass.net --- arch/x86/events/intel/ds.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8f0aace08b87..8156e47da7ba 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,6 +5,7 @@ #include #include +#include #include #include "../perf_event.h" @@ -283,20 +284,35 @@ static DEFINE_PER_CPU(void *, insn_buffer); static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) { + unsigned long start = (unsigned long)cea; phys_addr_t pa; size_t msz = 0; pa = virt_to_phys(addr); + + preempt_disable(); for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) cea_set_pte(cea, pa, prot); + + /* + * This is a cross-CPU update of the cpu_entry_area, we must shoot down + * all TLB entries for it. + */ + flush_tlb_kernel_range(start, start + size); + preempt_enable(); } static void ds_clear_cea(void *cea, size_t size) { + unsigned long start = (unsigned long)cea; size_t msz = 0; + preempt_disable(); for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) cea_set_pte(cea, 0, PAGE_NONE); + + flush_tlb_kernel_range(start, start + size); + preempt_enable(); } static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) -- cgit v1.2.3 From 1e5476815fd7f98b888e01a0f9522b63085f96c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jan 2018 22:19:04 +0100 Subject: x86/tlb: Drop the _GPL from the cpu_tlbstate export The recent changes for PTI touch cpu_tlbstate from various tlb_flush inlines. cpu_tlbstate is exported as GPL symbol, so this causes a regression when building out of tree drivers for certain graphics cards. Aside of that the export was wrong since it was introduced as it should have been EXPORT_PER_CPU_SYMBOL_GPL(). Use the correct PER_CPU export and drop the _GPL to restore the previous state which allows users to utilize the cards they payed for. As always I'm really thrilled to make this kind of change to support the #friends (or however the hot hashtag of today is spelled) from that closet sauce graphics corp. Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: stable@vger.kernel.org --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 80259ad8c386..6b462a472a7b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -870,7 +870,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { -- cgit v1.2.3 From dc8635b78cd8669c37e230058d18c33af7451ab1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Jan 2018 16:17:56 -0800 Subject: kernel/exit.c: export abort() to modules gcc -fisolate-erroneous-paths-dereference can generate calls to abort() from modular code too. [arnd@arndb.de: drop duplicate exports of abort()] Link: http://lkml.kernel.org/r/20180102103311.706364-1-arnd@arndb.de Reported-by: Vineet Gupta Cc: Sudip Mukherjee Cc: Arnd Bergmann Cc: Alexey Brodkin Cc: Russell King Cc: Jose Abreu Signed-off-by: Andrew Morton Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- arch/arm/kernel/traps.c | 1 - arch/m32r/kernel/traps.c | 1 - arch/unicore32/kernel/traps.c | 1 - kernel/exit.c | 1 + 4 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 5cf04888c581..3e26c6f7a191 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -793,7 +793,6 @@ void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } -EXPORT_SYMBOL(abort); void __init trap_init(void) { diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index cb79fba79d43..b88a8dd14933 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -122,7 +122,6 @@ void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } -EXPORT_SYMBOL(abort); void __init trap_init(void) { diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 5f25b39f04d4..c4ac6043ebb0 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -298,7 +298,6 @@ void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } -EXPORT_SYMBOL(abort); void __init trap_init(void) { diff --git a/kernel/exit.c b/kernel/exit.c index df0c91d5606c..995453d9fb55 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1763,3 +1763,4 @@ __weak void abort(void) /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } +EXPORT_SYMBOL(abort); -- cgit v1.2.3 From 56aeb07c914a616ab84357d34f8414a69b140cdf Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 4 Jan 2018 17:53:12 +0100 Subject: ARM: dts: kirkwood: fix pin-muxing of MPP7 on OpenBlocks A7 MPP7 is currently muxed as "gpio", but this function doesn't exist for MPP7, only "gpo" is available. This causes the following error: kirkwood-pinctrl f1010000.pin-controller: unsupported function gpio on pin mpp7 pinctrl core: failed to register map default (6): invalid type given kirkwood-pinctrl f1010000.pin-controller: error claiming hogs: -22 kirkwood-pinctrl f1010000.pin-controller: could not claim hogs: -22 kirkwood-pinctrl f1010000.pin-controller: unable to register pinctrl driver kirkwood-pinctrl: probe of f1010000.pin-controller failed with error -22 So the pinctrl driver is not probed, all device drivers (including the UART driver) do a -EPROBE_DEFER, and therefore the system doesn't really boot (well, it boots, but with no UART, and no devices that require pin-muxing). Back when the Device Tree file for this board was introduced, the definition was already wrong. The pinctrl driver also always described as "gpo" this function for MPP7. However, between Linux 4.10 and 4.11, a hog pin failing to be muxed was turned from a simple warning to a hard error that caused the entire pinctrl driver probe to bail out. This is probably the result of commit 6118714275f0a ("pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable()"). This commit fixes the Device Tree to use the proper "gpo" function for MPP7, which fixes the boot of OpenBlocks A7, which was broken since Linux 4.11. Fixes: f24b56cbcd9d ("ARM: kirkwood: add support for OpenBlocks A7 platform") Cc: Signed-off-by: Thomas Petazzoni Reviewed-by: Andrew Lunn Signed-off-by: Gregory CLEMENT --- arch/arm/boot/dts/kirkwood-openblocks_a7.dts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts index cf2f5240e176..27cc913ca0f5 100644 --- a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts +++ b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts @@ -53,7 +53,8 @@ }; pinctrl: pin-controller@10000 { - pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header>; + pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header + &pmx_gpio_header_gpo>; pinctrl-names = "default"; pmx_uart0: pmx-uart0 { @@ -85,11 +86,16 @@ * ground. */ pmx_gpio_header: pmx-gpio-header { - marvell,pins = "mpp17", "mpp7", "mpp29", "mpp28", + marvell,pins = "mpp17", "mpp29", "mpp28", "mpp35", "mpp34", "mpp40"; marvell,function = "gpio"; }; + pmx_gpio_header_gpo: pxm-gpio-header-gpo { + marvell,pins = "mpp7"; + marvell,function = "gpo"; + }; + pmx_gpio_init: pmx-init { marvell,pins = "mpp38"; marvell,function = "gpio"; -- cgit v1.2.3 From b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 4 Jan 2018 14:37:05 +0000 Subject: x86/alternatives: Add missing '\n' at end of ALTERNATIVE inline asm Where an ALTERNATIVE is used in the middle of an inline asm block, this would otherwise lead to the following instruction being appended directly to the trailing ".popsection", and a failed compile. Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: ak@linux.intel.com Cc: Tim Chen Cc: Peter Zijlstra Cc: Paul Turner Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk --- arch/x86/include/asm/alternative.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index dbfd0854651f..cf5961ca8677 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".popsection" + ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR_2(oldinstr, 1, 2) \ @@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".popsection" + ".popsection\n" /* * Alternative instructions for different CPU types or capabilities. -- cgit v1.2.3 From a89bca278220a4ea57ea5e57a037262f258c7d72 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 2 Jan 2018 16:08:36 +0200 Subject: ACPI / x86: boot: Propagate error code in acpi_gsi_to_irq() acpi_get_override_irq() followed by acpi_register_gsi() returns negative error code on failure. Propagate it from acpi_gsi_to_irq() to callers. Signed-off-by: Andy Shevchenko [ rjw : Subject/changelog ] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4bf004bab4b2..ec3a286163c3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -619,17 +619,17 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) } rc = acpi_get_override_irq(gsi, &trigger, &polarity); - if (rc == 0) { - trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; - polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; - irq = acpi_register_gsi(NULL, gsi, trigger, polarity); - if (irq >= 0) { - *irqp = irq; - return 0; - } - } + if (rc) + return rc; - return -1; + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq < 0) + return irq; + + *irqp = irq; + return 0; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); -- cgit v1.2.3 From 7669b122085018c1f64720d11c24ae6d2549193d Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Fri, 15 Dec 2017 13:46:57 +0100 Subject: ARM: dts: da850-lcdk: Remove leading 0x and 0s from unit address Improve the DTS files by removing all the leading "0x" and zeros to fix the following dtc warnings: Warning (unit_address_format): Node /XXX unit name should not have leading "0x" and Warning (unit_address_format): Node /XXX unit name should not have leading 0s Converted using the following command: find . -type f \( -iname *.dts -o -iname *.dtsi \) -exec sed -i -e "s/@\([0-9a-fA-FxX\.;:#]+\)\s*{/@\L\1 {/g" -e "s/@0x\(.*\) {/@\1 {/g" -e "s/@0+\(.*\) {/@\1 {/g" {} +^C For simplicity, two sed expressions were used to solve each warnings separately. To make the regex expression more robust a few other issues were resolved, namely setting unit-address to lower case, and adding a whitespace before the the opening curly brace: https://elinux.org/Device_Tree_Linux#Linux_conventions This will solve as a side effect warning: Warning (simple_bus_reg): Node /XXX@ simple-bus unit address format error, expected "" This is a follow up to commit 4c9847b7375a ("dt-bindings: Remove leading 0x from bindings notation") Reported-by: David Daney Suggested-by: Rob Herring Signed-off-by: Mathieu Malaterre Signed-off-by: Sekhar Nori --- arch/arm/boot/dts/da850-lcdk.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/da850-lcdk.dts b/arch/arm/boot/dts/da850-lcdk.dts index eed89e659143..a1f4d6d5a569 100644 --- a/arch/arm/boot/dts/da850-lcdk.dts +++ b/arch/arm/boot/dts/da850-lcdk.dts @@ -293,12 +293,12 @@ label = "u-boot env"; reg = <0 0x020000>; }; - partition@0x020000 { + partition@20000 { /* The LCDK defaults to booting from this partition */ label = "u-boot"; reg = <0x020000 0x080000>; }; - partition@0x0a0000 { + partition@a0000 { label = "free space"; reg = <0x0a0000 0>; }; -- cgit v1.2.3 From de791821c295cc61419a06fe5562288417d1bc58 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Jan 2018 15:27:34 +0100 Subject: x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN Use the name associated with the particular attack which needs page table isolation for mitigation. Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Alan Cox Cc: Jiri Koshina Cc: Linus Torvalds Cc: Tim Chen Cc: Andi Lutomirski Cc: Andi Kleen Cc: Peter Zijlstra Cc: Paul Turner Cc: Tom Lendacky Cc: Greg KH Cc: Dave Hansen Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/mm/pti.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 07cdd1715705..21ac898df2d8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -341,6 +341,6 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b1be494ab4e8..2d3bd2215e5b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -900,7 +900,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); if (c->x86_vendor != X86_VENDOR_AMD) - setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); fpu__init_system(c); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 2da28ba97508..43d4a4a29037 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -56,13 +56,13 @@ static void __init pti_print_if_insecure(const char *reason) { - if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } static void __init pti_print_if_secure(const char *reason) { - if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) pr_info("%s\n", reason); } @@ -96,7 +96,7 @@ void __init pti_check_boottime_disable(void) } autosel: - if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE)) + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) return; enable: setup_force_cpu_cap(X86_FEATURE_PTI); -- cgit v1.2.3 From 0cb5b30698fdc8f6b4646012e3acb4ddce430788 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jan 2018 14:31:38 -0800 Subject: kvm: vmx: Scrub hardware GPRs at VM-exit Guest GPR values are live in the hardware GPRs at VM-exit. Do not leave any guest values in hardware GPRs after the guest GPR values are saved to the vcpu_vmx structure. This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753. Specifically, it defeats the Project Zero PoC for CVE 2017-5715. Suggested-by: Eric Northup Signed-off-by: Jim Mattson Reviewed-by: Eric Northup Reviewed-by: Benjamin Serebrin Reviewed-by: Andrew Honig [Paolo: Add AMD bits, Signed-off-by: Tom Lendacky ] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx.c | 14 +++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index eb714f1cdf7e..bb31c801f1fc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4985,6 +4985,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" +#endif + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ + "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" + "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" + "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" + "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" +#ifdef CONFIG_X86_64 + "xor %%r8, %%r8 \n\t" + "xor %%r9, %%r9 \n\t" + "xor %%r10, %%r10 \n\t" + "xor %%r11, %%r11 \n\t" + "xor %%r12, %%r12 \n\t" + "xor %%r13, %%r13 \n\t" + "xor %%r14, %%r14 \n\t" + "xor %%r15, %%r15 \n\t" #endif "pop %%" _ASM_BP : diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eba631c4dbd..c1e7ed371259 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9415,6 +9415,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" + "setbe %c[fail](%0)\n\t" "mov %%" _ASM_AX ", %c[rax](%0) \n\t" "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" __ASM_SIZE(pop) " %c[rcx](%0) \n\t" @@ -9431,12 +9432,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" #endif "mov %%cr2, %%" _ASM_AX " \n\t" "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + "xor %%eax, %%eax \n\t" + "xor %%ebx, %%ebx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" "vmx_return: " _ASM_PTR " 2b \n\t" -- cgit v1.2.3 From e3af9f7c6ece29fdb7fe0aeb83ac5d3077a06edb Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 25 Jul 2017 16:51:20 +0200 Subject: ARM64: dts: marvell: armada-cp110: Fix clock resources for various node On the CP modules we found on Armada 7K/8K, many IP block actually also need a "functional" clock (from the bus). This patch add them which allows to fix some issues hanging the kernel: If Ethernet and sdhci driver are built as modules and sdhci was loaded first then the kernel hang. Fixes: bb16ea1742c8 ("mmc: sdhci-xenon: Fix clock resource by adding an optional bus clock") Cc: stable@vger.kernel.org Reported-by: Riku Voipio Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi | 13 ++++++++----- arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi | 9 ++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi index e3b64d03fbd8..9c7724e82aff 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi @@ -63,8 +63,10 @@ cpm_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cpm_clk 1 3>, <&cpm_clk 1 9>, <&cpm_clk 1 5>; - clock-names = "pp_clk", "gop_clk", "mg_clk"; + clocks = <&cpm_clk 1 3>, <&cpm_clk 1 9>, + <&cpm_clk 1 5>, <&cpm_clk 1 18>; + clock-names = "pp_clk", "gop_clk", + "mg_clk","axi_clk"; marvell,system-controller = <&cpm_syscon0>; status = "disabled"; dma-coherent; @@ -155,7 +157,8 @@ #size-cells = <0>; compatible = "marvell,orion-mdio"; reg = <0x12a200 0x10>; - clocks = <&cpm_clk 1 9>, <&cpm_clk 1 5>; + clocks = <&cpm_clk 1 9>, <&cpm_clk 1 5>, + <&cpm_clk 1 6>, <&cpm_clk 1 18>; status = "disabled"; }; @@ -338,8 +341,8 @@ compatible = "marvell,armada-cp110-sdhci"; reg = <0x780000 0x300>; interrupts = ; - clock-names = "core"; - clocks = <&cpm_clk 1 4>; + clock-names = "core","axi"; + clocks = <&cpm_clk 1 4>, <&cpm_clk 1 18>; dma-coherent; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi index 0d51096c69f8..87ac68b2cf37 100644 --- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi @@ -63,8 +63,10 @@ cps_ethernet: ethernet@0 { compatible = "marvell,armada-7k-pp22"; reg = <0x0 0x100000>, <0x129000 0xb000>; - clocks = <&cps_clk 1 3>, <&cps_clk 1 9>, <&cps_clk 1 5>; - clock-names = "pp_clk", "gop_clk", "mg_clk"; + clocks = <&cps_clk 1 3>, <&cps_clk 1 9>, + <&cps_clk 1 5>, <&cps_clk 1 18>; + clock-names = "pp_clk", "gop_clk", + "mg_clk", "axi_clk"; marvell,system-controller = <&cps_syscon0>; status = "disabled"; dma-coherent; @@ -155,7 +157,8 @@ #size-cells = <0>; compatible = "marvell,orion-mdio"; reg = <0x12a200 0x10>; - clocks = <&cps_clk 1 9>, <&cps_clk 1 5>; + clocks = <&cps_clk 1 9>, <&cps_clk 1 5>, + <&cps_clk 1 6>, <&cps_clk 1 18>; status = "disabled"; }; -- cgit v1.2.3 From ca47480921587ae30417dd234a9f79af188e3666 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 5 Jan 2018 14:27:58 -0800 Subject: xtensa: fix futex_atomic_cmpxchg_inatomic Return 0 if the operation was successful, not the userspace memory value. Check that userspace value equals passed oldval, not itself. Don't update *uval if the value wasn't read from userspace memory. This fixes process hang due to infinite loop in futex_lock_pi. It also fixes a bunch of glibc tests nptl/tst-mutexpi*. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov --- arch/xtensa/include/asm/futex.h | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index eaaf1ebcc7a4..5bfbc1c401d4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -92,7 +92,6 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { int ret = 0; - u32 prev; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; @@ -103,26 +102,24 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, __asm__ __volatile__ ( " # futex_atomic_cmpxchg_inatomic\n" - "1: l32i %1, %3, 0\n" - " mov %0, %5\n" - " wsr %1, scompare1\n" - "2: s32c1i %0, %3, 0\n" - "3:\n" + " wsr %5, scompare1\n" + "1: s32c1i %1, %4, 0\n" + " s32i %1, %6, 0\n" + "2:\n" " .section .fixup,\"ax\"\n" " .align 4\n" - "4: .long 3b\n" - "5: l32r %1, 4b\n" - " movi %0, %6\n" + "3: .long 2b\n" + "4: l32r %1, 3b\n" + " movi %0, %7\n" " jx %1\n" " .previous\n" " .section __ex_table,\"a\"\n" - " .long 1b,5b,2b,5b\n" + " .long 1b,4b\n" " .previous\n" - : "+r" (ret), "=&r" (prev), "+m" (*uaddr) - : "r" (uaddr), "r" (oldval), "r" (newval), "I" (-EFAULT) + : "+r" (ret), "+r" (newval), "+m" (*uaddr), "+m" (*uval) + : "r" (uaddr), "r" (oldval), "r" (uval), "I" (-EFAULT) : "memory"); - *uval = prev; return ret; } -- cgit v1.2.3 From bdae44705c0d5b751fbd79bc4a169905b25ed335 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Thu, 4 Jan 2018 00:31:55 +0800 Subject: ARM: dts: sun[47]i: Fix display backend 1 output to TCON0 remote endpoint There is a copy-paste error in the display pipeline device tree graph. The remote endpoint of the display backend 1's output to TCON0 points to the wrong endpoint. This will result in the driver incorrectly parsing the relationship of the components. Reported-by: Andrea Venturi Fixes: 0df4cf33a594 ("ARM: dts: sun4i: Add device nodes for display pipelines") Fixes: 5b92b29bed45 ("ARM: dts: sun7i: Add device nodes for display pipelines") Signed-off-by: Chen-Yu Tsai Acked-by: Maxime Ripard --- arch/arm/boot/dts/sun4i-a10.dtsi | 2 +- arch/arm/boot/dts/sun7i-a20.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index 5840f5c75c3b..4f2f2eea0755 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -1104,7 +1104,7 @@ be1_out_tcon0: endpoint@0 { reg = <0>; - remote-endpoint = <&tcon1_in_be0>; + remote-endpoint = <&tcon0_in_be1>; }; be1_out_tcon1: endpoint@1 { diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 59655e42e4b0..bd0cd3204273 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -1354,7 +1354,7 @@ be1_out_tcon0: endpoint@0 { reg = <0>; - remote-endpoint = <&tcon1_in_be0>; + remote-endpoint = <&tcon0_in_be1>; }; be1_out_tcon1: endpoint@1 { -- cgit v1.2.3 From 7729bebc619307a0233c86f8585a4bf3eadc7ce4 Mon Sep 17 00:00:00 2001 From: Valentin Ilie Date: Fri, 5 Jan 2018 23:12:59 +0000 Subject: ia64, sched/cputime: Fix build error if CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y Remove the extra parenthesis. This bug was introduced by: e2339a4caa5e: ("ia64: Convert vtime to use nsec units directly") Signed-off-by: Valentin Ilie Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: linux-ia64@vger.kernel.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1515193979-24873-1-git-send-email-valentin.ilie@gmail.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index c6ecb97151a2..9025699049ca 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk) } if (ti->softirq_time) { - delta = cycle_to_nsec(ti->softirq_time)); + delta = cycle_to_nsec(ti->softirq_time); account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } -- cgit v1.2.3 From 9ae21dd66b970b5e3192a636353d75ede0529338 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 5 Jan 2018 08:18:52 -0800 Subject: perf/x86/msr: Add support for MSR_IA32_THERM_STATUS This patch adds support for the Digital Readout provided by the IA32_THERM_STATUS MSR (0x19C) on Intel X86 processors. The readout shows the number of degrees Celcius to the TCC (critical temperature) supported by the processor. Thus, the larger, the better. The perf_event support is provided via the msr PMU. The new logical event is called cpu_thermal_margin. It comes with a unit and snapshot files. The event shows the current temprature distance (margin). It is not an accumulating event. The unit is degrees C. The event is provided per logical CPU to make things simpler but it is the same for both hyper-threads sharing a physical core. $ perf stat -I 1000 -a -A -e msr/cpu_thermal_margin/ This will print the temperature for all logical CPUs. time CPU counts unit events 1.000123741 CPU0 38 C msr/cpu_thermal_margin/ 1.000161837 CPU1 37 C msr/cpu_thermal_margin/ 1.000187906 CPU2 36 C msr/cpu_thermal_margin/ 1.000189046 CPU3 39 C msr/cpu_thermal_margin/ 1.000283044 CPU4 40 C msr/cpu_thermal_margin/ 1.000344297 CPU5 40 C msr/cpu_thermal_margin/ 1.000365832 CPU6 39 C msr/cpu_thermal_margin/ ... In case the temperature margin cannot be read, the reported value would be -1. Works on all processors supporting the Digital Readout (dtherm in cpuinfo) Signed-off-by: Stephane Eranian Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1515169132-3980-1-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/msr.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 14efaa0e8684..0be15b9b2376 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -10,7 +10,9 @@ enum perf_msr_id { PERF_MSR_SMI = 4, PERF_MSR_PTSC = 5, PERF_MSR_IRPERF = 6, - + PERF_MSR_THERM = 7, + PERF_MSR_THERM_SNAP = 8, + PERF_MSR_THERM_UNIT = 9, PERF_MSR_EVENT_MAX, }; @@ -29,6 +31,12 @@ static bool test_irperf(int idx) return boot_cpu_has(X86_FEATURE_IRPERF); } +static bool test_therm_status(int idx) +{ + return boot_cpu_has(X86_FEATURE_DTHERM); +} + + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -102,6 +110,9 @@ PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07"); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1"); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C"); static struct perf_msr msr[] = { [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, @@ -111,6 +122,9 @@ static struct perf_msr msr[] = { [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, + [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, }, + [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, }, + [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { @@ -193,10 +207,15 @@ again: goto again; delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { delta = sign_extend64(delta, 31); - - local64_add(delta, &event->count); + local64_add(delta, &event->count); + } else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) { + /* if valid, extract digital readout, other set to -1 */ + now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1; + local64_set(&event->count, now); + } else + local64_add(delta, &event->count); } static void msr_event_start(struct perf_event *event, int flags) -- cgit v1.2.3 From 9128d3ed9de3882c83b927eb553d5d44c84505f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jan 2018 08:18:52 -0800 Subject: perf/x86/msr: Clean up the code Recent changes made a bit of an inconsistent mess out of arch/x86/events/msr.c, fix it: - re-align the initialization tables to be vertically aligned and readable again - harmonize comment style in terms of punctuation, capitalization and spelling - use curly braces for multi-condition branches - remove extra newlines - simplify the code a bit Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1515169132-3980-1-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/events/msr.c | 61 ++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 0be15b9b2376..18e2628e2d8f 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -36,7 +36,6 @@ static bool test_therm_status(int idx) return boot_cpu_has(X86_FEATURE_DTHERM); } - static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -103,28 +102,28 @@ struct perf_msr { bool (*test)(int idx); }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); -PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); -PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07"); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1"); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C"); +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00" ); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01" ); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02" ); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03" ); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04" ); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05" ); +PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C" ); static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, - [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, - [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, - [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, }, - [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, }, - [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, }, + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, + [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, }, + [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, }, + [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { @@ -175,9 +174,9 @@ static int msr_event_init(struct perf_event *event) if (!msr[cfg].attr) return -EINVAL; - event->hw.idx = -1; - event->hw.event_base = msr[cfg].msr; - event->hw.config = cfg; + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; return 0; } @@ -198,7 +197,7 @@ static void msr_event_update(struct perf_event *event) u64 prev, now; s64 delta; - /* Careful, an NMI might modify the previous event value. */ + /* Careful, an NMI might modify the previous event value: */ again: prev = local64_read(&event->hw.prev_count); now = msr_read_counter(event); @@ -211,18 +210,18 @@ again: delta = sign_extend64(delta, 31); local64_add(delta, &event->count); } else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) { - /* if valid, extract digital readout, other set to -1 */ + /* If valid, extract digital readout, otherwise set to -1: */ now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1; local64_set(&event->count, now); - } else + } else { local64_add(delta, &event->count); + } } static void msr_event_start(struct perf_event *event, int flags) { - u64 now; + u64 now = msr_read_counter(event); - now = msr_read_counter(event); local64_set(&event->hw.prev_count, now); } @@ -269,9 +268,7 @@ static int __init msr_init(void) for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { u64 val; - /* - * Virt sucks arse; you cannot tell if a R/O MSR is present :/ - */ + /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) msr[i].attr = NULL; } -- cgit v1.2.3 From 310d82784fb4d60c80569f5ca9f53a7f3bf1d477 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 5 Jan 2018 21:55:38 +0100 Subject: parisc: qemu idle sleep support Add qemu idle sleep support when running under qemu with SeaBIOS PDC firmware. Like the power architecture we use the "or" assembler instructions, which translate to nops on real hardware, to indicate that qemu shall idle sleep. Signed-off-by: Helge Deller Cc: Richard Henderson CC: stable@vger.kernel.org # v4.9+ --- arch/parisc/kernel/process.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch') diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 30f92391a93e..cad3e8661cd6 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -183,6 +184,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r) return 1; } +/* + * Idle thread support + * + * Detect when running on QEMU with SeaBIOS PDC Firmware and let + * QEMU idle the host too. + */ + +int running_on_qemu __read_mostly; + +void __cpuidle arch_cpu_idle_dead(void) +{ + /* nop on real hardware, qemu will offline CPU. */ + asm volatile("or %%r31,%%r31,%%r31\n":::); +} + +void __cpuidle arch_cpu_idle(void) +{ + local_irq_enable(); + + /* nop on real hardware, qemu will idle sleep. */ + asm volatile("or %%r10,%%r10,%%r10\n":::); +} + +static int __init parisc_idle_init(void) +{ + const char *marker; + + /* check QEMU/SeaBIOS marker in PAGE0 */ + marker = (char *) &PAGE0->pad0; + running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0); + + if (!running_on_qemu) + cpu_idle_poll_ctrl(1); + + return 0; +} +arch_initcall(parisc_idle_init); + /* * Copy architecture-specific thread state */ -- cgit v1.2.3 From b94b7373317164402ff7728d10f7023127a02b60 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Mon, 1 Jan 2018 10:04:47 +0800 Subject: x86/microcode/intel: Extend BDW late-loading with a revision check Instead of blacklisting all model 79 CPUs when attempting a late microcode loading, limit that only to CPUs with microcode revisions < 0x0b000021 because only on those late loading may cause a system hang. For such processors either: a) a BIOS update which might contain a newer microcode revision or b) the early microcode loading method should be considered. Processors with revisions 0x0b000021 or higher will not experience such hangs. For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. [ bp: Heavily massage commit message and pr_* statements. ] Fixes: 723f2828a98c ("x86/microcode/intel: Disable late loading on model 79") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: x86-ml Cc: # v4.14 Link: http://lkml.kernel.org/r/1514772287-92959-1-git-send-email-qianyue.zj@alibaba-inc.com --- arch/x86/kernel/cpu/microcode/intel.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8ccdca6d3f9e..d9e460fc7a3b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -910,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { - pr_err_once("late loading on model 79 is disabled.\n"); + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * may result in a system hang. This behavior is documented in item + * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_mask == 0x01 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } -- cgit v1.2.3 From de53c3786a3ce162a1c815d0c04c766c23ec9c0a Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 5 Jan 2018 22:35:41 +0100 Subject: x86/pti: Unbreak EFI old_memmap EFI_OLD_MEMMAP's efi_call_phys_prolog() calls set_pgd() with swapper PGD that has PAGE_USER set, which makes PTI set NX on it, and therefore EFI can't execute it's code. Fix that by forcefully clearing _PAGE_NX from the PGD (this can't be done by the pgprot API). _PAGE_NX will be automatically reintroduced in efi_call_phys_epilog(), as _set_pgd() will again notice that this is _PAGE_USER, and set _PAGE_NX on it. Tested-by: Dimitri Sivanich Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Acked-by: Dave Hansen Cc: Andrea Arcangeli Cc: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1801052215460.11852@cbobk.fhfr.pm --- arch/x86/platform/efi/efi_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 39c4b35ac7a4..61975b6bcb1a 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -134,7 +134,9 @@ pgd_t * __init efi_call_phys_prolog(void) pud[j] = *pud_offset(p4d_k, vaddr); } } + pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; } + out: __flush_tlb_all(); -- cgit v1.2.3 From 99c6fa2511d8a683e61468be91b83f85452115fa Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 6 Jan 2018 11:49:23 +0000 Subject: x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] Add the bug bits for spectre v1/2 and force them unconditionally for all cpus. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 21ac898df2d8..1641c2f96363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -342,5 +342,7 @@ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2d3bd2215e5b..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -902,6 +902,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_AMD) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 33c57c0d3c67f51f491a9d27108f7e97adc03d96 Mon Sep 17 00:00:00 2001 From: Karsten Merker Date: Thu, 4 Jan 2018 23:37:02 +0100 Subject: RISC-V: Add a basic defconfig This patch provides a basic defconfig for the RISC-V architecture that enables enough kernel features to run a basic Linux distribution on qemu's "virt" board for native software development. Features include: - serial console - virtio block and network device support - VFAT and ext2/3/4 filesystem support - NFS client and NFS rootfs support - an assortment of other kernel features required for running systemd It also enables a number of drivers for physical hardware that target the "SiFive U500" SoC and the corresponding development platform. These include: - PCIe host controller support for the FPGA-based U500 development platform (PCIE_XILINX) - USB host controller support (OHCI/EHCI/XHCI) - USB HID (keyboard/mouse) support - USB mass storage support (bulk and UAS) - SATA support (AHCI) - ethernet drivers (MACB for a SoC-internal MAC block, microsemi ethernet phy, E1000E and R8169 for PCIe-connected external devices) - DRM and framebuffer console support for PCIe-connected Radeon graphics chips Signed-off-by: Karsten Merker Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch') diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index e69de29bb2d1..47dacf06c679 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -0,0 +1,75 @@ +CONFIG_SMP=y +CONFIG_PCI=y +CONFIG_PCIE_XILINX=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BPF_SYSCALL=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_RAS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +# CONFIG_RCU_TRACE is not set +CONFIG_CRYPTO_USER_API_HASH=y -- cgit v1.2.3 From 9e49a4ed072ab67b17238c5a45d7cba7f848659e Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 26 Dec 2017 19:11:22 -0800 Subject: RISC-V: Make __NR_riscv_flush_icache visible to userspace We were hoping to avoid making this visible to userspace, but it looks like we're going to have to because QEMU's user-mode emulation doesn't want to emulate a vDSO. Having vDSO-only system calls was a bit unothodox anyway, so I think in this case it's OK to just make the actual system call number public. This patch simply moves the definition of __NR_riscv_flush_icache availiable to userspace, which results in the deletion of the now empty vdso-syscalls.h. Changes since v1: * I've moved the definition into uapi/asm/syscalls.h rathen than uapi/asm/unistd.h. This allows me to keep asm/unistd.h, so we can keep the syscall table macros sane. * As a side effect of the above, this no longer disables all system calls on RISC-V. Whoops! Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/unistd.h | 1 + arch/riscv/include/asm/vdso-syscalls.h | 28 ---------------------------- arch/riscv/include/uapi/asm/syscalls.h | 26 ++++++++++++++++++++++++++ arch/riscv/kernel/syscall_table.c | 1 - arch/riscv/kernel/vdso/flush_icache.S | 1 - 5 files changed, 27 insertions(+), 30 deletions(-) delete mode 100644 arch/riscv/include/asm/vdso-syscalls.h create mode 100644 arch/riscv/include/uapi/asm/syscalls.h (limited to 'arch') diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 9f250ed007cd..2f704a5c4196 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -14,3 +14,4 @@ #define __ARCH_HAVE_MMU #define __ARCH_WANT_SYS_CLONE #include +#include diff --git a/arch/riscv/include/asm/vdso-syscalls.h b/arch/riscv/include/asm/vdso-syscalls.h deleted file mode 100644 index a2ccf1894929..000000000000 --- a/arch/riscv/include/asm/vdso-syscalls.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2017 SiFive - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef _ASM_RISCV_VDSO_SYSCALLS_H -#define _ASM_RISCV_VDSO_SYSCALLS_H - -#ifdef CONFIG_SMP - -/* These syscalls are only used by the vDSO and are not in the uapi. */ -#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) -__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) - -#endif - -#endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/syscalls.h b/arch/riscv/include/uapi/asm/syscalls.h new file mode 100644 index 000000000000..818655b0d535 --- /dev/null +++ b/arch/riscv/include/uapi/asm/syscalls.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 SiFive + */ + +#ifndef _ASM__UAPI__SYSCALLS_H +#define _ASM__UAPI__SYSCALLS_H + +/* + * Allows the instruction cache to be flushed from userspace. Despite RISC-V + * having a direct 'fence.i' instruction available to userspace (which we + * can't trap!), that's not actually viable when running on Linux because the + * kernel might schedule a process on another hart. There is no way for + * userspace to handle this without invoking the kernel (as it doesn't know the + * thread->hart mappings), so we've defined a RISC-V specific system call to + * flush the instruction cache. + * + * __NR_riscv_flush_icache is defined to flush the instruction cache over an + * address range, with the flush applying to either all threads or just the + * caller. We don't currently do anything with the address range, that's just + * in there for forwards compatibility. + */ +#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) +__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) + +#endif diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index a5bd6401f95e..ade52b903a43 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -23,5 +23,4 @@ void *sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = sys_ni_syscall, #include -#include }; diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S index b0fbad74e873..023e4d4aef58 100644 --- a/arch/riscv/kernel/vdso/flush_icache.S +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -13,7 +13,6 @@ #include #include -#include .text /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ -- cgit v1.2.3 From c163fb38ca34694b0cce99bb5604257bc29bf200 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:02 +0100 Subject: riscv: remove CONFIG_MMU ifdefs The RISC-V port doesn't suport a nommu mode, so there is no reason to provide some code only under a CONFIG_MMU ifdef. Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/io.h | 4 ---- arch/riscv/include/asm/pgtable.h | 4 ---- arch/riscv/include/asm/tlbflush.h | 4 ---- arch/riscv/include/asm/uaccess.h | 12 ------------ 4 files changed, 24 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index a82ce599b639..b269451e7e85 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -21,8 +21,6 @@ #include -#ifdef CONFIG_MMU - extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); /* @@ -36,8 +34,6 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); extern void iounmap(volatile void __iomem *addr); -#endif /* CONFIG_MMU */ - /* Generic IO read/write. These perform native-endian accesses. */ #define __raw_writeb __raw_writeb static inline void __raw_writeb(u8 val, volatile void __iomem *addr) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2cbd92ed1629..16301966d65b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -20,8 +20,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_MMU - /* Page Upper Directory not used in RISC-V */ #include #include @@ -413,8 +411,6 @@ static inline void pgtable_cache_init(void) /* No page table caches to initialize */ } -#endif /* CONFIG_MMU */ - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 715b0f10af58..7b9c24ebdf52 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -15,8 +15,6 @@ #ifndef _ASM_RISCV_TLBFLUSH_H #define _ASM_RISCV_TLBFLUSH_H -#ifdef CONFIG_MMU - #include /* @@ -64,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -#endif /* CONFIG_MMU */ - #endif /* _ASM_RISCV_TLBFLUSH_H */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 27b90d64814b..14b0b22fb578 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state); * call. */ -#ifdef CONFIG_MMU #define __get_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -153,13 +152,11 @@ do { \ __disable_user_access(); \ (x) = __x; \ } while (0) -#endif /* CONFIG_MMU */ #ifdef CONFIG_64BIT #define __get_user_8(x, ptr, err) \ __get_user_asm("ld", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __get_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -193,7 +190,6 @@ do { \ (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -267,8 +263,6 @@ do { \ ((x) = 0, -EFAULT); \ }) - -#ifdef CONFIG_MMU #define __put_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -292,14 +286,11 @@ do { \ : "rJ" (__x), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ - #ifdef CONFIG_64BIT #define __put_user_8(x, ptr, err) \ __put_user_asm("sd", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __put_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -329,7 +320,6 @@ do { \ : "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) * will set "err" to -EFAULT, while successful accesses return the previous * value. */ -#ifdef CONFIG_MMU #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ @@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) (err) = __err; \ __ret; \ }) -#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_UACCESS_H */ -- cgit v1.2.3 From 1125203c13b9da32125e171b4bd75e93d4918ddd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:03 +0100 Subject: riscv: rename SR_* constants to match the spec Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/csr.h | 8 ++++---- arch/riscv/include/asm/irqflags.h | 10 +++++----- arch/riscv/include/asm/ptrace.h | 2 +- arch/riscv/kernel/entry.S | 8 ++++---- arch/riscv/kernel/process.c | 4 ++-- arch/riscv/mm/fault.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0d64bc9f4f91..3c7a2c97e377 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -17,10 +17,10 @@ #include /* Status register flags */ -#define SR_IE _AC(0x00000002, UL) /* Interrupt Enable */ -#define SR_PIE _AC(0x00000020, UL) /* Previous IE */ -#define SR_PS _AC(0x00000100, UL) /* Previously Supervisor */ -#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ +#define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ #define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ #define SR_FS_OFF _AC(0x00000000, UL) diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index 6fdc860d7f84..07a3c6d5706f 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -27,25 +27,25 @@ static inline unsigned long arch_local_save_flags(void) /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { - csr_set(sstatus, SR_IE); + csr_set(sstatus, SR_SIE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { - csr_clear(sstatus, SR_IE); + csr_clear(sstatus, SR_SIE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { - return csr_read_clear(sstatus, SR_IE); + return csr_read_clear(sstatus, SR_SIE); } /* test flags */ static inline int arch_irqs_disabled_flags(unsigned long flags) { - return !(flags & SR_IE); + return !(flags & SR_SIE); } /* test hardware interrupt enable bit */ @@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void) /* set interrupt enabled status */ static inline void arch_local_irq_restore(unsigned long flags) { - csr_set(sstatus, flags & SR_IE); + csr_set(sstatus, flags & SR_SIE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 93b8956e25e4..2c5df945d43c 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -66,7 +66,7 @@ struct pt_regs { #define REG_FMT "%08lx" #endif -#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0) +#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0) /* Helpers for working with the instruction pointer */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 20ee86f782a9..7404ec222406 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -196,7 +196,7 @@ handle_syscall: addi s2, s2, 0x4 REG_S s2, PT_SEPC(sp) /* System calls run with interrupts enabled */ - csrs sstatus, SR_IE + csrs sstatus, SR_SIE /* Trace syscalls, but only if requested by the user. */ REG_L t0, TASK_TI_FLAGS(tp) andi t0, t0, _TIF_SYSCALL_TRACE @@ -224,8 +224,8 @@ ret_from_syscall: ret_from_exception: REG_L s0, PT_SSTATUS(sp) - csrc sstatus, SR_IE - andi s0, s0, SR_PS + csrc sstatus, SR_SIE + andi s0, s0, SR_SPP bnez s0, restore_all resume_userspace: @@ -255,7 +255,7 @@ work_pending: bnez s1, work_resched work_notifysig: /* Handle pending signals and notify-resume requests */ - csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */ + csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */ move a0, sp /* pt_regs */ move a1, s0 /* current_thread_info->flags */ tail do_notify_resume diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 0d90dcc1fbd3..d74d4adf2d54 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs) void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL; + regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL; regs->sepc = pc; regs->sp = sp; set_fs(USER_DS); @@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, const register unsigned long gp __asm__ ("gp"); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp; - childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */ + childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */ p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = usp; /* fn */ diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index df2ca3c65048..0713f3c67ab4 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto vmalloc_fault; /* Enable interrupts if they were enabled in the parent context. */ - if (likely(regs->sstatus & SR_PIE)) + if (likely(regs->sstatus & SR_SPIE)) local_irq_enable(); /* -- cgit v1.2.3 From e2d5915293ffdff977ddcfc12b817b08c53ffa7a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 8 Jan 2018 14:54:32 +1100 Subject: powerpc/pseries: Make RAS IRQ explicitly dependent on DLPAR WQ The hotplug code uses its own workqueue to handle IRQ requests (pseries_hp_wq), however that workqueue is initialized after init_ras_IRQ(). That can lead to a kernel panic if any hotplug interrupts fire after init_ras_IRQ() but before pseries_hp_wq is initialised. eg: UDP-Lite hash table entries: 2048 (order: 0, 65536 bytes) NET: Registered protocol family 1 Unpacking initramfs... (qemu) object_add memory-backend-ram,id=mem1,size=10G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Unable to handle kernel paging request for data at address 0xf94d03007c421378 Faulting instruction address: 0xc00000000012d744 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc2-ziviani+ #26 task: (ptrval) task.stack: (ptrval) NIP: c00000000012d744 LR: c00000000012d744 CTR: 0000000000000000 REGS: (ptrval) TRAP: 0380 Not tainted (4.15.0-rc2-ziviani+) MSR: 8000000000009033 CR: 28088042 XER: 20040000 CFAR: c00000000012d3c4 SOFTE: 0 ... NIP [c00000000012d744] __queue_work+0xd4/0x5c0 LR [c00000000012d744] __queue_work+0xd4/0x5c0 Call Trace: [c0000000fffefb90] [c00000000012d744] __queue_work+0xd4/0x5c0 (unreliable) [c0000000fffefc70] [c00000000012dce4] queue_work_on+0xb4/0xf0 This commit makes the RAS IRQ registration explicitly dependent on the creation of the pseries_hp_wq. Reported-by: Min Deng Reported-by: Daniel Henrique Barboza Tested-by: Jose Ricardo Ziviani Signed-off-by: Michael Ellerman Reviewed-by: David Gibson --- arch/powerpc/platforms/pseries/dlpar.c | 21 ++++++++++++++++++--- arch/powerpc/platforms/pseries/pseries.h | 2 ++ arch/powerpc/platforms/pseries/ras.c | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 6e35780c5962..a0b20c03f078 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, static CLASS_ATTR_RW(dlpar); -static int __init pseries_dlpar_init(void) +int __init dlpar_workqueue_init(void) { + if (pseries_hp_wq) + return 0; + pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue", - WQ_UNBOUND, 1); + WQ_UNBOUND, 1); + + return pseries_hp_wq ? 0 : -ENOMEM; +} + +static int __init dlpar_sysfs_init(void) +{ + int rc; + + rc = dlpar_workqueue_init(); + if (rc) + return rc; + return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr); } -machine_device_initcall(pseries, pseries_dlpar_init); +machine_device_initcall(pseries, dlpar_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 4470a3194311..1ae1d9f4dbe9 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void) return CMO_PageSize; } +int dlpar_workqueue_init(void); + #endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 4923ffe230cf..81d8614e7379 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void) /* Hotplug Events */ np = of_find_node_by_path("/event-sources/hot-plug-events"); if (np != NULL) { - request_event_sources_irqs(np, ras_hotplug_interrupt, + if (dlpar_workqueue_init() == 0) + request_event_sources_irqs(np, ras_hotplug_interrupt, "RAS_HOTPLUG"); of_node_put(np); } -- cgit v1.2.3 From 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:01 +0100 Subject: x86/cpu: Implement CPU vulnerabilites sysfs functions Implement the CPU vulnerabilty show functions for meltdown, spectre_v1 and spectre_v2. Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd5199de231e..e23d21ac745a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,6 +89,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..76ad6cb44b40 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ */ #include #include +#include #include #include #include @@ -60,3 +61,31 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} +#endif -- cgit v1.2.3 From 7066d746d1aa63c2e5d656eeb5c4b0bfad334c7a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 8 Jan 2018 10:39:35 +0000 Subject: cris: Make THREAD_SIZE available to vmlinux.lds Make THREAD_SIZE available to vmlinux.lds on cris by moving it to asm/thread_info.h and including that from the linker script. This allows init_stack to be allocated in the linker script in a subsequent patch. Reported-by: Guenter Roeck Signed-off-by: David Howells Tested-by: Guenter Roeck cc: Mikael Starvik cc: Jesper Nilsson cc: linux-cris-kernel@axis.com --- arch/cris/include/asm/processor.h | 7 ------- arch/cris/include/asm/thread_info.h | 7 +++++++ arch/cris/kernel/vmlinux.lds.S | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h index 124dd5ec7f65..b50907799cb2 100644 --- a/arch/cris/include/asm/processor.h +++ b/arch/cris/include/asm/processor.h @@ -26,13 +26,6 @@ struct task_struct; */ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) -/* THREAD_SIZE is the size of the thread_info/kernel_stack combo. - * normally, the stack is found by doing something like p + THREAD_SIZE - * in CRIS, a page is 8192 bytes, which seems like a sane size - */ -#define THREAD_SIZE PAGE_SIZE -#define THREAD_SIZE_ORDER (0) - /* * At user->kernel entry, the pt_regs struct is stacked on the top of the kernel-stack. * This macro allows us to find those regs for a task. diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 472830c90997..108f77081a3c 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -20,6 +20,13 @@ #endif +/* THREAD_SIZE is the size of the thread_info/kernel_stack combo. + * normally, the stack is found by doing something like p + THREAD_SIZE + * in CRIS, a page is 8192 bytes, which seems like a sane size + */ +#define THREAD_SIZE PAGE_SIZE +#define THREAD_SIZE_ORDER (0) + /* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 6d1dbc1ba767..9b232e0f673e 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -11,6 +11,7 @@ #include #include +#include #ifdef CONFIG_ETRAX_VMEM_SIZE #define __CONFIG_ETRAX_VMEM_SIZE CONFIG_ETRAX_VMEM_SIZE -- cgit v1.2.3 From 0dd6d272d39c7c1fe2f4253197b505f2b66538ee Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 23 Dec 2017 21:50:13 -0500 Subject: x86/xen/time: fix section mismatch for xen_init_time_ops() The header declares this function as __init but is defined in __ref section. Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/xen-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f96dbedb33d4..1a7a9469e5a7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -71,7 +71,7 @@ u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); void xen_save_time_memory_area(void); void xen_restore_time_memory_area(void); -void __init xen_init_time_ops(void); +void __ref xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.2.3 From 66a640e7823da803fdb68d5d88f7a8fbd11c29e6 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 6 Jan 2018 13:39:48 -0800 Subject: x86: xen: remove the use of VLAIS Variable Length Arrays In Structs (VLAIS) is not supported by Clang, and frowned upon by others. https://lkml.org/lkml/2013/9/23/500 Here, the VLAIS was used because the size of the bitmap returned from xen_mc_entry() depended on possibly (based on kernel configuration) runtime sized data. Rather than declaring args as a VLAIS then calling sizeof on *args, we calculate the appropriate sizeof args manually. Further, we can get rid of the #ifdef's and rely on num_possible_cpus() (thanks to a helpful checkpatch warning from an earlier version of this patch). Suggested-by: Juergen Gross Signed-off-by: Nick Desaulniers Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- arch/x86/xen/mmu_pv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 7118f776cd49..aa701d2a5023 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1339,20 +1339,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else DECLARE_BITMAP(mask, NR_CPUS); -#endif } *args; struct multicall_space mcs; + const size_t mc_entry_size = sizeof(args->op) + + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ - mcs = xen_mc_entry(sizeof(*args)); + mcs = xen_mc_entry(mc_entry_size); args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); -- cgit v1.2.3 From 262b6b30087246abf09d6275eb0c0dc421bcbe38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 6 Jan 2018 18:41:14 +0100 Subject: x86/tboot: Unbreak tboot with PTI enabled This is another case similar to what EFI does: create a new set of page tables, map some code at a low address, and jump to it. PTI mistakes this low address for userspace and mistakenly marks it non-executable in an effort to make it unusable for userspace. Undo the poison to allow execution. Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Signed-off-by: Dave Hansen Signed-off-by: Andrea Arcangeli Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Tim Chen Cc: Jon Masters Cc: Dave Hansen Cc: Andi Kleen Cc: Jeff Law Cc: Paolo Bonzini Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David" Cc: Nick Clifton Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108102805.GK25546@redhat.com --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..75869a4b6c41 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,6 +127,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; + pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; -- cgit v1.2.3 From 8d56eff266f3e41a6c39926269c4c3f58f881a8e Mon Sep 17 00:00:00 2001 From: Jike Song Date: Tue, 9 Jan 2018 00:03:41 +0800 Subject: x86/mm/pti: Remove dead logic in pti_user_pagetable_walk*() The following code contains dead logic: 162 if (pgd_none(*pgd)) { 163 unsigned long new_p4d_page = __get_free_page(gfp); 164 if (!new_p4d_page) 165 return NULL; 166 167 if (pgd_none(*pgd)) { 168 set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); 169 new_p4d_page = 0; 170 } 171 if (new_p4d_page) 172 free_page(new_p4d_page); 173 } There can't be any difference between two pgd_none(*pgd) at L162 and L167, so it's always false at L171. Dave Hansen explained: Yes, the double-test was part of an optimization where we attempted to avoid using a global spinlock in the fork() path. We would check for unallocated mid-level page tables without the lock. The lock was only taken when we needed to *make* an entry to avoid collisions. Now that it is all single-threaded, there is no chance of a collision, no need for a lock, and no need for the re-check. As all these functions are only called during init, mark them __init as well. Fixes: 03f4424f348e ("x86/mm/pti: Add functions to clone kernel PMDs") Signed-off-by: Jike Song Signed-off-by: Thomas Gleixner Cc: Alan Cox Cc: Andi Kleen Cc: Tom Lendacky Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Koshina Cc: Dave Hansen Cc: Borislav Petkov Cc: Kees Cook Cc: Andi Lutomirski Cc: Linus Torvalds Cc: Greg KH Cc: David Woodhouse Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180108160341.3461-1-albcamus@gmail.com --- arch/x86/mm/pti.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 43d4a4a29037..ce38f165489b 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) * * Returns a pointer to a P4D on success, or NULL on failure. */ -static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) { pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); @@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) if (!new_p4d_page) return NULL; - if (pgd_none(*pgd)) { - set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); - new_p4d_page = 0; - } - if (new_p4d_page) - free_page(new_p4d_page); + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); } BUILD_BUG_ON(pgd_large(*pgd) != 0); @@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) * * Returns a pointer to a PMD on success, or NULL on failure. */ -static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) { gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); p4d_t *p4d = pti_user_pagetable_walk_p4d(address); @@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pud_page) return NULL; - if (p4d_none(*p4d)) { - set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); - new_pud_page = 0; - } - if (new_pud_page) - free_page(new_pud_page); + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); } pud = pud_offset(p4d, address); @@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) if (!new_pmd_page) return NULL; - if (pud_none(*pud)) { - set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - new_pmd_page = 0; - } - if (new_pmd_page) - free_page(new_pmd_page); + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); } return pmd_offset(pud, address); @@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) if (!new_pte_page) return NULL; - if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - new_pte_page = 0; - } - if (new_pte_page) - free_page(new_pte_page); + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); } pte = pte_offset_kernel(pmd, address); -- cgit v1.2.3 From 9d0513d82f1a8fe17b41f113ac5922fa57dbaf5c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 28 Dec 2017 14:25:23 +0200 Subject: x86/platform/intel-mid: Revert "Make 'bt_sfi_data' const" So one of the constification patches unearthed a type casting fragility of the underlying code: 276c87054751 ("x86/platform/intel-mid: Make 'bt_sfi_data' const") converted the struct to be const while it is also used as a temporary container for important data that is used to fill 'parent' and 'name' fields in struct platform_device_info. The compiler doesn't notice this due to an explicit type cast that loses the const - which fragility will be fixed separately. This type cast turned a seemingly trivial const propagation patch into a hard to debug data corruptor and crasher bug. Signed-off-by: Andy Shevchenko Cc: Bhumika Goyal Cc: Darren Hart Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: julia.lawall@lip6.fr Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20171228122523.21802-1-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/intel-mid/device_libs/platform_bt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index dc036e511f48..5a0483e7bf66 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static const struct bt_sfi_data tng_bt_sfi_data __initdata = { +static struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; -- cgit v1.2.3 From f328299e54a94998b31baf788d2b33d8122a4acb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 13:53:03 -0600 Subject: locking/refcounts: Remove stale comment from the ARCH_HAS_REFCOUNT Kconfig entry ARCH_HAS_REFCOUNT is no longer marked as broken ('if BROKEN'), so remove the stale comment regarding it being broken. Signed-off-by: Eric Biggers Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171229195303.17781-1-ebiggers3@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4fc98c50378..ff4e9cd99854 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,7 +55,6 @@ config X86 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 - # Causing hangs/crashes, see the commit that added this change for details. select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY -- cgit v1.2.3 From e4d0e84e490790798691aaa0f2e598637f1867ec Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: x86/cpu/AMD: Make LFENCE a serializing instruction To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ab022618a50a..1e7d710fef43 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -352,6 +352,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bcb75dc97d44..5b438d81beb2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,16 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } -- cgit v1.2.3 From 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC. However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1e7d710fef43..fa11fb1fa570 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -354,6 +354,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5b438d81beb2..ea831c858195 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -829,6 +829,9 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { + unsigned long long val; + int ret; + /* * A serializing LFENCE has less overhead than MFENCE, so * use it for execution serialization. On families which @@ -839,8 +842,19 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* -- cgit v1.2.3 From 191eccb1580939fb0d47deb405b82a85b0379070 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 9 Jan 2018 03:52:05 +1100 Subject: powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags & wrapper A new hypervisor call has been defined to communicate various characteristics of the CPU to guests. Add definitions for the hcall number, flags and a wrapper function. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvcall.h | 17 +++++++++++++++++ arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a409177be8bd..f0461618bf7b 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -241,6 +241,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -330,6 +331,17 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 + /* Flag values used in H_REGISTER_PROC_TBL hcall */ #define PROC_TABLE_OP_MASK 0x18 #define PROC_TABLE_DEREG 0x10 @@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc) } } +struct h_cpu_char_result { + u64 character; + u64 behaviour; +}; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 7f01b22fa6cb..55eddf50d149 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu) return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu); } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + if (rc == H_SUCCESS) { + p->character = retbuf[0]; + p->behaviour = retbuf[1]; + } + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ -- cgit v1.2.3 From 50e51c13b3822d14ff6df4279423e4b7b2269bc3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Add macros for annotating the destination of rfid/hrfid The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is used for switching from the kernel to userspace, and from the hypervisor to the guest kernel. However it can and is also used for other transitions, eg. from real mode kernel code to virtual mode kernel code, and it's not always clear from the code what the destination context is. To make it clearer when reading the code, add macros which encode the expected destination context. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64e.h | 6 ++++++ arch/powerpc/include/asm/exception-64s.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b6..555e22d5e07f 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e: ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index b27205297e1d..1af427a3c74f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,6 +74,35 @@ */ #define EX_R3 EX_DAR +/* Macros for annotating the expected destination of (h)rfid */ + +#define RFI_TO_KERNEL \ + rfid + +#define RFI_TO_USER \ + rfid + +#define RFI_TO_USER_OR_KERNEL \ + rfid + +#define RFI_TO_GUEST \ + rfid + +#define HRFI_TO_KERNEL \ + hrfid + +#define HRFI_TO_USER \ + hrfid + +#define HRFI_TO_USER_OR_KERNEL \ + hrfid + +#define HRFI_TO_GUEST \ + hrfid + +#define HRFI_TO_UNKNOWN \ + hrfid + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ -- cgit v1.2.3 From 222f20f140623ef6033491d0103ee0875fe87d35 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Simple RFI macro conversions This commit does simple conversions of rfi/rfid to the new macros that include the expected destination context. By simple we mean cases where there is a single well known destination context, and it's simply a matter of substituting the instruction for the appropriate macro. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 4 ++-- arch/powerpc/kernel/entry_64.S | 14 +++++++++----- arch/powerpc/kernel/exceptions-64s.S | 24 ++++++++++++------------ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 ++++----- arch/powerpc/kvm/book3s_rmhandlers.S | 7 +++++-- arch/powerpc/kvm/book3s_segment.S | 4 ++-- 6 files changed, 34 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 1af427a3c74f..dfc56daed98b 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -247,7 +247,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ __EXCEPTION_PROLOG_PSERIES_1(label, h) @@ -261,7 +261,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3320bcac7192..e68faa4d1b13 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,11 @@ #include #include #include +#ifdef CONFIG_PPC_BOOK3S +#include +#else +#include +#endif /* * System calls. @@ -397,8 +402,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mtmsrd r10, 1 mtspr SPRN_SRR0, r11 mtspr SPRN_SRR1, r12 - - rfid + RFI_TO_USER b . /* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); @@ -1073,7 +1077,7 @@ __enter_rtas: mtspr SPRN_SRR0,r5 mtspr SPRN_SRR1,r6 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ rtas_return_loc: @@ -1098,7 +1102,7 @@ rtas_return_loc: mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) @@ -1171,7 +1175,7 @@ _GLOBAL(enter_prom) LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 - rfid + RFI_TO_KERNEL #endif /* CONFIG_PPC_BOOK3E */ 1: /* Return from OF */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e441b469dc8f..5502b0147c4e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -256,7 +256,7 @@ BEGIN_FTR_SECTION LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r11 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ 2: /* Stack overflow. Stay on emergency stack and panic. @@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) li r3,MSR_ME andc r10,r10,r3 /* Turn off MSR_ME */ mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 2: /* @@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - rfid + RFI_TO_USER_OR_KERNEL 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP @@ -651,7 +651,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 8: std r3,PACA_EXSLB+EX_DAR(r13) @@ -662,7 +662,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . EXC_COMMON_BEGIN(unrecov_slb) @@ -901,7 +901,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) mtspr SPRN_SRR0,r10 ; \ ld r10,PACAKMSR(r13) ; \ mtspr SPRN_SRR1,r10 ; \ - rfid ; \ + RFI_TO_KERNEL ; \ b . ; /* prevent speculative execution */ #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH @@ -917,7 +917,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ mr r13,r9 ; \ - rfid ; /* return to userspace */ \ + RFI_TO_USER ; /* return to userspace */ \ b . ; /* prevent speculative execution */ #else #define SYSCALL_FASTENDIAN_TEST @@ -1063,7 +1063,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mtcr r11 REST_GPR(11, r1) ld r1,GPR1(r1) - hrfid + HRFI_TO_USER_OR_KERNEL 1: mtcr r11 REST_GPR(11, r1) @@ -1314,7 +1314,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) ld r13,PACA_EXGEN+EX_R13(r13) - HRFID + HRFI_TO_UNKNOWN b . #endif @@ -1418,7 +1418,7 @@ masked_##_H##interrupt: \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ /* returns to kernel where r13 must be set up, so don't restore it */ \ - ##_H##rfid; \ + ##_H##RFI_TO_KERNEL; \ b .; \ MASKED_DEC_HANDLER(_H) @@ -1441,7 +1441,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt) addi r13, r13, 4 mtspr SPRN_SRR0, r13 GET_SCRATCH0(r13) - rfid + RFI_TO_KERNEL b . TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) @@ -1453,7 +1453,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) addi r13, r13, 4 mtspr SPRN_HSRR0, r13 GET_SCRATCH0(r13) - hrfid + HRFI_TO_KERNEL b . #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..9c61f736c75b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: BEGIN_FTR_SECTION @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - RFI + RFI_TO_KERNEL /* Virtual-mode return */ .Lvirt_return: @@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - - hrfid + HRFI_TO_GUEST b . secondary_too_late: @@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r4, PACAKMSR(r13) mtspr SPRN_SRR0, r3 mtspr SPRN_SRR1, r4 - rfid + RFI_TO_KERNEL 9: addi r3, r1, STACK_FRAME_OVERHEAD bl kvmppc_bad_interrupt b 9b diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 42a4b237df5f..34a5adeff084 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins: GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline) ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 2a2b96d53999..93a180ceefad 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on: PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: -- cgit v1.2.3 From b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL In the syscall exit path we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e68faa4d1b13..724733b74744 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -267,13 +267,23 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ .Lsyscall_error: -- cgit v1.2.3 From a08f828cf47e6c605af21d2cdec68f84e799c318 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert fast_exception_return to use RFI_TO_USER/KERNEL Similar to the syscall return path, in fast_exception_return we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 724733b74744..2748584b767d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -892,7 +892,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -905,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ + +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 - rfid + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ -- cgit v1.2.3 From c7305645eb0c1621351cfc104038831ae87c0053 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Convert slb_miss_common to use RFI_TO_USER/KERNEL In the SLB miss handler we may be returning to user or kernel. We need to add a check early on and save the result in the cr4 register, and then we bifurcate the return path based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5502b0147c4e..ed356194f09c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + andi. r9,r11,MSR_PR // Check for exception from userspace + cmpdi cr4,r9,MSR_PR // And save the result in CR4 for later + /* * Test MSR_RI before calling slb_allocate_realmode, because the * MSR in r11 gets clobbered. However we still want to allocate @@ -624,9 +627,32 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) /* All done -- return from exception. */ + bne cr4,1f /* returning to kernel */ + +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_USER + b . /* prevent speculative execution */ +1: .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ @@ -640,9 +666,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ + 2: std r3,PACA_EXSLB+EX_DAR(r13) mr r3,r12 mfspr r11,SPRN_SRR0 -- cgit v1.2.3 From 195e2addbce09e5afbc766efc1e6567c9ce840d3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:26 +0300 Subject: SolutionEngine771x: fix Ether platform data The 'sh_eth' driver's probe() method would fail on the SolutionEngine7710 board and crash on SolutionEngine7712 board as the platform code is hopelessly behind the driver's platform data -- it passes the PHY address instead of 'struct sh_eth_plat_data *'; pass the latter to the driver in order to fix the bug... Fixes: 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index 77c35350ee77..b7fa7a87e946 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include @@ -115,6 +116,11 @@ static struct platform_device heartbeat_device = { #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\ defined(CONFIG_CPU_SUBTYPE_SH7712) /* SH771X Ethernet driver */ +static struct sh_eth_plat_data sh_eth_plat = { + .phy = PHY_ID, + .phy_interface = PHY_INTERFACE_MODE_MII, +}; + static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, @@ -132,7 +138,7 @@ static struct platform_device sh_eth0_device = { .name = "sh771x-ether", .id = 0, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth0_resources), .resource = sh_eth0_resources, @@ -155,7 +161,7 @@ static struct platform_device sh_eth1_device = { .name = "sh771x-ether", .id = 1, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth1_resources), .resource = sh_eth1_resources, -- cgit v1.2.3 From f9a531d6731d74f1e24298d9641c2dc1fef2631b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:27 +0300 Subject: SolutionEngine771x: add Ether TSU resource After the Ether platform data is fixed, the driver probe() method would still fail since the 'struct sh_eth_cpu_data' corresponding to SH771x indicates the presence of TSU but the memory resource for it is absent. Add the missing TSU resource to both Ether devices and fix the harmless off-by-one error in the main memory resources, while at it... Fixes: 4986b996882d ("net: sh_eth: remove the SH_TSU_ADDR") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 14 ++++++++++++-- arch/sh/include/mach-se/mach/se.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index b7fa7a87e946..412326d59e6f 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -124,10 +124,15 @@ static struct sh_eth_plat_data sh_eth_plat = { static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, - .end = SH_ETH0_BASE + 0x1B8, + .end = SH_ETH0_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH0_IRQ, .end = SH_ETH0_IRQ, .flags = IORESOURCE_IRQ, @@ -147,10 +152,15 @@ static struct platform_device sh_eth0_device = { static struct resource sh_eth1_resources[] = { [0] = { .start = SH_ETH1_BASE, - .end = SH_ETH1_BASE + 0x1B8, + .end = SH_ETH1_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH1_IRQ, .end = SH_ETH1_IRQ, .flags = IORESOURCE_IRQ, diff --git a/arch/sh/include/mach-se/mach/se.h b/arch/sh/include/mach-se/mach/se.h index 4246ef9b07a3..aa83fe1ff0b1 100644 --- a/arch/sh/include/mach-se/mach/se.h +++ b/arch/sh/include/mach-se/mach/se.h @@ -100,6 +100,7 @@ /* Base address */ #define SH_ETH0_BASE 0xA7000000 #define SH_ETH1_BASE 0xA7000400 +#define SH_TSU_BASE 0xA7000800 /* PHY ID */ #if defined(CONFIG_CPU_SUBTYPE_SH7710) # define PHY_ID 0x00 -- cgit v1.2.3 From 0b04ea6822936757cf7b18713e3d6ad05e97c883 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 8 Jan 2018 10:45:03 +0000 Subject: hexagon: Make THREAD_SIZE available to vmlinux.lds Make THREAD_SIZE available to vmlinux.lds on hexagon by including asm/thread_info.h the linker script. This allows init_stack to be allocated in the linker script in a subsequent patch. Reported-by: Guenter Roeck Signed-off-by: David Howells Reviewed-by: Guenter Roeck Acked-by: Richard Kuo cc: linux-hexagon@vger.kernel.org --- arch/hexagon/kernel/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S index ec87e67feb19..ad69d181c939 100644 --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -22,6 +22,8 @@ #include /* Most of the kernel defines are here */ #include /* except for page_offset */ #include /* and now we're pulling cache line size */ +#include /* and we need THREAD_SIZE too */ + OUTPUT_ARCH(hexagon) ENTRY(stext) -- cgit v1.2.3 From 138101932054268a235c6ac7cf474f6a88fcd885 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 8 Jan 2018 10:49:09 +0000 Subject: openrisc: Make THREAD_SIZE available to vmlinux.lds Make THREAD_SIZE available to vmlinux.lds on openrisc by including asm/thread_info.h the linker script. This allows init_stack to be allocated in the linker script in a subsequent patch. Reported-by: Guenter Roeck Signed-off-by: David Howells Tested-by: Guenter Roeck Acked-by: Stafford Horne cc: Jonas Bonn cc: Stefan Kristiansson cc: openrisc@lists.librecores.org --- arch/openrisc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index 00ddb7804be4..953bdcd54efe 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ #include #include +#include #include #ifdef __OR1K__ -- cgit v1.2.3 From 0500871f21b237b2bea2d9db405eadf78e5aab05 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jan 2018 15:12:01 +0000 Subject: Construct init thread stack in the linker script rather than by union Construct the init thread stack in the linker script rather than doing it by means of a union so that ia64's init_task.c can be got rid of. The following symbols are then made available from INIT_TASK_DATA() linker script macro: init_thread_union init_stack INIT_TASK_DATA() also expands the region to THREAD_SIZE to accommodate the size of the init stack. init_thread_union is given its own section so that it can be placed into the stack space in the right order. I'm assuming that the ia64 ordering is correct and that the task_struct is first and the thread_info second. Signed-off-by: David Howells Tested-by: Tony Luck Tested-by: Will Deacon (arm64) Tested-by: Palmer Dabbelt Acked-by: Thomas Gleixner --- arch/Kconfig | 4 +-- arch/alpha/include/asm/thread_info.h | 3 --- arch/arc/include/asm/thread_info.h | 3 --- arch/arm/include/asm/thread_info.h | 3 --- arch/arm64/include/asm/thread_info.h | 2 -- arch/blackfin/include/asm/thread_info.h | 2 -- arch/c6x/include/asm/thread_info.h | 3 --- arch/cris/include/asm/processor.h | 2 -- arch/cris/include/asm/thread_info.h | 2 -- arch/frv/include/asm/thread_info.h | 3 --- arch/h8300/include/asm/thread_info.h | 3 --- arch/hexagon/include/asm/thread_info.h | 3 --- arch/ia64/Kconfig | 2 +- arch/ia64/Makefile | 2 +- arch/ia64/include/asm/thread_info.h | 4 +-- arch/ia64/kernel/Makefile | 2 +- arch/ia64/kernel/init_task.c | 44 ------------------------------- arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m32r/include/asm/thread_info.h | 3 --- arch/m68k/include/asm/thread_info.h | 4 --- arch/metag/include/asm/thread_info.h | 3 --- arch/microblaze/include/asm/thread_info.h | 3 --- arch/mips/include/asm/thread_info.h | 3 --- arch/mn10300/include/asm/thread_info.h | 2 -- arch/nios2/include/asm/thread_info.h | 3 --- arch/openrisc/include/asm/processor.h | 2 -- arch/openrisc/include/asm/thread_info.h | 2 -- arch/parisc/include/asm/thread_info.h | 3 --- arch/powerpc/include/asm/thread_info.h | 3 --- arch/riscv/include/asm/thread_info.h | 2 -- arch/s390/include/asm/thread_info.h | 2 -- arch/score/include/asm/thread_info.h | 3 --- arch/sh/include/asm/thread_info.h | 3 --- arch/sparc/include/asm/thread_info_32.h | 3 --- arch/sparc/include/asm/thread_info_64.h | 3 --- arch/tile/include/asm/thread_info.h | 3 --- arch/um/include/asm/processor-generic.h | 5 +++- arch/um/include/asm/thread_info.h | 9 +++---- arch/um/include/asm/vmlinux.lds.h | 2 ++ arch/um/kernel/dyn.lds.S | 3 +-- arch/um/kernel/um_arch.c | 2 +- arch/um/kernel/uml.lds.S | 2 +- arch/unicore32/include/asm/thread_info.h | 3 --- arch/x86/include/asm/thread_info.h | 2 -- arch/xtensa/include/asm/thread_info.h | 3 --- include/asm-generic/vmlinux.lds.h | 4 +++ include/linux/init_task.h | 3 +++ include/linux/sched.h | 9 +++++++ init/Makefile | 2 -- init/init_task.c | 10 ++++--- 50 files changed, 42 insertions(+), 155 deletions(-) delete mode 100644 arch/ia64/kernel/init_task.c create mode 100644 arch/um/include/asm/vmlinux.lds.h (limited to 'arch') diff --git a/arch/Kconfig b/arch/Kconfig index 400b9e1b2f27..a26d6f8ab967 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -234,8 +234,8 @@ config ARCH_HAS_FORTIFY_SOURCE config ARCH_HAS_SET_MEMORY bool -# Select if arch init_task initializer is different to init/init_task.c -config ARCH_INIT_TASK +# Select if arch init_task must go in the __init_task_data section +config ARCH_TASK_STRUCT_ON_STACK bool # Select if arch has its private alloc_task_struct() function diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 8c20c5e35432..807d7b9a1860 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -39,9 +39,6 @@ struct thread_info { .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* How to get the thread information struct from C. */ register struct thread_info *__current_thread_info __asm__("$8"); #define current_thread_info() __current_thread_info diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 2d79e527fa50..c85947bac5e5 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -62,9 +62,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - static inline __attribute_const__ struct thread_info *current_thread_info(void) { register unsigned long sp asm("sp"); diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 776757d1604a..e71cc35de163 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -75,9 +75,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* * how to get the current stack pointer in C */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eb431286bacd..740aa03c5f0d 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -51,8 +51,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_stack (init_thread_union.stack) - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 2966b93850a1..a5aeab4e5f2d 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -56,8 +56,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) /* Given a task stack pointer, you can find its corresponding * thread_info structure just by masking it to the THREAD_SIZE diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index acc70c135ab8..59a5697fe0f3 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -60,9 +60,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* get the thread information struct of current task */ static inline __attribute__((const)) struct thread_info *current_thread_info(void) diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h index b50907799cb2..ee4d8b03d048 100644 --- a/arch/cris/include/asm/processor.h +++ b/arch/cris/include/asm/processor.h @@ -52,8 +52,6 @@ static inline void release_thread(struct task_struct *dead_task) /* Nothing needs to be done. */ } -#define init_stack (init_thread_union.stack) - #define cpu_relax() barrier() void default_idle(void); diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 108f77081a3c..996fef3be1d5 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -63,8 +63,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) - #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index ccba3b6ce918..0f950845fad9 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -64,9 +64,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ register struct thread_info *__current_thread_info asm("gr15"); diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 072b92c0d8b5..0cdaa302d3d2 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -46,9 +46,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index b80fe1db7b64..f41f9c6f0e31 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -84,9 +84,6 @@ struct thread_info { .regs = NULL, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* Tacky preprocessor trickery */ #define qqstr(s) qstr(s) #define qstr(s) #s diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 49583c5a5d44..315c51f58811 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -43,7 +43,7 @@ config IA64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD - select ARCH_INIT_TASK + select ARCH_TASK_STRUCT_ON_STACK select ARCH_TASK_STRUCT_ALLOCATOR select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index c100d780f1eb..2dd7f519ad0b 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -42,7 +42,7 @@ $(error Sorry, you need a newer version of the assember, one that is built from endif KBUILD_CFLAGS += $(cflags-y) -head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o +head-y := arch/ia64/kernel/head.o libs-y += arch/ia64/lib/ core-y += arch/ia64/kernel/ arch/ia64/mm/ diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 1d172a4119a7..64a1011f6812 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -12,6 +12,8 @@ #include #include +#define THREAD_SIZE KERNEL_STACK_SIZE + #ifndef __ASSEMBLY__ /* @@ -41,8 +43,6 @@ struct thread_info { #endif }; -#define THREAD_SIZE KERNEL_STACK_SIZE - #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 14ad79f394e5..0b4c65a1af25 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -7,7 +7,7 @@ ifdef CONFIG_DYNAMIC_FTRACE CFLAGS_REMOVE_ftrace.o = -pg endif -extra-y := head.o init_task.o vmlinux.lds +extra-y := head.o vmlinux.lds obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c deleted file mode 100644 index 8df9245e29d9..000000000000 --- a/arch/ia64/kernel/init_task.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This is where we statically allocate and initialize the initial - * task. - * - * Copyright (C) 1999, 2002-2003 Hewlett-Packard Co - * David Mosberger-Tang - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -/* - * Initial task structure. - * - * We need to make sure that this is properly aligned due to the way process stacks are - * handled. This is done by having a special ".data..init_task" section... - */ -#define init_thread_info init_task_mem.s.thread_info -#define init_stack init_task_mem.stack - -union { - struct { - struct task_struct task; - struct thread_info thread_info; - } s; - unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; -} init_task_mem asm ("init_task") __init_task_data = - {{ - .task = INIT_TASK(init_task_mem.s.task), - .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) -}}; - -EXPORT_SYMBOL(init_task); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 58db59da0bd8..b0b2070e0591 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -3,6 +3,7 @@ #include #include #include +#include #include diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index b3a215b0ce0a..ba00f1032587 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -56,9 +56,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 928035591f2e..015f1ca38305 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -41,8 +41,6 @@ struct thread_info { .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_stack (init_thread_union.stack) - #ifndef __ASSEMBLY__ /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) @@ -58,8 +56,6 @@ static inline struct thread_info *current_thread_info(void) } #endif -#define init_thread_info (init_thread_union.thread_info) - /* entry.S relies on these definitions! * bits 0-7 are tested at every exception exit * bits 8-15 are also tested at syscall exit diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 554f73a77e6e..a1a9c7f5ca8c 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -74,9 +74,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the current stack pointer from C */ register unsigned long current_stack_pointer asm("A0StP") __used; diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index e7e8954e9815..9afe4b5bd6c8 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -86,9 +86,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 5e8927f99a76..4993db40482c 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -49,9 +49,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* How to get the thread information struct from C. */ register struct thread_info *__current_thread_info __asm__("$28"); diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index f5f90bbf019d..1748a7b25bf8 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -79,8 +79,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) #define init_uregs \ ((struct pt_regs *) \ ((unsigned long) init_stack + THREAD_SIZE - sizeof(struct pt_regs))) diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index d69c338bd19c..7349a4fa635b 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -63,9 +63,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 396d8f306c21..af31a9fe736a 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -84,8 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); -#define init_stack (init_thread_union.stack) - #define cpu_relax() barrier() #endif /* __ASSEMBLY__ */ diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index c229aa6bb502..5c15dfa2fd4f 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -79,8 +79,6 @@ struct thread_info { .ksp = 0, \ } -#define init_thread_info (init_thread_union.thread_info) - /* how to get the thread information struct from C */ register struct thread_info *current_thread_info_reg asm("r10"); #define current_thread_info() (current_thread_info_reg) diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 598c8d60fa5e..285757544cca 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -25,9 +25,6 @@ struct thread_info { .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *)mfctl(30)) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index a264c3ad366b..4a12c00f8de3 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -58,9 +58,6 @@ struct thread_info { .flags = 0, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - #define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) /* how to get the thread information struct from C */ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index 22c3536ed281..f8fa1cd2dad9 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -64,8 +64,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_stack (init_thread_union.stack) - #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 0880a37b6d3b..25d6ec3aaddd 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -42,8 +42,6 @@ struct thread_info { .flags = 0, \ } -#define init_stack (init_thread_union.stack) - void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index ad51b56e51bd..bc4c7c90550f 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -58,9 +58,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* How to get the thread information struct from C. */ register struct thread_info *__current_thread_info __asm__("r28"); #define current_thread_info() __current_thread_info diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index becb798f1b04..cf5c792bf70b 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -63,9 +63,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the current stack pointer from C */ register unsigned long current_stack_pointer asm("r15") __used; diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index febaaeb1a0fe..548b366165dd 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -63,9 +63,6 @@ struct thread_info { .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ register struct thread_info *current_thread_info_reg asm("g6"); #define current_thread_info() (current_thread_info_reg) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index caf915321ba9..f7e7b0baec9f 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -120,9 +120,6 @@ struct thread_info { .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ register struct thread_info *current_thread_info_reg asm("g6"); #define current_thread_info() (current_thread_info_reg) diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index b7659b8f1117..2adcacd85749 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -59,9 +59,6 @@ struct thread_info { .align_ctl = 0, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - #endif /* !__ASSEMBLY__ */ #if PAGE_SIZE < 8192 diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 86942a492454..b58b746d3f2c 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -58,7 +58,10 @@ static inline void release_thread(struct task_struct *task) { } -#define init_stack (init_thread_union.stack) +static inline void mm_copy_segments(struct mm_struct *from_mm, + struct mm_struct *new_mm) +{ +} /* * User space process size: 3GB (default). diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 9300f7630d2a..4eecd960ee8c 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -6,6 +6,9 @@ #ifndef __UM_THREAD_INFO_H #define __UM_THREAD_INFO_H +#define THREAD_SIZE_ORDER CONFIG_KERNEL_STACK_ORDER +#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) + #ifndef __ASSEMBLY__ #include @@ -37,10 +40,6 @@ struct thread_info { .real_thread = NULL, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - -#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { @@ -53,8 +52,6 @@ static inline struct thread_info *current_thread_info(void) return ti; } -#define THREAD_SIZE_ORDER CONFIG_KERNEL_STACK_ORDER - #endif #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ diff --git a/arch/um/include/asm/vmlinux.lds.h b/arch/um/include/asm/vmlinux.lds.h new file mode 100644 index 000000000000..149494ae78ea --- /dev/null +++ b/arch/um/include/asm/vmlinux.lds.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index d417e3899700..5568cf882371 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -1,5 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include +#include #include OUTPUT_FORMAT(ELF_FORMAT) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index f433690b9b37..a818ccef30ca 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -54,7 +54,7 @@ struct cpuinfo_um boot_cpu_data = { union thread_union cpu0_irqstack __attribute__((__section__(".data..init_irqstack"))) = - { INIT_THREAD_INFO(init_task) }; + { .thread_info = INIT_THREAD_INFO(init_task) }; /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 3d6ed6ba5b78..36b07ec09742 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include +#include #include OUTPUT_FORMAT(ELF_FORMAT) diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h index e79ad6d5b5b2..5fb728f3b49a 100644 --- a/arch/unicore32/include/asm/thread_info.h +++ b/arch/unicore32/include/asm/thread_info.h @@ -87,9 +87,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* * how to get the thread information struct from C */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 00223333821a..d25a638a2720 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -62,8 +62,6 @@ struct thread_info { .flags = 0, \ } -#define init_stack (init_thread_union.stack) - #else /* !__ASSEMBLY__ */ #include diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 7be2400f745a..2ccd37510aaa 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -77,9 +77,6 @@ struct thread_info { .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) -#define init_stack (init_thread_union.stack) - /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ee8b707d9fa9..a564b83bf013 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -268,7 +268,11 @@ #define INIT_TASK_DATA(align) \ . = ALIGN(align); \ VMLINUX_SYMBOL(__start_init_task) = .; \ + VMLINUX_SYMBOL(init_thread_union) = .; \ + VMLINUX_SYMBOL(init_stack) = .; \ *(.data..init_task) \ + *(.data..init_thread_info) \ + . = VMLINUX_SYMBOL(__start_init_task) + THREAD_SIZE; \ VMLINUX_SYMBOL(__end_init_task) = .; /* diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6a532629c983..30a89b99a5af 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -304,5 +304,8 @@ extern struct cred init_cred; /* Attach to the init_task data structure for proper alignment */ #define __init_task_data __attribute__((__section__(".data..init_task"))) +/* Attach to the thread_info data structure for proper alignment */ +#define __init_thread_info __attribute__((__section__(".data..init_thread_info"))) + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index d2588263a989..68a504f6e474 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1446,12 +1446,21 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK + struct task_struct task; +#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; +#ifndef CONFIG_THREAD_INFO_IN_TASK +extern struct thread_info init_thread_info; +#endif + +extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; + #ifdef CONFIG_THREAD_INFO_IN_TASK static inline struct thread_info *task_thread_info(struct task_struct *task) { diff --git a/init/Makefile b/init/Makefile index 1dbb23787290..a3e5ce2bcf08 100644 --- a/init/Makefile +++ b/init/Makefile @@ -13,9 +13,7 @@ obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o endif obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o -ifneq ($(CONFIG_ARCH_INIT_TASK),y) obj-y += init_task.o -endif mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o diff --git a/init/init_task.c b/init/init_task.c index 9325fee7dc82..2285aa42cbe1 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -17,15 +17,17 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); /* Initial task structure */ -struct task_struct init_task = INIT_TASK(init_task); +struct task_struct init_task +#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK + __init_task_data +#endif + = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); /* * Initial thread structure. Alignment of this is handled by a special * linker map entry. */ -union thread_union init_thread_union __init_task_data = { #ifndef CONFIG_THREAD_INFO_IN_TASK - INIT_THREAD_INFO(init_task) +struct thread_info init_thread_info __init_thread_info = INIT_THREAD_INFO(init_task); #endif -}; -- cgit v1.2.3 From c9619bb293c9ab758ba298f039cf4b820dd8436f Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Mon, 8 Jan 2018 10:04:50 +0800 Subject: ARM: dts: imx6ul: add 696MHz operating point Add 696MHz operating point according to datasheet (Rev. 0, 12/2015). Signed-off-by: Anson Huang Reviewed-by: Fabio Estevam Acked-by: Dong Aisheng Signed-off-by: Rafael J. Wysocki --- arch/arm/boot/dts/imx6ul.dtsi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6ul.dtsi b/arch/arm/boot/dts/imx6ul.dtsi index d5181f85ca9c..963e1698fe1d 100644 --- a/arch/arm/boot/dts/imx6ul.dtsi +++ b/arch/arm/boot/dts/imx6ul.dtsi @@ -68,12 +68,14 @@ clock-latency = <61036>; /* two CLK32 periods */ operating-points = < /* kHz uV */ + 696000 1275000 528000 1175000 396000 1025000 198000 950000 >; fsl,soc-operating-points = < /* KHz uV */ + 696000 1275000 528000 1175000 396000 1175000 198000 1175000 -- cgit v1.2.3 From 6c7d47c33ed323f14f2a3b8de925e831dbaa4e69 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 22 Nov 2017 14:42:21 +1100 Subject: KVM: PPC: Book3S PR: Fix WIMG handling under pHyp Commit 96df226 ("KVM: PPC: Book3S PR: Preserve storage control bits") added code to preserve WIMG bits but it missed 2 special cases: - a magic page in kvmppc_mmu_book3s_64_xlate() and - guest real mode in kvmppc_handle_pagefault(). For these ptes, WIMG was 0 and pHyp failed on these causing a guest to stop in the very beginning at NIP=0x100 (due to bd9166ffe "KVM: PPC: Book3S PR: Exit KVM on failed mapping"). According to LoPAPR v1.1 14.5.4.1.2 H_ENTER: The hypervisor checks that the WIMG bits within the PTE are appropriate for the physical page number else H_Parameter return. (For System Memory pages WIMG=0010, or, 1110 if the SAO option is enabled, and for IO pages WIMG=01**.) This hence initializes WIMG to non-zero value HPTE_R_M (0x10), as expected by pHyp. [paulus@ozlabs.org - fix compile for 32-bit] Cc: stable@vger.kernel.org # v4.11+ Fixes: 96df226 "KVM: PPC: Book3S PR: Preserve storage control bits" Signed-off-by: Alexey Kardashevskiy Tested-by: Ruediger Oertel Reviewed-by: Greg Kurz Tested-by: Greg Kurz Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu.c | 1 + arch/powerpc/kvm/book3s_pr.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 29ebe2fd5867..a93d719edc90 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = true; gpte->may_write = true; gpte->page_size = MMU_PAGE_4K; + gpte->wimg = HPTE_R_M; return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d0dc8624198f..7deaeeb14b93 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define HPTE_R_M _PAGE_COHERENT #endif static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) @@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.eaddr = eaddr; pte.vpage = eaddr >> 12; pte.page_size = MMU_PAGE_64K; + pte.wimg = HPTE_R_M; } switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { -- cgit v1.2.3 From ecba8297aafd50db6ae867e90844eead1611ef1c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 10 Jan 2018 17:04:39 +1100 Subject: KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() The KVM_PPC_ALLOCATE_HTAB ioctl(), implemented by kvmppc_alloc_reset_hpt() is supposed to completely clear and reset a guest's Hashed Page Table (HPT) allocating or re-allocating it if necessary. In the case where an HPT of the right size already exists and it just zeroes it, it forces a TLB flush on all guest CPUs, to remove any stale TLB entries loaded from the old HPT. However, that situation can arise when the HPT is resizing as well - or even when switching from an RPT to HPT - so those cases need a TLB flush as well. So, move the TLB flush to trigger in all cases except for errors. Cc: stable@vger.kernel.org # v4.10+ Fixes: f98a8bf9ee20 ("KVM: PPC: Book3S HV: Allow KVM_PPC_ALLOCATE_HTAB ioctl() to change HPT size") Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8355398f0bb6..b73dbc9e797d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -165,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); - /* Ensure that each vcpu will flush its TLB on next entry. */ - cpumask_setall(&kvm->arch.need_tlb_flush); err = 0; goto out; } @@ -182,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) kvmppc_set_hpt(kvm, &info); out: + if (err == 0) + /* Ensure that each vcpu will flush its TLB on next entry. */ + cpumask_setall(&kvm->arch.need_tlb_flush); + mutex_unlock(&kvm->lock); return err; } -- cgit v1.2.3 From aa8a5e0062ac940f7659394f4817c948dc8c0667 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Add support for RFI flush of L1-D cache On some CPUs we can prevent the Meltdown vulnerability by flushing the L1-D cache on exit from kernel to user mode, and from hypervisor to guest. This is known to be the case on at least Power7, Power8 and Power9. At this time we do not know the status of the vulnerability on other CPUs such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale CPUs. As more information comes to light we can enable this, or other mechanisms on those CPUs. The vulnerability occurs when the load of an architecturally inaccessible memory region (eg. userspace load of kernel memory) is speculatively executed to the point where its result can influence the address of a subsequent speculatively executed load. In order for that to happen, the first load must hit in the L1, because before the load is sent to the L2 the permission check is performed. Therefore if no kernel addresses hit in the L1 the vulnerability can not occur. We can ensure that is the case by flushing the L1 whenever we return to userspace. Similarly for hypervisor vs guest. In order to flush the L1-D cache on exit, we add a section of nops at each (h)rfi location that returns to a lower privileged context, and patch that with some sequence. Newer firmwares are able to advertise to us that there is a special nop instruction that flushes the L1-D. If we do not see that advertised, we fall back to doing a displacement flush in software. For guest kernels we support migration between some CPU versions, and different CPUs may use different flush instructions. So that we are prepared to migrate to a machine with a different flush instruction activated, we may have to patch more than one flush instruction at boot if the hypervisor tells us to. In the end this patch is mostly the work of Nicholas Piggin and Michael Ellerman. However a cast of thousands contributed to analysis of the issue, earlier versions of the patch, back ports testing etc. Many thanks to all of them. Tested-by: Jon Masters Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 40 ++++++++++++--- arch/powerpc/include/asm/feature-fixups.h | 13 +++++ arch/powerpc/include/asm/paca.h | 10 ++++ arch/powerpc/include/asm/setup.h | 13 +++++ arch/powerpc/kernel/asm-offsets.c | 5 ++ arch/powerpc/kernel/exceptions-64s.S | 84 +++++++++++++++++++++++++++++++ arch/powerpc/kernel/setup_64.c | 79 +++++++++++++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 ++++ arch/powerpc/lib/feature-fixups.c | 41 +++++++++++++++ 9 files changed, 286 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index dfc56daed98b..7197b179c1b1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -74,34 +74,58 @@ */ #define EX_R3 EX_DAR -/* Macros for annotating the expected destination of (h)rfid */ +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when returning to userspace or a guest. + */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop #define RFI_TO_KERNEL \ rfid #define RFI_TO_USER \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_USER_OR_KERNEL \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_GUEST \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define HRFI_TO_KERNEL \ hrfid #define HRFI_TO_USER \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_USER_OR_KERNEL \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_GUEST \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_UNKNOWN \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 8f88f771cc55..1e82eb3caabd 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -187,7 +187,20 @@ label##3: \ FTR_ENTRY_OFFSET label##1b-label##3b; \ .popsection; +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + #ifndef __ASSEMBLY__ +#include + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + void apply_feature_fixups(void); void setup_feature_keys(void); #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 3892db93b837..23ac7fc0af23 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -232,6 +232,16 @@ struct paca_struct { struct sibling_subcore_state *sibling_subcore_state; #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[EX_SIZE] __aligned(0x80); + void *rfi_flush_fallback_area; + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index cf00ec26303a..469b7fdc9be4 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -39,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {} static inline void pseries_little_endian_exceptions(void) {} #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..f390d57cf2e1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -237,6 +237,11 @@ int main(void) OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp); OFFSET(PACA_IN_MCE, paca_struct, in_mce); OFFSET(PACA_IN_NMI, paca_struct, in_nmi); + OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area); + OFFSET(PACA_EXRFI, paca_struct, exrfi); + OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence); + OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets); + #endif OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ed356194f09c..2dc10bf646b8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1449,6 +1449,90 @@ masked_##_H##interrupt: \ b .; \ MASKED_DEC_HANDLER(_H) +TRAMP_REAL_BEGIN(rfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + +TRAMP_REAL_BEGIN(hrfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Real mode exceptions actually use this too, but alternate * instruction code patches (which end up in the common .text area) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 8956a9856604..96163f4c3673 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -801,3 +801,82 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +bool rfi_flush; + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + l1d_size = ppc64_caches.l1d.size; + limit = min(safe_stack_limit(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, but it + * will be treated as 8-way and may not evict the lines as + * effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: Using fallback displacement flush\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0494e1566ee2..307843d23682 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -132,6 +132,15 @@ SECTIONS /* Read-only data */ RO_DATA(PAGE_SIZE) +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 41cf5ae273cf..a95ea007d654 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; -- cgit v1.2.3 From bc9c9304a45480797e13a8e1df96ffcf44fb62fe Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Support disabling RFI flush with no_rfi_flush and nopti Because there may be some performance overhead of the RFI flush, add kernel command line options to disable it. We add a sensibly named 'no_rfi_flush' option, but we also hijack the x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we see 'nopti' we can guess that the user is trying to avoid any overhead of Meltdown mitigations, and it means we don't have to educate every one about a different command line option. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 96163f4c3673..491be4179ddd 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -805,8 +805,29 @@ early_initcall(disable_hardlockup_detector); #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; static void *l1d_flush_fallback_area; +static bool no_rfi_flush; bool rfi_flush; +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + static void do_nothing(void *unused) { /* @@ -877,6 +898,7 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - rfi_flush_enable(enable); + if (!no_rfi_flush) + rfi_flush_enable(enable); } #endif /* CONFIG_PPC_BOOK3S_64 */ -- cgit v1.2.3 From 8989d56878a7735dfdb234707a2fee6faf631085 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/pseries: Query hypervisor for RFI flush settings A new hypervisor call is available which tells the guest settings related to the RFI flush. Use it to query the appropriate flush instruction(s), and whether the flush is required. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/setup.c | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index a8531e012658..ae4f596273b5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void) of_pci_check_probe_only(); } +static void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + types |= L1D_FLUSH_MTTRIG; + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + enable = false; + } else { + /* Default to fallback if case hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + pseries_setup_rfi_flush(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); -- cgit v1.2.3 From 6e032b350cd1fdb830f18f8320ef0e13b4e24094 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/powernv: Check device-tree for RFI flush settings New device-tree properties are available which tell the hypervisor settings related to the RFI flush. Use them to determine the appropriate flush instruction to use, and whether the flush is required. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 49 ++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1edfbc1e40f4..4fb21e17504a 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -37,13 +37,62 @@ #include #include #include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); -- cgit v1.2.3 From 0a5191efe06b5103909206e4fbcff81d30283f8e Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 29 Oct 2017 16:27:21 +0100 Subject: MIPS: AR7: ensure the port type's FCR value is used Since commit aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers"), the port's default FCR value isn't used in serial8250_do_set_termios anymore, but copied over once in serial8250_config_port and then modified as needed. Unfortunately, serial8250_config_port will never be called if the port is shared between kernel and userspace, and the port's flag doesn't have UPF_BOOT_AUTOCONF, which would trigger a serial8250_config_port as well. This causes garbled output from userspace: [ 5.220000] random: procd urandom read with 49 bits of entropy available ers [kee Fix this by forcing it to be configured on boot, resulting in the expected output: [ 5.250000] random: procd urandom read with 50 bits of entropy available Press the [f] key and hit [enter] to enter failsafe mode Press the [1], [2], [3] or [4] key and hit [enter] to select the debug level Fixes: aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers") Signed-off-by: Jonas Gorski Cc: Greg Kroah-Hartman Cc: Yoshihiro YUNOMAE Cc: Florian Fainelli Cc: Nicolas Schichan Cc: linux-mips@linux-mips.org Cc: linux-serial@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17544/ Signed-off-by: Ralf Baechle --- arch/mips/ar7/platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c index 4674f1efbe7a..e1675c25d5d4 100644 --- a/arch/mips/ar7/platform.c +++ b/arch/mips/ar7/platform.c @@ -575,7 +575,7 @@ static int __init ar7_register_uarts(void) uart_port.type = PORT_AR7; uart_port.uartclk = clk_get_rate(bus_clk) / 2; uart_port.iotype = UPIO_MEM32; - uart_port.flags = UPF_FIXED_TYPE; + uart_port.flags = UPF_FIXED_TYPE | UPF_BOOT_AUTOCONF; uart_port.regshift = 2; uart_port.line = 0; -- cgit v1.2.3 From 274920a3ecd5f43af0cc380bc0a9ee73a52b9f8a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 23:49:12 +1100 Subject: powerpc/xmon: Add RFI flush related fields to paca dump Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index cab24f549e7c..b3bb5beec54a 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2375,6 +2375,10 @@ static void dump_one_paca(int cpu) DUMP(p, slb_cache_ptr, "x"); for (i = 0; i < SLB_CACHE_ENTRIES; i++) printf(" slb_cache[%d]: = 0x%016lx\n", i, p->slb_cache[i]); + + DUMP(p, rfi_flush_fallback_area, "px"); + DUMP(p, l1d_flush_congruence, "llx"); + DUMP(p, l1d_flush_sets, "llx"); #endif DUMP(p, dscr_default, "llx"); #ifdef CONFIG_PPC_BOOK3E -- cgit v1.2.3 From 2248fade965a5f1ba2a8e6e63f84df696b2d2780 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 11 Jan 2018 01:17:24 +1100 Subject: powerpc/xmon: Don't print hashed pointers in paca dump Remember when the biggest problem we had to worry about was hashed pointers, those were the days. These were missed in my earlier patch because they don't match "%p", but the macro is hiding a "%p", so these all end up being hashed, which is not what we want in xmon. Convert them to "%px". Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index b3bb5beec54a..0ddc7ac6c5f1 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2344,10 +2344,10 @@ static void dump_one_paca(int cpu) DUMP(p, kernel_toc, "lx"); DUMP(p, kernelbase, "lx"); DUMP(p, kernel_msr, "lx"); - DUMP(p, emergency_sp, "p"); + DUMP(p, emergency_sp, "px"); #ifdef CONFIG_PPC_BOOK3S_64 - DUMP(p, nmi_emergency_sp, "p"); - DUMP(p, mc_emergency_sp, "p"); + DUMP(p, nmi_emergency_sp, "px"); + DUMP(p, mc_emergency_sp, "px"); DUMP(p, in_nmi, "x"); DUMP(p, in_mce, "x"); DUMP(p, hmi_event_available, "x"); @@ -2382,14 +2382,14 @@ static void dump_one_paca(int cpu) #endif DUMP(p, dscr_default, "llx"); #ifdef CONFIG_PPC_BOOK3E - DUMP(p, pgd, "p"); - DUMP(p, kernel_pgd, "p"); - DUMP(p, tcd_ptr, "p"); - DUMP(p, mc_kstack, "p"); - DUMP(p, crit_kstack, "p"); - DUMP(p, dbg_kstack, "p"); + DUMP(p, pgd, "px"); + DUMP(p, kernel_pgd, "px"); + DUMP(p, tcd_ptr, "px"); + DUMP(p, mc_kstack, "px"); + DUMP(p, crit_kstack, "px"); + DUMP(p, dbg_kstack, "px"); #endif - DUMP(p, __current, "p"); + DUMP(p, __current, "px"); DUMP(p, kstack, "lx"); printf(" kstack_base = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1)); DUMP(p, stab_rr, "lx"); @@ -2407,7 +2407,7 @@ static void dump_one_paca(int cpu) #endif #ifdef CONFIG_PPC_POWERNV - DUMP(p, core_idle_state_ptr, "p"); + DUMP(p, core_idle_state_ptr, "px"); DUMP(p, thread_idle_state, "x"); DUMP(p, thread_mask, "x"); DUMP(p, subcore_sibling_mask, "x"); -- cgit v1.2.3 From 349524bc0da698ec77f2057cf4a4948eb6349265 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 10 Jan 2018 17:10:12 +1100 Subject: powerpc: Don't preempt_disable() in show_cpuinfo() This causes warnings from cpufreq mutex code. This is also rather unnecessary and ineffective. If we really want to prevent concurrent unplug, we could take the unplug read lock but I don't see this being critical. Fixes: cd77b5ce208c ("powerpc/powernv/cpufreq: Fix the frequency read by /proc/cpuinfo") Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 9d213542a48b..8fd3a70047f1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -242,14 +242,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) unsigned short maj; unsigned short min; - /* We only show online cpus: disable preempt (overzealous, I - * knew) to prevent cpu going down. */ - preempt_disable(); - if (!cpu_online(cpu_id)) { - preempt_enable(); - return 0; - } - #ifdef CONFIG_SMP pvr = per_cpu(cpu_pvr, cpu_id); #else @@ -358,9 +350,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP seq_printf(m, "\n"); #endif - - preempt_enable(); - /* If this is the last cpu, print the summary */ if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids) show_cpuinfo_summary(m); -- cgit v1.2.3 From 9dd79fed1bed6089f9729027e2d8cfa1049266e5 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 14 Nov 2017 13:29:17 +0000 Subject: MIPS: ath25: Avoid undefined early_serial_setup() without SERIAL_8250_CONSOLE Currently MIPS allnoconfig with CONFIG_ATH25=y fails to link due to missing support for early_serial_setup(): LD vmlinux arch/mips/ath25/devices.o: In function ath25_serial_setup': devices.c:(.init.text+0x68): undefined reference to 'early_serial_setup' Rather than adding dependencies to the platform to force inclusion of SERIAL_8250_CONSOLE together with it's dependencies like TTY, HAS_IOMEM, etc, just make ath25_serial_setup() a no-op when the dependency is not selected in the kernel config. Signed-off-by: Matt Redfearn Cc: James Hogan Cc: Thomas Gleixner Cc: Philippe Ombredanne Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17700/ Signed-off-by: Ralf Baechle --- arch/mips/ath25/devices.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/ath25/devices.c b/arch/mips/ath25/devices.c index e1156347da53..301a9028273c 100644 --- a/arch/mips/ath25/devices.c +++ b/arch/mips/ath25/devices.c @@ -73,6 +73,7 @@ const char *get_system_type(void) void __init ath25_serial_setup(u32 mapbase, int irq, unsigned int uartclk) { +#ifdef CONFIG_SERIAL_8250_CONSOLE struct uart_port s; memset(&s, 0, sizeof(s)); @@ -85,6 +86,7 @@ void __init ath25_serial_setup(u32 mapbase, int irq, unsigned int uartclk) s.uartclk = uartclk; early_serial_setup(&s); +#endif /* CONFIG_SERIAL_8250_CONSOLE */ } int __init ath25_add_wmac(int nr, u32 base, int irq) -- cgit v1.2.3 From ff9bed94d0f3b82d0fff3599cf0eb2cadf0fc770 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 14 Nov 2017 15:44:22 +0000 Subject: MIPS: RB532: Avoid undefined early_serial_setup() without SERIAL_8250_CONSOLE Currently MIPS allnoconfig with CONFIG_MIKROTIK_RB532=y fails to link due to missing support for early_serial_setup(): LD vmlinux arch/mips/rb532/serial.o: In function `setup_serial_port': serial.c:(.init.text+0x14): undefined reference to `early_serial_setup' Rather than adding dependencies to the platform to force inclusion of SERIAL_8250_CONSOLE together with it's dependencies like TTY, HAS_IOMEM, etc, just exclude arch/mips/rb532/serial.c from the build when it's dependency is not selected in the kernel config. Reported-by: Ralf Baechle Signed-off-by: Matt Redfearn Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17701/ Signed-off-by: Ralf Baechle --- arch/mips/rb532/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/rb532/Makefile b/arch/mips/rb532/Makefile index efdecdb6e3ea..8186afca2234 100644 --- a/arch/mips/rb532/Makefile +++ b/arch/mips/rb532/Makefile @@ -2,4 +2,6 @@ # Makefile for the RB532 board specific parts of the kernel # -obj-y += irq.o time.o setup.o serial.o prom.o gpio.o devices.o +obj-$(CONFIG_SERIAL_8250_CONSOLE) += serial.o + +obj-y += irq.o time.o setup.o prom.o gpio.o devices.o -- cgit v1.2.3 From ecff167cc80dd3c6afff55bdc66b4981d587ba3e Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 14 Nov 2017 15:44:23 +0000 Subject: MIPS: RB532: Avoid undefined mac_pton without GENERIC_NET_UTILS Currently MIPS allnoconfig with CONFIG_MIKROTIK_RB532=y fails to link due to missing support for mac_pton(): LD vmlinux arch/mips/rb532/devices.o: In function `setup_kmac': devices.c:(.init.text+0xc): undefined reference to `mac_pton' Rather than adding dependencies to the platform to force inclusion of GENERIC_NET_UTILS which is selected by CONFIG_NET, just exclude the setup of the MAC address if CONFIG_NET is not selected in the kernel config. Signed-off-by: Matt Redfearn Cc: James Hogan Cc: Boris Brezillon Cc: Neil Armstrong Cc: Krzysztof Kozlowski Cc: Tony Lindgren Cc: Vladimir Zapolskiy Cc: Shawn Guo Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17702/ Signed-off-by: Ralf Baechle --- arch/mips/rb532/devices.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/mips/rb532/devices.c b/arch/mips/rb532/devices.c index 32ea3e6731d6..354d258396ff 100644 --- a/arch/mips/rb532/devices.c +++ b/arch/mips/rb532/devices.c @@ -310,6 +310,8 @@ static int __init plat_setup_devices(void) return platform_add_devices(rb532_devs, ARRAY_SIZE(rb532_devs)); } +#ifdef CONFIG_NET + static int __init setup_kmac(char *s) { printk(KERN_INFO "korina mac = %s\n", s); @@ -322,4 +324,6 @@ static int __init setup_kmac(char *s) __setup("kmac=", setup_kmac); +#endif /* CONFIG_NET */ + arch_initcall(plat_setup_devices); -- cgit v1.2.3 From dfe004951b1387f8e91b83c95a5dc964a8545d49 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 14 Nov 2017 17:16:27 +0000 Subject: MIPS: BCM47XX Avoid compile error with MIPS allnoconfig Currently MIPS allnoconfig with CONFIG_BCM47XX=y fails to compile due to neither BCM47XX_BCMA nor BCM47XX_SSB being selected. This leads the enumeration in arch/mips/include/asm/mach-bcm47xx/bcm47xx.h to be empty, and compilation fails: In file included from arch/mips/bcm47xx/irq.c:32:0: ./arch/mips/include/asm/mach-bcm47xx/bcm47xx.h:34:1: error: expected identifier before '}' token }; ^ make[2]: *** [scripts/Makefile.build:314: arch/mips/bcm47xx/irq.o] Error 1 Fix this by ensuring that BCM47XX_SSB is selected if BCM47XX_BCMA is not. This allows us to select either system or both, but not neither. Signed-off-by: Matt Redfearn Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17703/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 350a990fc719..659e0079487f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -259,6 +259,7 @@ config BCM47XX select LEDS_GPIO_REGISTER select BCM47XX_NVRAM select BCM47XX_SPROM + select BCM47XX_SSB if !BCM47XX_BCMA help Support for BCM47XX based boards -- cgit v1.2.3 From 7e5e371ee5390a8bb7e111c794a334d9bf25ca3d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 21 Nov 2017 00:02:40 +0000 Subject: MIPS: Fix CPS SMP NS16550 UART defaults The MIPS_CPS_NS16550_BASE and MIPS_CPS_NS16550_SHIFT options have no defaults for non-Malta platforms which select SYS_SUPPORTS_MIPS_CPS (i.e. the pistachio and generic platforms). This is problematic for automated allyesconfig and allmodconfig builds based on these platforms, since make silentoldconfig tries to ask the user for values, and especially since v4.15 where the default platform was switched to generic. Default these options to 0 and arrange for MIPS_CPS_NS16550 to be no when using that default base address, so that the option only has an effect when the default is provided (i.e. Malta) or when a value is provided by the user. Fixes: 609cf6f2291a ("MIPS: CPS: Early debug using an ns16550-compatible UART") Signed-off-by: James Hogan Reviewed-by: Paul Burton Tested-by: Guenter Roeck Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17749/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig.debug | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/mips/Kconfig.debug b/arch/mips/Kconfig.debug index 464af5e025d6..0749c3724543 100644 --- a/arch/mips/Kconfig.debug +++ b/arch/mips/Kconfig.debug @@ -124,30 +124,36 @@ config SCACHE_DEBUGFS If unsure, say N. -menuconfig MIPS_CPS_NS16550 +menuconfig MIPS_CPS_NS16550_BOOL bool "CPS SMP NS16550 UART output" depends on MIPS_CPS help Output debug information via an ns16550 compatible UART if exceptions occur early in the boot process of a secondary core. -if MIPS_CPS_NS16550 +if MIPS_CPS_NS16550_BOOL + +config MIPS_CPS_NS16550 + def_bool MIPS_CPS_NS16550_BASE != 0 config MIPS_CPS_NS16550_BASE hex "UART Base Address" default 0x1b0003f8 if MIPS_MALTA + default 0 help The base address of the ns16550 compatible UART on which to output debug information from the early stages of core startup. + This is only used if non-zero. + config MIPS_CPS_NS16550_SHIFT int "UART Register Shift" - default 0 if MIPS_MALTA + default 0 help The number of bits to shift ns16550 register indices by in order to form their addresses. That is, log base 2 of the span between adjacent ns16550 registers in the system. -endif # MIPS_CPS_NS16550 +endif # MIPS_CPS_NS16550_BOOL endmenu -- cgit v1.2.3 From b6ab1a138b758c4bdf30d5517e546e1c8aff9e3b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 24 Nov 2017 07:38:20 +0530 Subject: MIPS: ralink: Fix platform_get_irq's error checking The platform_get_irq() function returns negative if an error occurs. zero or positive number on success. platform_get_irq() error checking for zero is not correct. Signed-off-by: Arvind Yadav Cc: john@phrozen.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17783/ Signed-off-by: Ralf Baechle --- arch/mips/ralink/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/ralink/timer.c b/arch/mips/ralink/timer.c index d4469b20d176..4f46a4509f79 100644 --- a/arch/mips/ralink/timer.c +++ b/arch/mips/ralink/timer.c @@ -109,9 +109,9 @@ static int rt_timer_probe(struct platform_device *pdev) } rt->irq = platform_get_irq(pdev, 0); - if (!rt->irq) { + if (rt->irq < 0) { dev_err(&pdev->dev, "failed to load irq\n"); - return -ENOENT; + return rt->irq; } rt->membase = devm_ioremap_resource(&pdev->dev, res); -- cgit v1.2.3 From c04de7b1ad645b61c141df8ca903ba0cc03a57f7 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 5 Dec 2017 22:28:22 +0000 Subject: MIPS: CM: Drop WARN_ON(vp != 0) Since commit 68923cdc2eb3 ("MIPS: CM: Add cluster & block args to mips_cm_lock_other()"), mips_smp_send_ipi_mask() has used mips_cm_lock_other_cpu() with each CPU number, rather than mips_cm_lock_other() with the first VPE in each core. Prior to r6, multicore multithreaded systems such as dual-core dual-thread interAptivs with CPU Idle enabled (e.g. MIPS Creator Ci40) results in mips_cm_lock_other() repeatedly hitting WARN_ON(vp != 0). There doesn't appear to be anything fundamentally wrong about passing a non-zero VP/VPE number, even if it is a core's region that is locked into the other region before r6, so remove that particular WARN_ON(). Fixes: 68923cdc2eb3 ("MIPS: CM: Add cluster & block args to mips_cm_lock_other()") Signed-off-by: James Hogan Reviewed-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org # 4.14+ Patchwork: https://patchwork.linux-mips.org/patch/17883/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/mips-cm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c index dd5567b1e305..8f5bd04f320a 100644 --- a/arch/mips/kernel/mips-cm.c +++ b/arch/mips/kernel/mips-cm.c @@ -292,7 +292,6 @@ void mips_cm_lock_other(unsigned int cluster, unsigned int core, *this_cpu_ptr(&cm_core_lock_flags)); } else { WARN_ON(cluster != 0); - WARN_ON(vp != 0); WARN_ON(block != CM_GCR_Cx_OTHER_BLOCK_LOCAL); /* -- cgit v1.2.3 From ccf85c744275de0ba40beff0bf9206a094f12e62 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 7 Dec 2017 07:14:17 +0000 Subject: MIPS: mm: Fix duplicate "const" on insn_table_MM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following gcc 7.x build error on microMIPS builds: arch/mips/mm/uasm-micromips.c:43:26: error: duplicate ‘const’ declaration specifier [-Werror=duplicate-decl-specifier] static const struct insn const insn_table_MM[insn_invalid] = { ^~~~~ The same issue has already been fixed in uasm-mips by commit 00e06297b351 ("MIPS: mm: remove duplicate "const" qualifier on insn_table"). Signed-off-by: James Hogan Fixes: ce807d5f67ed ("MIPS: Optimize uasm insn lookup.") Cc: David Daney Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17889/ Signed-off-by: Ralf Baechle --- arch/mips/mm/uasm-micromips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index cdb5a191b9d5..9bb6baa45da3 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -40,7 +40,7 @@ #include "uasm.c" -static const struct insn const insn_table_MM[insn_invalid] = { +static const struct insn insn_table_MM[insn_invalid] = { [insn_addu] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_addu32_op), RT | RS | RD}, [insn_addiu] = {M(mm_addiu32_op, 0, 0, 0, 0, 0), RT | RS | SIMM}, [insn_and] = {M(mm_pool32a_op, 0, 0, 0, 0, mm_and_op), RT | RS | RD}, -- cgit v1.2.3 From 612e8e9350fd19cae6900cf36ea0c6892d1a0dca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Jan 2018 12:28:16 +0100 Subject: x86/alternatives: Fix optimize_nops() checking The alternatives code checks only the first byte whether it is a NOP, but with NOPs in front of the payload and having actual instructions after it breaks the "optimized' test. Make sure to scan all bytes before deciding to optimize the NOPs in there. Reported-by: David Woodhouse Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Dave Hansen Cc: Andi Kleen Cc: Andrew Lutomirski Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic --- arch/x86/kernel/alternative.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..e0b97e4d1db5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -344,9 +344,12 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); -- cgit v1.2.3 From 2a266f23550be997d783f27e704b9b40c4010292 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 10 Jan 2018 21:44:42 +0800 Subject: KVM MMU: check pending exception before injecting APF For example, when two APF's for page ready happen after one exit and the first one becomes pending, the second one will result in #DF. Instead, just handle the second page fault synchronously. Reported-by: Ross Zwisler Message-ID: Reported-by: Alec Blayne Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c4deb1f34faa..e577bacd4bd0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) return false; if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) -- cgit v1.2.3 From ebabcf17bcd7ce968b1631ebe08236275698f39b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 7 Dec 2017 07:20:46 +0000 Subject: MIPS: Implement __multi3 for GCC7 MIPS64r6 builds GCC7 is a bit too eager to generate suboptimal __multi3 calls (128bit multiply with 128bit result) for MIPS64r6 builds, even in code which doesn't explicitly use 128bit types, such as the following: unsigned long func(unsigned long a, unsigned long b) { return a > (~0UL) / b; } Which GCC rearanges to: return (unsigned __int128)a * (unsigned __int128)b > 0xffffffffffffffff; Therefore implement __multi3, but only for MIPS64r6 with GCC7 as under normal circumstances we wouldn't expect any calls to __multi3 to be generated from kernel code. Reported-by: Thomas Petazzoni Signed-off-by: James Hogan Tested-by: Waldemar Brodkorb Cc: Ralf Baechle Cc: Maciej W. Rozycki Cc: Matthew Fortune Cc: Florian Fainelli Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17890/ --- arch/mips/lib/Makefile | 3 ++- arch/mips/lib/libgcc.h | 17 ++++++++++++++++ arch/mips/lib/multi3.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 arch/mips/lib/multi3.c (limited to 'arch') diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index 78c2affeabf8..e84e12655fa8 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile @@ -16,4 +16,5 @@ obj-$(CONFIG_CPU_R3000) += r3k_dump_tlb.o obj-$(CONFIG_CPU_TX39XX) += r3k_dump_tlb.o # libgcc-style stuff needed in the kernel -obj-y += ashldi3.o ashrdi3.o bswapsi.o bswapdi.o cmpdi2.o lshrdi3.o ucmpdi2.o +obj-y += ashldi3.o ashrdi3.o bswapsi.o bswapdi.o cmpdi2.o lshrdi3.o multi3.o \ + ucmpdi2.o diff --git a/arch/mips/lib/libgcc.h b/arch/mips/lib/libgcc.h index 28002ed90c2c..199a7f96282f 100644 --- a/arch/mips/lib/libgcc.h +++ b/arch/mips/lib/libgcc.h @@ -10,10 +10,18 @@ typedef int word_type __attribute__ ((mode (__word__))); struct DWstruct { int high, low; }; + +struct TWstruct { + long long high, low; +}; #elif defined(__LITTLE_ENDIAN) struct DWstruct { int low, high; }; + +struct TWstruct { + long long low, high; +}; #else #error I feel sick. #endif @@ -23,4 +31,13 @@ typedef union { long long ll; } DWunion; +#if defined(CONFIG_64BIT) && defined(CONFIG_CPU_MIPSR6) +typedef int ti_type __attribute__((mode(TI))); + +typedef union { + struct TWstruct s; + ti_type ti; +} TWunion; +#endif + #endif /* __ASM_LIBGCC_H */ diff --git a/arch/mips/lib/multi3.c b/arch/mips/lib/multi3.c new file mode 100644 index 000000000000..111ad475aa0c --- /dev/null +++ b/arch/mips/lib/multi3.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "libgcc.h" + +/* + * GCC 7 suboptimally generates __multi3 calls for mips64r6, so for that + * specific case only we'll implement it here. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82981 + */ +#if defined(CONFIG_64BIT) && defined(CONFIG_CPU_MIPSR6) && (__GNUC__ == 7) + +/* multiply 64-bit values, low 64-bits returned */ +static inline long long notrace dmulu(long long a, long long b) +{ + long long res; + + asm ("dmulu %0,%1,%2" : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* multiply 64-bit unsigned values, high 64-bits of 128-bit result returned */ +static inline long long notrace dmuhu(long long a, long long b) +{ + long long res; + + asm ("dmuhu %0,%1,%2" : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* multiply 128-bit values, low 128-bits returned */ +ti_type notrace __multi3(ti_type a, ti_type b) +{ + TWunion res, aa, bb; + + aa.ti = a; + bb.ti = b; + + /* + * a * b = (a.lo * b.lo) + * + 2^64 * (a.hi * b.lo + a.lo * b.hi) + * [+ 2^128 * (a.hi * b.hi)] + */ + res.s.low = dmulu(aa.s.low, bb.s.low); + res.s.high = dmuhu(aa.s.low, bb.s.low); + res.s.high += dmulu(aa.s.high, bb.s.low); + res.s.high += dmulu(aa.s.low, bb.s.high); + + return res.ti; +} +EXPORT_SYMBOL(__multi3); + +#endif /* 64BIT && CPU_MIPSR6 && GCC7 */ -- cgit v1.2.3 From 2a3e83c6f96c513f43ce5a8c9034608ea584a255 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Sat, 6 Jan 2018 02:00:13 +0100 Subject: x86/gart: Exclude GART aperture from vmcore On machines where the GART aperture is mapped over physical RAM /proc/vmcore contains the remapped range and reading it may cause hangs or reboots. In the past, the GART region was added into the resource map, implemented by commit 56dd669a138c ("[PATCH] Insert GART region into resource map") However, inserting the iomem_resource from the early GART code caused resource conflicts with some AGP drivers (bko#72201), which got avoided by reverting the patch in commit 707d4eefbdb3 ("Revert [PATCH] Insert GART region into resource map"). This revert introduced the /proc/vmcore bug. The vmcore ELF header is either prepared by the kernel (when using the kexec_file_load syscall) or by the kexec userspace (when using the kexec_load syscall). Since we no longer have the GART iomem resource, the userspace kexec has no way of knowing which region to exclude from the ELF header. Changes from v1 of this patch: Instead of excluding the aperture from the ELF header, this patch makes /proc/vmcore return zeroes in the second kernel when attempting to read the aperture region. This is done by reusing the gart_oldmem_pfn_is_ram infrastructure originally intended to exclude XEN balooned memory. This works for both, the kexec_file_load and kexec_load syscalls. [Note that the GART region is the same in the first and second kernels: regardless whether the first kernel fixed up the northbridge/bios setting and mapped the aperture over physical memory, the second kernel finds the northbridge properly configured by the first kernel and the aperture never overlaps with e820 memory because the second kernel has a fake e820 map created from the crashkernel memory regions. Thus, the second kernel keeps the aperture address/size as configured by the first kernel.] register_oldmem_pfn_is_ram can only register one callback and returns an error if the callback has been registered already. Since XEN used to be the only user of this function, it never checks the return value. Now that we have more than one user, I added a WARN_ON just in case agp, XEN, or any other future user of register_oldmem_pfn_is_ram were to step on each other's toes. Fixes: 707d4eefbdb3 ("Revert [PATCH] Insert GART region into resource map") Signed-off-by: Jiri Bohac Signed-off-by: Thomas Gleixner Cc: Baoquan He Cc: Toshi Kani Cc: David Airlie Cc: yinghai@kernel.org Cc: joro@8bytes.org Cc: kexec@lists.infradead.org Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Dave Young Cc: Vivek Goyal Link: https://lkml.kernel.org/r/20180106010013.73suskgxm7lox7g6@dwarf.suse.cz --- arch/x86/kernel/aperture_64.c | 46 ++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/mmu_hvm.c | 2 +- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f5d92bc3b884..2c4d5ece7456 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Using 512M as goal, in case kexec will load kernel_big @@ -56,6 +57,33 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; +#ifdef CONFIG_PROC_VMCORE +/* + * If the first kernel maps the aperture over e820 RAM, the kdump kernel will + * use the same range because it will remain configured in the northbridge. + * Trying to dump this area via /proc/vmcore may crash the machine, so exclude + * it from vmcore. + */ +static unsigned long aperture_pfn_start, aperture_page_count; + +static int gart_oldmem_pfn_is_ram(unsigned long pfn) +{ + return likely((pfn < aperture_pfn_start) || + (pfn >= aperture_pfn_start + aperture_page_count)); +} + +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ + aperture_pfn_start = aper_base >> PAGE_SHIFT; + aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; + WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +} +#else +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ +} +#endif + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -435,8 +463,16 @@ int __init gart_iommu_hole_init(void) out: if (!fix && !fallback_aper_force) { - if (last_aper_base) + if (last_aper_base) { + /* + * If this is the kdump kernel, the first kernel + * may have allocated the range over its e820 RAM + * and fixed up the northbridge + */ + exclude_from_vmcore(last_aper_base, last_aper_order); + return 1; + } return 0; } @@ -473,6 +509,14 @@ out: return 0; } + /* + * If this is the kdump kernel _and_ the first kernel did not + * configure the aperture in the northbridge, this range may + * overlap with the first kernel's memory. We can't access the + * range through vmcore even though it should be part of the dump. + */ + exclude_from_vmcore(aper_alloc, aper_order); + /* Fix up the north bridges */ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus, dev_base, dev_limit; diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 2cfcfe4f6b2a..dd2ad82eee80 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -75,6 +75,6 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; #ifdef CONFIG_PROC_VMCORE - register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); + WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); #endif } -- cgit v1.2.3 From ab271bd4dfd568060ffcf5a21b667c7c5df7ab99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 17:26:59 +0100 Subject: x86: kvm: propagate register_shrinker return code Patch "mm,vmscan: mark register_shrinker() as __must_check" is queued for 4.16 in linux-mm and adds a warning about the unchecked call to register_shrinker: arch/x86/kvm/mmu.c:5485:2: warning: ignoring return value of 'register_shrinker', declared with attribute warn_unused_result [-Wunused-result] This changes the kvm_mmu_module_init() function to fail itself when the call to register_shrinker fails. Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e577bacd4bd0..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5466,30 +5466,34 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + int ret = -ENOMEM; + kvm_mmu_clear_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) - goto nomem; + goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) - goto nomem; + goto out; if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; + goto out; - register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker); + if (ret) + goto out; return 0; -nomem: +out: mmu_destroy_caches(); - return -ENOMEM; + return ret; } /* -- cgit v1.2.3 From bd89525a823ce6edddcedbe9aed79faa1b9cf544 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 16:55:24 +0100 Subject: KVM: x86: emulate #UD while in guest mode This reverts commits ae1f57670703656cc9f293722c3b8b6782f8ab3f and ac9b305caa0df6f5b75d294e4b86c1027648991e. If the hardware doesn't support MOVBE, but L0 sets CPUID.01H:ECX.MOVBE in L1's emulated CPUID information, then L1 is likely to pass that CPUID bit through to L2. L2 will expect MOVBE to work, but if L1 doesn't intercept #UD, then any MOVBE instruction executed in L2 will raise #UD, and the exception will be delivered in L2. Commit ac9b305caa0df6f5b75d294e4b86c1027648991e is a better and more complete version of ae1f57670703 ("KVM: nVMX: Do not emulate #UD while in guest mode"); however, neither considers the above case. Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +-------- arch/x86/kvm/vmx.c | 5 +---- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bb31c801f1fc..3158dac87f82 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -361,7 +361,6 @@ static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; struct nested_state *g; - u32 h_intercept_exceptions; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -372,14 +371,9 @@ static void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested; - /* No need to intercept #UD if L1 doesn't intercept it */ - h_intercept_exceptions = - h->intercept_exceptions & ~(1U << UD_VECTOR); - c->intercept_cr = h->intercept_cr | g->intercept_cr; c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = - h_intercept_exceptions | g->intercept_exceptions; + c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; c->intercept = h->intercept | g->intercept; } @@ -2202,7 +2196,6 @@ static int ud_interception(struct vcpu_svm *svm) { int er; - WARN_ON_ONCE(is_guest_mode(&svm->vcpu)); er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c14d65f676a..427fd3200dd8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1887,7 +1887,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) | + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == @@ -1905,8 +1905,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; - else - eb |= 1u << UD_VECTOR; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -5917,7 +5915,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; /* already handled by vmx_vcpu_run() */ if (is_invalid_opcode(intr_info)) { - WARN_ON_ONCE(is_guest_mode(vcpu)); er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er == EMULATE_USER_EXIT) return 0; -- cgit v1.2.3 From f32ab7547161b9fa7ebfbc4f18ea1eb3fd49fe25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 11 Jan 2018 14:23:29 +0100 Subject: x86/PCI: Add "pci=big_root_window" option for AMD 64-bit windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only try to enable a 64-bit window on AMD CPUs when "pci=big_root_window" is specified. This taints the kernel because the new 64-bit window uses address space we don't know anything about, and it may contain unreported devices or memory that would conflict with the window. The pci_amd_enable_64bit_bar() quirk that enables the window is specific to AMD CPUs. The generic solution would be to have the firmware enable the window and describe it in the host bridge's _CRS method, or at least describe it in the _PRS method so the OS would have the option of enabling it. Signed-off-by: Christian König [bhelgaas: changelog, extend doc, mention taint in dmesg] Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 5 +++++ arch/x86/pci/fixup.c | 7 ++++++- 4 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..619638362416 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3094,6 +3094,12 @@ pcie_scan_all Scan all possible PCIe devices. Otherwise we only look for one device below a PCIe downstream port. + big_root_window Try to add a big 64bit memory window to the PCIe + root complex on AMD CPUs. Some GFX hardware + can resize a BAR to allow access to all VRAM. + Adding the window is slightly risky (it may + conflict with unreported devices), so this + taints the kernel. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7a5d6695abd3..eb66fa9cd0fc 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -38,6 +38,7 @@ do { \ #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 +#define PCI_BIG_ROOT_WINDOW 0x400000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7a5350d08cef..563049c483a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; +#ifdef CONFIG_PHYS_ADDR_T_64BIT + } else if (!strcmp(str, "big_root_window")) { + pci_probe |= PCI_BIG_ROOT_WINDOW; + return NULL; +#endif } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index e663d6bf1328..8bad19c7473d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -667,6 +667,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) struct resource *res, *conflict; struct pci_dev *other; + if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) + return; + /* Check that we are the only device of that type */ other = pci_get_device(dev->vendor, dev->device, NULL); if (other != dev || @@ -714,7 +717,9 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) res->start = conflict->end + 1; } - dev_info(&dev->dev, "adding root bus resource %pR\n", res); + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; -- cgit v1.2.3 From 03a551734cfc2b93f83950a595974e3c9cbd82fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 11 Jan 2018 14:23:30 +0100 Subject: x86/PCI: Move and shrink AMD 64-bit window to avoid conflict MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid problems with BIOS implementations which don't report all used resources to the OS by only allocating a 256GB window directly below the hardware limit (from the BKDG, sec 2.4.6). Fixes a silent reboot loop reported by Aaro Koskinen on an AMD-based MSI MS-7699/760GA-P43(FX) system. This was apparently caused by RAM or other unreported hardware that conflicted with the new window. Link: https://support.amd.com/TechDocs/49125_15h_Models_30h-3Fh_BKDG.pdf Link: https://lkml.kernel.org/r/20180105220412.fzpwqe4zljdawr36@darkstar.musicnaut.iki.fi Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") Reported-by: Aaro Koskinen Signed-off-by: Christian König [bhelgaas: changelog, comment, Fixes:] Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 8bad19c7473d..f6a26e3cb476 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,10 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); */ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) { - unsigned i; u32 base, limit, high; - struct resource *res, *conflict; struct pci_dev *other; + struct resource *res; + unsigned i; + int r; if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) return; @@ -702,19 +703,20 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) if (!res) return; + /* + * Allocate a 256GB window directly below the 0xfd00000000 hardware + * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). + */ res->name = "PCI Bus 0000:00"; res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_WINDOW; - res->start = 0x100000000ull; + res->start = 0xbd00000000ull; res->end = 0xfd00000000ull - 1; - /* Just grab the free area behind system memory for this */ - while ((conflict = request_resource_conflict(&iomem_resource, res))) { - if (conflict->end >= res->end) { - kfree(res); - return; - } - res->start = conflict->end + 1; + r = request_resource(&iomem_resource, res); + if (r) { + kfree(res); + return; } dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", -- cgit v1.2.3 From 445b69e3b75e42362a5bdc13c8b8f61599e2228a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Jan 2018 14:49:39 -0800 Subject: x86/pti: Make unpoison of pgd for trusted boot work for real The inital fix for trusted boot and PTI potentially misses the pgd clearing if pud_alloc() sets a PGD. It probably works in *practice* because for two adjacent calls to map_tboot_page() that share a PGD entry, the first will clear NX, *then* allocate and set the PGD (without NX clear). The second call will *not* allocate but will clear the NX bit. Defer the NX clearing to a point after it is known that all top-level allocations have occurred. Add a comment to clarify why. [ tglx: Massaged changelog ] Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Andrea Arcangeli Cc: Jon Masters Cc: "Tim Chen" Cc: gnomes@lxorguk.ukuu.org.uk Cc: peterz@infradead.org Cc: ning.sun@intel.com Cc: tboot-devel@lists.sourceforge.net Cc: andi@firstfloor.org Cc: luto@kernel.org Cc: law@redhat.com Cc: pbonzini@redhat.com Cc: torvalds@linux-foundation.org Cc: gregkh@linux-foundation.org Cc: dwmw@amazon.co.uk Cc: nickc@redhat.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com --- arch/x86/kernel/tboot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 75869a4b6c41..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -127,7 +127,6 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, p4d = p4d_alloc(&tboot_mm, pgd, vaddr); if (!p4d) return -1; - pgd->pgd &= ~_PAGE_NX; pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; @@ -139,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in p4d_alloc() _or_ + * pud_alloc() depending on 4/5-level paging. + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } -- cgit v1.2.3 From 76b043848fd22dbf7f8bf3a1452f8c70d557b860 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: x86/retpoline: Add initial retpoline support Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk --- arch/x86/Kconfig | 13 ++++ arch/x86/Makefile | 10 +++ arch/x86/include/asm/asm-prototypes.h | 25 +++++++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/nospec-branch.h | 128 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 4 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 +++++++++++++ 8 files changed, 231 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h create mode 100644 arch/x86/lib/retpoline.S (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e23d21ac745a..d1819161cc6c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -429,6 +429,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + config INTEL_RDT bool "Intel Resource Director Technology support" default n diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..974c61864978 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -235,6 +235,16 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + else + $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) + endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ff700d81e91e..0927cdc4f946 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -11,7 +11,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1641c2f96363..f275447862f4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -203,6 +203,8 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..e20e92ef2ca8 --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +.macro ANNOTATE_NOSPEC_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.nospec + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.nospec\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 372ba3fb400f..7a671d1ae3cb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +#ifdef CONFIG_RETPOLINE + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +#endif + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 457f681ef379..d435c89875c1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..cb45c6cb465f --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include + +.macro THUNK reg + .section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC + JMP_NOSPEC %\reg + CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... + */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif -- cgit v1.2.3 From da285121560e769cc31797bba6422eea71d473e0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: x86/spectre: Add boot time option to select Spectre v2 mitigation Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. [ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk --- Documentation/admin-guide/kernel-parameters.txt | 28 +++++ arch/x86/include/asm/nospec-branch.h | 10 ++ arch/x86/kernel/cpu/bugs.c | 158 +++++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 4 - 4 files changed, 195 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 905991745d26..8122b5f98ea1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2599,6 +2599,11 @@ nosmt [KNL,S390] Disable symmetric multithreading (SMT). Equivalent to smt=1. + nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 + (indirect branch prediction) vulnerability. System may + allow data leaks with this option, which is equivalent + to spectre_v2=off. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. @@ -3908,6 +3913,29 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt + spectre_v2= [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. + + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable + + Selecting 'on' will, and 'auto' may, choose a + mitigation method at run time according to the + CPU, the available microcode, the setting of the + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + + Specific mitigations can also be selected manually: + + retpoline - replace indirect branches + retpoline,generic - google's original retpoline + retpoline,amd - AMD-specific minimal thunk + + Not specifying this option is equivalent to + spectre_v2=auto. + spia_io_base= [HW,MTD] spia_fio_base= spia_pedr= diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e20e92ef2ca8..ea034fa6e261 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -124,5 +124,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 76ad6cb44b40..e4dc26185aa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,9 @@ #include #include #include + +#include +#include #include #include #include @@ -21,6 +24,8 @@ #include #include +static void __init spectre_v2_select_mitigation(void); + void __init check_bugs(void) { identify_boot_cpu(); @@ -30,6 +35,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -62,6 +70,153 @@ void __init check_bugs(void) #endif } +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, "retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + #ifdef CONFIG_SYSFS ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -86,6 +241,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + + return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7a671d1ae3cb..372ba3fb400f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -905,10 +905,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -#ifdef CONFIG_RETPOLINE - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); -#endif - fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 9697fa39efd3fc3692f2949d4045f393ec58450b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:27 +0000 Subject: x86/retpoline/crypto: Convert crypto assembler indirect jumps Convert all indirect jumps in crypto assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 16627fec80b2..3d09e3aca18d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@ #include #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index f7c495e2863c..a14af6eb09cb 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,6 +17,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way: vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index eee5b3982cfd..b66bbfa62f50 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -12,6 +12,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way: vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 7a7de27c6f41..d9b734d0c8cc 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block: movzxw (bufp, %rax, 2), len lea crc_array(%rip), bufp lea (bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: -- cgit v1.2.3 From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: x86/retpoline/entry: Convert entry assembler indirect jumps Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/entry/entry_64.S | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ace8f321a5a1..a1f28a54f23a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -290,7 +291,7 @@ ENTRY(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - call *%ebx + CALL_NOSPEC %ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -919,7 +920,7 @@ common_exception: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ed31d00dc5ee..59874bc1aed2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "calling.h" @@ -187,7 +188,7 @@ ENTRY(entry_SYSCALL_64_trampoline) */ pushq %rdi movq $entry_SYSCALL_64_stage2, %rdi - jmp *%rdi + JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) .popsection @@ -266,7 +267,12 @@ entry_SYSCALL_64_fastpath: * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -438,7 +444,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -517,7 +523,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() -- cgit v1.2.3 From 9351803bd803cdbeb9b5a7850b7b6f464806e3db Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:29 +0000 Subject: x86/retpoline/ftrace: Convert ftrace assembler indirect jumps Convert all indirect jumps in ftrace assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/ftrace_32.S | 6 ++++-- arch/x86/kernel/ftrace_64.S | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -197,7 +198,8 @@ ftrace_stub: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -241,5 +243,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..7cb8ba08beb9 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -286,8 +286,8 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. */ - call *ftrace_trace_function - + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -329,5 +329,5 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif -- cgit v1.2.3 From e70e5892b28c18f517f29ab6e83bd57705104b31 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:30 +0000 Subject: x86/retpoline/hyperv: Convert assembler indirect jumps Convert all indirect jumps in hyperv inline asm code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/mshyperv.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 581bb54dd464..5119e4b555cc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ #include #include #include +#include /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return U64_MAX; __asm__ __volatile__("mov %4, %%r8\n" - "call *%5" + CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), "m" (hv_hypercall_pg) + : "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("call *%7" + __asm__ __volatile__(CALL_NOSPEC : "=A" (hv_status), "+c" (input_address_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input_address_hi), "D"(output_address_hi), "S"(output_address_lo), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory"); #endif /* !x86_64 */ return hv_status; @@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) #ifdef CONFIG_X86_64 { - __asm__ __volatile__("call *%4" + __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "m" (hv_hypercall_pg) + : THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } #else @@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) u32 input1_hi = upper_32_bits(input1); u32 input1_lo = lower_32_bits(input1); - __asm__ __volatile__ ("call *%5" + __asm__ __volatile__ (CALL_NOSPEC : "=A"(hv_status), "+c"(input1_lo), ASM_CALL_CONSTRAINT : "A" (control), "b" (input1_hi), - "m" (hv_hypercall_pg) + THUNK_TARGET(hv_hypercall_pg) : "cc", "edi", "esi"); } #endif -- cgit v1.2.3 From ea08816d5b185ab3d09e95e393f265af54560350 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: x86/retpoline/xen: Convert Xen hypercall indirect jumps Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7cb282e9e587..bfd882617613 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -217,9 +218,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); -- cgit v1.2.3 From 5096732f6f695001fa2d6f1335a2680b37912c69 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:32 +0000 Subject: x86/retpoline/checksum32: Convert assembler indirect jumps Convert all indirect jumps in 32bit checksum assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk --- arch/x86/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4d34bb548b41..46e71a74e612 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -29,7 +29,8 @@ #include #include #include - +#include + /* * computes a partial checksum, e.g. for TCP/UDP fragments */ @@ -156,7 +157,7 @@ ENTRY(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) -- cgit v1.2.3 From 7614e913db1f40fff819b36216484dc3808995d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jan 2018 21:46:33 +0000 Subject: x86/retpoline/irq32: Convert assembler indirect jumps Convert all indirect jumps in 32bit irq inline asm code to use non speculative sequences. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/irq_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } -- cgit v1.2.3 From 69c4d8ed49568598f200b340b17e391c35be3d4b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 22:04:16 +0100 Subject: arm64: dts: socfpga: add missing interrupt-parent The PMU node has no working interrupt, as shown by this dtc warning: arch/arm64/boot/dts/altera/socfpga_stratix10_socdk.dtb: Warning (interrupts_property): Missing interrupt-parent for /pmu This adds an interrupt-parent property so we can correct parse that interrupt number. Signed-off-by: Arnd Bergmann Acked-by: Dinh Nguyen Signed-off-by: Olof Johansson --- arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index 7c9bdc7ab50b..9db19314c60c 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -66,6 +66,7 @@ <&cpu1>, <&cpu2>, <&cpu3>; + interrupt-parent = <&intc>; }; psci { -- cgit v1.2.3 From 117cc7a908c83697b0b737d15ae1eb5943afe35b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: x86/retpoline: Fill return stack buffer on vmexit In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. [ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/nospec-branch.h | 78 +++++++++++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 4 ++ arch/x86/kvm/vmx.c | 4 ++ 3 files changed, 85 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ea034fa6e261..402a11c803c3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -74,6 +116,20 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -119,7 +175,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -134,5 +190,25 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. + */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e68f0b3cbf7..2744b97345b8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "trace.h" @@ -4985,6 +4986,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62ee4362e1c1..d1e25dba3112 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -9403,6 +9404,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); -- cgit v1.2.3 From 1289e0e29857e606a70a0200bf7849ae38d3493a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 11 Jan 2018 11:15:43 -0800 Subject: perf/x86/rapl: Fix Haswell and Broadwell server RAPL event Perf-fuzzer triggers non-existent MSR access in RAPL driver on Haswell-EX. Haswell/Broadwell server and client have differnt RAPL events. Since 'commit 7f2236d0bf9a ("perf/x86/rapl: Use Intel family macros for RAPL")', it accidentally assign RAPL client events to server. Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Linux-kernel@vger.kernel.org Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 005908ee9333..a2efb490f743 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -755,14 +755,14 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), -- cgit v1.2.3 From dd533734395f0e14db12d82fc64a879c805743dd Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 9 Jan 2018 11:36:34 +0100 Subject: mtd: nand: use reworked NAND controller driver with Marvell EBU SoCs Choose to compile and embed marvell_nand.c as NAND controller driver instead of the legacy pxa3xx_nand.c for platforms with Marvell EBU SoCs. Signed-off-by: Miquel Raynal Acked-by: Gregory CLEMENT Acked-by: Arnd Bergmann Signed-off-by: Boris Brezillon --- arch/arm/configs/mvebu_v7_defconfig | 2 +- arch/arm64/configs/defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/mvebu_v7_defconfig b/arch/arm/configs/mvebu_v7_defconfig index 69553704f2dc..4b6e4fd47e5d 100644 --- a/arch/arm/configs/mvebu_v7_defconfig +++ b/arch/arm/configs/mvebu_v7_defconfig @@ -57,7 +57,7 @@ CONFIG_MTD_CFI_STAA=y CONFIG_MTD_PHYSMAP_OF=y CONFIG_MTD_M25P80=y CONFIG_MTD_NAND=y -CONFIG_MTD_NAND_PXA3xx=y +CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_SPI_NOR=y CONFIG_SRAM=y CONFIG_MTD_UBI=y diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 6356c6da34ea..b20fa9b31efe 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -161,7 +161,7 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_M25P80=y CONFIG_MTD_NAND=y CONFIG_MTD_NAND_DENALI_DT=y -CONFIG_MTD_NAND_PXA3xx=y +CONFIG_MTD_NAND_MARVELL=y CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_NBD=m -- cgit v1.2.3 From 396744b76a89b1cd7681c6b8a7716b545f6cf986 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Fri, 12 Jan 2018 14:12:56 +0100 Subject: ARM: dts: OMAP2+: Add compatible property to onenand node OMAP onenand nodes are missing compatible property, add it. Signed-off-by: Ladislav Michl Reviewed-by: Peter Ujfalusi Acked-by: Roger Quadros Acked-by: Tony Lindgren Tested-by: Tony Lindgren Tested-by: Aaro Koskinen Signed-off-by: Boris Brezillon --- arch/arm/boot/dts/omap2420-n8x0-common.dtsi | 1 + arch/arm/boot/dts/omap3-n900.dts | 1 + arch/arm/boot/dts/omap3-n950-n9.dtsi | 1 + arch/arm/boot/dts/omap3430-sdp.dts | 1 + 4 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/omap2420-n8x0-common.dtsi b/arch/arm/boot/dts/omap2420-n8x0-common.dtsi index 1df3ace3af92..63b0b4921e4e 100644 --- a/arch/arm/boot/dts/omap2420-n8x0-common.dtsi +++ b/arch/arm/boot/dts/omap2420-n8x0-common.dtsi @@ -52,6 +52,7 @@ onenand@0,0 { #address-cells = <1>; #size-cells = <1>; + compatible = "ti,omap2-onenand"; reg = <0 0 0x20000>; /* CS0, offset 0, IO size 128K */ gpmc,sync-read; diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index 669c51c00c00..e7c7b8e50703 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -838,6 +838,7 @@ onenand@0,0 { #address-cells = <1>; #size-cells = <1>; + compatible = "ti,omap2-onenand"; reg = <0 0 0x20000>; /* CS0, offset 0, IO size 128K */ gpmc,sync-read; diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index 12fbb3da5fce..0d9b85317529 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -367,6 +367,7 @@ onenand@0,0 { #address-cells = <1>; #size-cells = <1>; + compatible = "ti,omap2-onenand"; reg = <0 0 0x20000>; /* CS0, offset 0, IO size 128K */ gpmc,sync-read; diff --git a/arch/arm/boot/dts/omap3430-sdp.dts b/arch/arm/boot/dts/omap3430-sdp.dts index 908951eb5943..d652708f6bef 100644 --- a/arch/arm/boot/dts/omap3430-sdp.dts +++ b/arch/arm/boot/dts/omap3430-sdp.dts @@ -154,6 +154,7 @@ linux,mtd-name= "samsung,kfm2g16q2m-deb8"; #address-cells = <1>; #size-cells = <1>; + compatible = "ti,omap2-onenand"; reg = <2 0 0x20000>; /* CS2, offset 0, IO size 4 */ gpmc,device-width = <2>; -- cgit v1.2.3 From d36005d4a289d31e23be387995df32b8f0554cbc Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Fri, 12 Jan 2018 14:13:19 +0100 Subject: ARM: dts: omap3-igep: Update onenand node timings Update node timings to be compatible with actual chip used - gpmc_cs_show_timings utilized to dump values. Signed-off-by: Ladislav Michl Reviewed-by: Peter Ujfalusi Acked-by: Roger Quadros Acked-by: Tony Lindgren Signed-off-by: Boris Brezillon --- arch/arm/boot/dts/omap3-igep.dtsi | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/omap3-igep.dtsi b/arch/arm/boot/dts/omap3-igep.dtsi index 4ad7d5565906..f33cc80c9dbc 100644 --- a/arch/arm/boot/dts/omap3-igep.dtsi +++ b/arch/arm/boot/dts/omap3-igep.dtsi @@ -147,32 +147,32 @@ gpmc,sync-read; gpmc,sync-write; gpmc,burst-length = <16>; - gpmc,burst-read; gpmc,burst-wrap; + gpmc,burst-read; gpmc,burst-write; gpmc,device-width = <2>; /* GPMC_DEVWIDTH_16BIT */ gpmc,mux-add-data = <2>; /* GPMC_MUX_AD */ gpmc,cs-on-ns = <0>; - gpmc,cs-rd-off-ns = <87>; - gpmc,cs-wr-off-ns = <87>; + gpmc,cs-rd-off-ns = <96>; + gpmc,cs-wr-off-ns = <96>; gpmc,adv-on-ns = <0>; - gpmc,adv-rd-off-ns = <10>; - gpmc,adv-wr-off-ns = <10>; - gpmc,oe-on-ns = <15>; - gpmc,oe-off-ns = <87>; + gpmc,adv-rd-off-ns = <12>; + gpmc,adv-wr-off-ns = <12>; + gpmc,oe-on-ns = <18>; + gpmc,oe-off-ns = <96>; gpmc,we-on-ns = <0>; - gpmc,we-off-ns = <87>; - gpmc,rd-cycle-ns = <112>; - gpmc,wr-cycle-ns = <112>; - gpmc,access-ns = <81>; - gpmc,page-burst-access-ns = <15>; + gpmc,we-off-ns = <96>; + gpmc,rd-cycle-ns = <114>; + gpmc,wr-cycle-ns = <114>; + gpmc,access-ns = <90>; + gpmc,page-burst-access-ns = <12>; gpmc,bus-turnaround-ns = <0>; gpmc,cycle2cycle-delay-ns = <0>; gpmc,wait-monitoring-ns = <0>; - gpmc,clk-activation-ns = <5>; + gpmc,clk-activation-ns = <6>; gpmc,wr-data-mux-bus-ns = <30>; - gpmc,wr-access-ns = <81>; - gpmc,sync-clk-ps = <15000>; + gpmc,wr-access-ns = <90>; + gpmc,sync-clk-ps = <12000>; #address-cells = <1>; #size-cells = <1>; -- cgit v1.2.3 From 2514830b8b8ca966fae35103070984c2e847b2b9 Mon Sep 17 00:00:00 2001 From: Ladislav Michl Date: Fri, 12 Jan 2018 14:18:00 +0100 Subject: ARM: OMAP2+: Remove gpmc-onenand As OneNAND driver is now using devicetree gpmc-onenand and its platform data is unused and can be removed. Signed-off-by: Ladislav Michl Reviewed-by: Peter Ujfalusi Acked-by: Roger Quadros Acked-by: Tony Lindgren Tested-by: Tony Lindgren Tested-by: Aaro Koskinen Signed-off-by: Boris Brezillon --- arch/arm/mach-omap2/Makefile | 3 - arch/arm/mach-omap2/gpmc-onenand.c | 409 ------------------------ include/linux/platform_data/mtd-onenand-omap2.h | 34 -- 3 files changed, 446 deletions(-) delete mode 100644 arch/arm/mach-omap2/gpmc-onenand.c delete mode 100644 include/linux/platform_data/mtd-onenand-omap2.h (limited to 'arch') diff --git a/arch/arm/mach-omap2/Makefile b/arch/arm/mach-omap2/Makefile index 2f722a805948..c15bbcad5f67 100644 --- a/arch/arm/mach-omap2/Makefile +++ b/arch/arm/mach-omap2/Makefile @@ -232,6 +232,3 @@ obj-y += $(omap-hsmmc-m) $(omap-hsmmc-y) obj-y += omap_phy_internal.o obj-$(CONFIG_MACH_OMAP2_TUSB6010) += usb-tusb6010.o - -onenand-$(CONFIG_MTD_ONENAND_OMAP2) := gpmc-onenand.o -obj-y += $(onenand-m) $(onenand-y) diff --git a/arch/arm/mach-omap2/gpmc-onenand.c b/arch/arm/mach-omap2/gpmc-onenand.c deleted file mode 100644 index 2944af820558..000000000000 --- a/arch/arm/mach-omap2/gpmc-onenand.c +++ /dev/null @@ -1,409 +0,0 @@ -/* - * linux/arch/arm/mach-omap2/gpmc-onenand.c - * - * Copyright (C) 2006 - 2009 Nokia Corporation - * Contacts: Juha Yrjola - * Tony Lindgren - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "soc.h" - -#define ONENAND_IO_SIZE SZ_128K - -#define ONENAND_FLAG_SYNCREAD (1 << 0) -#define ONENAND_FLAG_SYNCWRITE (1 << 1) -#define ONENAND_FLAG_HF (1 << 2) -#define ONENAND_FLAG_VHF (1 << 3) - -static unsigned onenand_flags; -static unsigned latency; - -static struct omap_onenand_platform_data *gpmc_onenand_data; - -static struct resource gpmc_onenand_resource = { - .flags = IORESOURCE_MEM, -}; - -static struct platform_device gpmc_onenand_device = { - .name = "omap2-onenand", - .id = -1, - .num_resources = 1, - .resource = &gpmc_onenand_resource, -}; - -static struct gpmc_settings onenand_async = { - .device_width = GPMC_DEVWIDTH_16BIT, - .mux_add_data = GPMC_MUX_AD, -}; - -static struct gpmc_settings onenand_sync = { - .burst_read = true, - .burst_wrap = true, - .burst_len = GPMC_BURST_16, - .device_width = GPMC_DEVWIDTH_16BIT, - .mux_add_data = GPMC_MUX_AD, - .wait_pin = 0, -}; - -static void omap2_onenand_calc_async_timings(struct gpmc_timings *t) -{ - struct gpmc_device_timings dev_t; - const int t_cer = 15; - const int t_avdp = 12; - const int t_aavdh = 7; - const int t_ce = 76; - const int t_aa = 76; - const int t_oe = 20; - const int t_cez = 20; /* max of t_cez, t_oez */ - const int t_wpl = 40; - const int t_wph = 30; - - memset(&dev_t, 0, sizeof(dev_t)); - - dev_t.t_avdp_r = max_t(int, t_avdp, t_cer) * 1000; - dev_t.t_avdp_w = dev_t.t_avdp_r; - dev_t.t_aavdh = t_aavdh * 1000; - dev_t.t_aa = t_aa * 1000; - dev_t.t_ce = t_ce * 1000; - dev_t.t_oe = t_oe * 1000; - dev_t.t_cez_r = t_cez * 1000; - dev_t.t_cez_w = dev_t.t_cez_r; - dev_t.t_wpl = t_wpl * 1000; - dev_t.t_wph = t_wph * 1000; - - gpmc_calc_timings(t, &onenand_async, &dev_t); -} - -static void omap2_onenand_set_async_mode(void __iomem *onenand_base) -{ - u32 reg; - - /* Ensure sync read and sync write are disabled */ - reg = readw(onenand_base + ONENAND_REG_SYS_CFG1); - reg &= ~ONENAND_SYS_CFG1_SYNC_READ & ~ONENAND_SYS_CFG1_SYNC_WRITE; - writew(reg, onenand_base + ONENAND_REG_SYS_CFG1); -} - -static void set_onenand_cfg(void __iomem *onenand_base) -{ - u32 reg = ONENAND_SYS_CFG1_RDY | ONENAND_SYS_CFG1_INT; - - reg |= (latency << ONENAND_SYS_CFG1_BRL_SHIFT) | - ONENAND_SYS_CFG1_BL_16; - if (onenand_flags & ONENAND_FLAG_SYNCREAD) - reg |= ONENAND_SYS_CFG1_SYNC_READ; - else - reg &= ~ONENAND_SYS_CFG1_SYNC_READ; - if (onenand_flags & ONENAND_FLAG_SYNCWRITE) - reg |= ONENAND_SYS_CFG1_SYNC_WRITE; - else - reg &= ~ONENAND_SYS_CFG1_SYNC_WRITE; - if (onenand_flags & ONENAND_FLAG_HF) - reg |= ONENAND_SYS_CFG1_HF; - else - reg &= ~ONENAND_SYS_CFG1_HF; - if (onenand_flags & ONENAND_FLAG_VHF) - reg |= ONENAND_SYS_CFG1_VHF; - else - reg &= ~ONENAND_SYS_CFG1_VHF; - - writew(reg, onenand_base + ONENAND_REG_SYS_CFG1); -} - -static int omap2_onenand_get_freq(struct omap_onenand_platform_data *cfg, - void __iomem *onenand_base) -{ - u16 ver = readw(onenand_base + ONENAND_REG_VERSION_ID); - int freq; - - switch ((ver >> 4) & 0xf) { - case 0: - freq = 40; - break; - case 1: - freq = 54; - break; - case 2: - freq = 66; - break; - case 3: - freq = 83; - break; - case 4: - freq = 104; - break; - default: - pr_err("onenand rate not detected, bad GPMC async timings?\n"); - freq = 0; - } - - return freq; -} - -static void omap2_onenand_calc_sync_timings(struct gpmc_timings *t, - unsigned int flags, - int freq) -{ - struct gpmc_device_timings dev_t; - const int t_cer = 15; - const int t_avdp = 12; - const int t_cez = 20; /* max of t_cez, t_oez */ - const int t_wpl = 40; - const int t_wph = 30; - int min_gpmc_clk_period, t_ces, t_avds, t_avdh, t_ach, t_aavdh, t_rdyo; - int div, gpmc_clk_ns; - - if (flags & ONENAND_SYNC_READ) - onenand_flags = ONENAND_FLAG_SYNCREAD; - else if (flags & ONENAND_SYNC_READWRITE) - onenand_flags = ONENAND_FLAG_SYNCREAD | ONENAND_FLAG_SYNCWRITE; - - switch (freq) { - case 104: - min_gpmc_clk_period = 9600; /* 104 MHz */ - t_ces = 3; - t_avds = 4; - t_avdh = 2; - t_ach = 3; - t_aavdh = 6; - t_rdyo = 6; - break; - case 83: - min_gpmc_clk_period = 12000; /* 83 MHz */ - t_ces = 5; - t_avds = 4; - t_avdh = 2; - t_ach = 6; - t_aavdh = 6; - t_rdyo = 9; - break; - case 66: - min_gpmc_clk_period = 15000; /* 66 MHz */ - t_ces = 6; - t_avds = 5; - t_avdh = 2; - t_ach = 6; - t_aavdh = 6; - t_rdyo = 11; - break; - default: - min_gpmc_clk_period = 18500; /* 54 MHz */ - t_ces = 7; - t_avds = 7; - t_avdh = 7; - t_ach = 9; - t_aavdh = 7; - t_rdyo = 15; - onenand_flags &= ~ONENAND_FLAG_SYNCWRITE; - break; - } - - div = gpmc_calc_divider(min_gpmc_clk_period); - gpmc_clk_ns = gpmc_ticks_to_ns(div); - if (gpmc_clk_ns < 15) /* >66MHz */ - onenand_flags |= ONENAND_FLAG_HF; - else - onenand_flags &= ~ONENAND_FLAG_HF; - if (gpmc_clk_ns < 12) /* >83MHz */ - onenand_flags |= ONENAND_FLAG_VHF; - else - onenand_flags &= ~ONENAND_FLAG_VHF; - if (onenand_flags & ONENAND_FLAG_VHF) - latency = 8; - else if (onenand_flags & ONENAND_FLAG_HF) - latency = 6; - else if (gpmc_clk_ns >= 25) /* 40 MHz*/ - latency = 3; - else - latency = 4; - - /* Set synchronous read timings */ - memset(&dev_t, 0, sizeof(dev_t)); - - if (onenand_flags & ONENAND_FLAG_SYNCREAD) - onenand_sync.sync_read = true; - if (onenand_flags & ONENAND_FLAG_SYNCWRITE) { - onenand_sync.sync_write = true; - onenand_sync.burst_write = true; - } else { - dev_t.t_avdp_w = max(t_avdp, t_cer) * 1000; - dev_t.t_wpl = t_wpl * 1000; - dev_t.t_wph = t_wph * 1000; - dev_t.t_aavdh = t_aavdh * 1000; - } - dev_t.ce_xdelay = true; - dev_t.avd_xdelay = true; - dev_t.oe_xdelay = true; - dev_t.we_xdelay = true; - dev_t.clk = min_gpmc_clk_period; - dev_t.t_bacc = dev_t.clk; - dev_t.t_ces = t_ces * 1000; - dev_t.t_avds = t_avds * 1000; - dev_t.t_avdh = t_avdh * 1000; - dev_t.t_ach = t_ach * 1000; - dev_t.cyc_iaa = (latency + 1); - dev_t.t_cez_r = t_cez * 1000; - dev_t.t_cez_w = dev_t.t_cez_r; - dev_t.cyc_aavdh_oe = 1; - dev_t.t_rdyo = t_rdyo * 1000 + min_gpmc_clk_period; - - gpmc_calc_timings(t, &onenand_sync, &dev_t); -} - -static int omap2_onenand_setup_async(void __iomem *onenand_base) -{ - struct gpmc_timings t; - int ret; - - /* - * Note that we need to keep sync_write set for the call to - * omap2_onenand_set_async_mode() to work to detect the onenand - * supported clock rate for the sync timings. - */ - if (gpmc_onenand_data->of_node) { - gpmc_read_settings_dt(gpmc_onenand_data->of_node, - &onenand_async); - if (onenand_async.sync_read || onenand_async.sync_write) { - if (onenand_async.sync_write) - gpmc_onenand_data->flags |= - ONENAND_SYNC_READWRITE; - else - gpmc_onenand_data->flags |= ONENAND_SYNC_READ; - onenand_async.sync_read = false; - } - } - - onenand_async.sync_write = true; - omap2_onenand_calc_async_timings(&t); - - ret = gpmc_cs_program_settings(gpmc_onenand_data->cs, &onenand_async); - if (ret < 0) - return ret; - - ret = gpmc_cs_set_timings(gpmc_onenand_data->cs, &t, &onenand_async); - if (ret < 0) - return ret; - - omap2_onenand_set_async_mode(onenand_base); - - return 0; -} - -static int omap2_onenand_setup_sync(void __iomem *onenand_base, int *freq_ptr) -{ - int ret, freq = *freq_ptr; - struct gpmc_timings t; - - if (!freq) { - /* Very first call freq is not known */ - freq = omap2_onenand_get_freq(gpmc_onenand_data, onenand_base); - if (!freq) - return -ENODEV; - set_onenand_cfg(onenand_base); - } - - if (gpmc_onenand_data->of_node) { - gpmc_read_settings_dt(gpmc_onenand_data->of_node, - &onenand_sync); - } else { - /* - * FIXME: Appears to be legacy code from initial ONENAND commit. - * Unclear what boards this is for and if this can be removed. - */ - if (!cpu_is_omap34xx()) - onenand_sync.wait_on_read = true; - } - - omap2_onenand_calc_sync_timings(&t, gpmc_onenand_data->flags, freq); - - ret = gpmc_cs_program_settings(gpmc_onenand_data->cs, &onenand_sync); - if (ret < 0) - return ret; - - ret = gpmc_cs_set_timings(gpmc_onenand_data->cs, &t, &onenand_sync); - if (ret < 0) - return ret; - - set_onenand_cfg(onenand_base); - - *freq_ptr = freq; - - return 0; -} - -static int gpmc_onenand_setup(void __iomem *onenand_base, int *freq_ptr) -{ - struct device *dev = &gpmc_onenand_device.dev; - unsigned l = ONENAND_SYNC_READ | ONENAND_SYNC_READWRITE; - int ret; - - ret = omap2_onenand_setup_async(onenand_base); - if (ret) { - dev_err(dev, "unable to set to async mode\n"); - return ret; - } - - if (!(gpmc_onenand_data->flags & l)) - return 0; - - ret = omap2_onenand_setup_sync(onenand_base, freq_ptr); - if (ret) - dev_err(dev, "unable to set to sync mode\n"); - return ret; -} - -int gpmc_onenand_init(struct omap_onenand_platform_data *_onenand_data) -{ - int err; - struct device *dev = &gpmc_onenand_device.dev; - - gpmc_onenand_data = _onenand_data; - gpmc_onenand_data->onenand_setup = gpmc_onenand_setup; - gpmc_onenand_device.dev.platform_data = gpmc_onenand_data; - - if (cpu_is_omap24xx() && - (gpmc_onenand_data->flags & ONENAND_SYNC_READWRITE)) { - dev_warn(dev, "OneNAND using only SYNC_READ on 24xx\n"); - gpmc_onenand_data->flags &= ~ONENAND_SYNC_READWRITE; - gpmc_onenand_data->flags |= ONENAND_SYNC_READ; - } - - if (cpu_is_omap34xx()) - gpmc_onenand_data->flags |= ONENAND_IN_OMAP34XX; - else - gpmc_onenand_data->flags &= ~ONENAND_IN_OMAP34XX; - - err = gpmc_cs_request(gpmc_onenand_data->cs, ONENAND_IO_SIZE, - (unsigned long *)&gpmc_onenand_resource.start); - if (err < 0) { - dev_err(dev, "Cannot request GPMC CS %d, error %d\n", - gpmc_onenand_data->cs, err); - return err; - } - - gpmc_onenand_resource.end = gpmc_onenand_resource.start + - ONENAND_IO_SIZE - 1; - - err = platform_device_register(&gpmc_onenand_device); - if (err) { - dev_err(dev, "Unable to register OneNAND device\n"); - gpmc_cs_free(gpmc_onenand_data->cs); - } - - return err; -} diff --git a/include/linux/platform_data/mtd-onenand-omap2.h b/include/linux/platform_data/mtd-onenand-omap2.h deleted file mode 100644 index 56ff0e6f5ad1..000000000000 --- a/include/linux/platform_data/mtd-onenand-omap2.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (C) 2006 Nokia Corporation - * Author: Juha Yrjola - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MTD_ONENAND_OMAP2_H -#define __MTD_ONENAND_OMAP2_H - -#include -#include - -#define ONENAND_SYNC_READ (1 << 0) -#define ONENAND_SYNC_READWRITE (1 << 1) -#define ONENAND_IN_OMAP34XX (1 << 2) - -struct omap_onenand_platform_data { - int cs; - int gpio_irq; - struct mtd_partition *parts; - int nr_parts; - int (*onenand_setup)(void __iomem *, int *freq_ptr); - int dma_channel; - u8 flags; - u8 regulator_can_sleep; - u8 skip_initial_unlocking; - - /* for passing the partitions */ - struct device_node *of_node; -}; -#endif -- cgit v1.2.3 From f10ee3dcc9f0aba92a5c4c064628be5200765dc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 00:23:57 +0100 Subject: x86/pti: Fix !PCID and sanitize defines The switch to the user space page tables in the low level ASM code sets unconditionally bit 12 and bit 11 of CR3. Bit 12 is switching the base address of the page directory to the user part, bit 11 is switching the PCID to the PCID associated with the user page tables. This fails on a machine which lacks PCID support because bit 11 is set in CR3. Bit 11 is reserved when PCID is inactive. While the Intel SDM claims that the reserved bits are ignored when PCID is disabled, the AMD APM states that they should be cleared. This went unnoticed as the AMD APM was not checked when the code was developed and reviewed and test systems with Intel CPUs never failed to boot. The report is against a Centos 6 host where the guest fails to boot, so it's not yet clear whether this is a virt issue or can happen on real hardware too, but thats irrelevant as the AMD APM clearly ask for clearing the reserved bits. Make sure that on non PCID machines bit 11 is not set by the page table switching code. Andy suggested to rename the related bits and masks so they are clearly describing what they should be used for, which is done as well for clarity. That split could have been done with alternatives but the macro hell is horrible and ugly. This can be done on top if someone cares to remove the extra orq. For now it's a straight forward fix. Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Laura Abbott Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Willy Tarreau Cc: David Woodhouse Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801140009150.2371@nanos --- arch/x86/entry/calling.h | 36 ++++++++++++++++++---------------- arch/x86/include/asm/processor-flags.h | 2 +- arch/x86/include/asm/tlbflush.h | 6 +++--- 3 files changed, 23 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 45a63e00a6af..3f48f695d5e6 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two * halves: */ -#define PTI_SWITCH_PGTABLES_MASK (1<= (1 << X86_CR3_PTI_SWITCH_BIT)); + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); /* * The ASID being passed in here should have respected the * MAX_ASID_AVAILABLE and thus never have the switch bit set. */ - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); #endif /* * The dynamically-assigned ASIDs that get passed in are small @@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid) { u16 ret = kern_pcid(asid); #ifdef CONFIG_PAGE_TABLE_ISOLATION - ret |= 1 << X86_CR3_PTI_SWITCH_BIT; + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; #endif return ret; } -- cgit v1.2.3 From 99a9dc98ba52267ce5e062b52de88ea1f1b2a7d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 14 Jan 2018 11:27:13 +0100 Subject: x86,perf: Disable intel_bts when PTI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_bts driver does not use the 'normal' BTS buffer which is exposed through the cpu_entry_area but instead uses the memory allocated for the perf AUX buffer. This obviously comes apart when using PTI because then the kernel mapping; which includes that AUX buffer memory; disappears. Fixing this requires to expose a mapping which is visible in all context and that's not trivial. As a quick fix disable this driver when PTI is enabled to prevent malfunction. Fixes: 385ce0ea4c07 ("x86/mm/pti: Add Kconfig") Reported-by: Vince Weaver Reported-by: Robert Święcki Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Alexander Shishkin Cc: greg@kroah.com Cc: hughd@google.com Cc: luto@amacapital.net Cc: Vince Weaver Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180114102713.GB6166@worktop.programming.kicks-ass.net --- arch/x86/events/intel/bts.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 141e07b06216..24ffa1e88cf9 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -582,6 +582,24 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PTI)) { + /* + * BTS hardware writes through a virtual memory map we must + * either use the kernel physical map, or the user mapping of + * the AUX buffer. + * + * However, since this driver supports per-CPU and per-task inherit + * we cannot use the user mapping since it will not be availble + * if we're not running the owning process. + * + * With PTI we can't use the kernal map either, because its not + * there when we run userspace. + * + * For now, disable this driver when using PTI. + */ + return -ENODEV; + } + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; -- cgit v1.2.3 From da4ae6c4a0b8dee5a5377a385545d2250fa8cddb Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 22 Dec 2017 00:27:54 -0500 Subject: x86/tsc: Future-proof native_calibrate_tsc() If the crystal frequency cannot be determined via CPUID(15).crystal_khz or the built-in table then native_calibrate_tsc() will still set the X86_FEATURE_TSC_KNOWN_FREQ flag which prevents the refined TSC calibration. As a consequence such systems use cpu_khz for the TSC frequency which is incorrect when cpu_khz != tsc_khz resulting in time drift. Return early when the crystal frequency cannot be retrieved without setting the X86_FEATURE_TSC_KNOWN_FREQ flag. This ensures that the refined TSC calibration is invoked. [ tglx: Steam-blastered changelog. Sigh ] Fixes: 4ca4df0b7eb0 ("x86/tsc: Mark TSC frequency determined by CPUID as known") Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: Bin Gao Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/0fe2503aa7d7fc69137141fc705541a78101d2b9.1513920414.git.len.brown@intel.com --- arch/x86/kernel/tsc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8ea117f8142e..ce4b71119c36 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -612,6 +612,8 @@ unsigned long native_calibrate_tsc(void) } } + if (crystal_khz == 0) + return 0; /* * TSC frequency determined by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This -- cgit v1.2.3 From b511203093489eb1829cb4de86e8214752205ac6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 22 Dec 2017 00:27:55 -0500 Subject: x86/tsc: Fix erroneous TSC rate on Skylake Xeon The INTEL_FAM6_SKYLAKE_X hardcoded crystal_khz value of 25MHZ is problematic: - SKX workstations (with same model # as server variants) use a 24 MHz crystal. This results in a -4.0% time drift rate on SKX workstations. - SKX servers subject the crystal to an EMI reduction circuit that reduces its actual frequency by (approximately) -0.25%. This results in -1 second per 10 minute time drift as compared to network time. This issue can also trigger a timer and power problem, on configurations that use the LAPIC timer (versus the TSC deadline timer). Clock ticks scheduled with the LAPIC timer arrive a few usec before the time they are expected (according to the slow TSC). This causes Linux to poll-idle, when it should be in an idle power saving state. The idle and clock code do not graciously recover from this error, sometimes resulting in significant polling and measurable power impact. Stop using native_calibrate_tsc() for INTEL_FAM6_SKYLAKE_X. native_calibrate_tsc() will return 0, boot will run with tsc_khz = cpu_khz, and the TSC refined calibration will update tsc_khz to correct for the difference. [ tglx: Sanitized change log ] Fixes: 6baf3d61821f ("x86/tsc: Add additional Intel CPU models to the crystal quirk list") Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: Prarit Bhargava Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com --- arch/x86/kernel/tsc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce4b71119c36..3bf4df7f52d7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -602,7 +602,6 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; -- cgit v1.2.3 From 4b5b2127238e689ee18aa6752959751dd61c4c73 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 22 Dec 2017 00:27:56 -0500 Subject: x86/tsc: Print tsc_khz, when it differs from cpu_khz If CPU and TSC frequency are the same the printout of the CPU frequency is valid for the TSC as well: tsc: Detected 2900.000 MHz processor If the TSC frequency is different there is no information in dmesg. Add a conditional printout: tsc: Detected 2904.000 MHz TSC Signed-off-by: Len Brown Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Link: https://lkml.kernel.org/r/537b342debcd8e8aebc8d631015dcdf9f9ba8a26.1513920414.git.len.brown@intel.com --- arch/x86/kernel/tsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3bf4df7f52d7..e169e85db434 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1316,6 +1316,12 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); -- cgit v1.2.3 From beacd6f7ed5e2915959442245b3b2480c2e37490 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 12 Jan 2018 14:31:35 -0600 Subject: x86/mm/pkeys: Fix fill_sig_info_pkey SEGV_PKUERR is a signal specific si_code which happens to have the same numeric value as several others: BUS_MCEERR_AR, ILL_ILLTRP, FPE_FLTOVF, TRAP_HWBKPT, CLD_TRAPPED, POLL_ERR, SEGV_THREAD_ID, as such it is not safe to just test the si_code the signal number must also be tested to prevent a false positive in fill_sig_info_pkey. This error was by inspection, and BUS_MCEERR_AR appears to be a real candidate for confusion. So pass in si_signo and check for SIG_SEGV to verify that it is actually a SEGV_PKUERR Fixes: 019132ff3daf ("x86/mm/pkeys: Fill in pkey field in siginfo") Signed-off-by: "Eric W. Biederman" Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Dave Hansen Cc: Oleg Nesterov Cc: Al Viro cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180112203135.4669-2-ebiederm@xmission.com --- arch/x86/mm/fault.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 06fe3d51d385..b3e40773dce0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -172,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) +static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, + u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; /* Fault not from Protection Keys: nothing to do */ - if (si_code != SEGV_PKUERR) + if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) return; /* * force_sig_info_fault() is called from a number of @@ -218,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, pkey); + fill_sig_info_pkey(si_signo, si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } -- cgit v1.2.3 From fc90ccfd286eabb05ec54521367df8663cf0bbbf Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Tue, 28 Nov 2017 16:53:50 +0200 Subject: Revert "x86/apic: Remove init_bsp_APIC()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b371ae0d4a194b178817b0edfb6a7395c7aec37a. It causes boot hangs on old P3/P4 systems when the local APIC is enforced in UP mode. Reported-by: Meelis Roos Signed-off-by: Ville Syrjälä Signed-off-by: Thomas Gleixner Cc: Dou Liyang Cc: yinghai@kernel.org Cc: bhe@redhat.com Link: https://lkml.kernel.org/r/20171128145350.21560-1-ville.syrjala@linux.intel.com --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit.c | 3 +++ 3 files changed, 53 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a9e57f08bfa6..98722773391d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -136,6 +136,7 @@ extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); extern void apic_intr_mode_init(void); extern void setup_local_APIC(void); extern void init_apic_mappings(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 880441f24146..25ddf02598d2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1286,6 +1286,55 @@ static int __init apic_intr_mode_select(void) return APIC_SYMMETRIC_IO; } +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} + /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 8da3e909e967..a539410c4ea9 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -61,6 +61,9 @@ void __init init_ISA_irqs(void) struct irq_chip *chip = legacy_pic->chip; int i; +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif legacy_pic->init(0); for (i = 0; i < nr_legacy_irqs(); i++) -- cgit v1.2.3 From 7cf1aaa2ad3855bd5e95bef382a66fe122fc9b01 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Dec 2017 16:18:16 -0800 Subject: x86/timer: Don't inline __const_udelay __const_udelay is marked inline, and LTO will happily inline it everywhere Dropping the inline saves ~44k text in a LTO build. 13999560 1740864 1499136 17239560 1070e08 vmlinux-with-udelay-inline 13954764 1736768 1499136 17190668 1064f0c vmlinux-wo-udelay-inline Inlining it has no advantage in general, so its the right thing to do. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171222001821.2157-2-andi@firstfloor.org --- arch/x86/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 553f8fd23cc4..09c83b2f80d2 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -162,7 +162,7 @@ void __delay(unsigned long loops) } EXPORT_SYMBOL(__delay); -inline void __const_udelay(unsigned long xloops) +void __const_udelay(unsigned long xloops) { unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; -- cgit v1.2.3 From 80a3e3949b8f3a3efa853d8752fd7ed5ec02de2d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Dec 2017 16:18:20 -0800 Subject: x86/extable: Mark exception handler functions visible Mark the C exception handler functions that are directly called through exception tables visible. LTO needs to know they are accessed from assembler. [ tglx: Mopped up the wrecked argument alignment. Sigh.... ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20171222001821.2157-6-andi@firstfloor.org --- arch/x86/mm/extable.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 9fe656c42aa5..45f5d6cf65ae 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -21,16 +21,16 @@ ex_fixup_handler(const struct exception_table_entry *x) return (ex_handler_t)((unsigned long)&x->handler + x->handler); } -bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { regs->ip = ex_fixup_addr(fixup); return true; } EXPORT_SYMBOL(ex_handler_default); -bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; @@ -42,8 +42,8 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * Handler for UD0 exception following a failed test against the * result of a refcount inc/dec/add/sub. */ -bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { /* First unconditionally saturate the refcount. */ *(int *)regs->cx = INT_MIN / 2; @@ -95,8 +95,8 @@ EXPORT_SYMBOL(ex_handler_refcount); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ -bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { regs->ip = ex_fixup_addr(fixup); @@ -108,8 +108,8 @@ bool ex_handler_fprestore(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -bool ex_handler_ext(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { /* Special hack for uaccess_err */ current->thread.uaccess_err = 1; @@ -118,8 +118,8 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); -bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) @@ -133,8 +133,8 @@ bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); -bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, @@ -147,8 +147,8 @@ bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr) +__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); @@ -157,7 +157,7 @@ bool ex_handler_clear_fs(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_clear_fs); -bool ex_has_fault_handler(unsigned long ip) +__visible bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; ex_handler_t handler; -- cgit v1.2.3 From 327867faa4d66628fcd92a843adb3345736a5313 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Dec 2017 16:18:21 -0800 Subject: x86/idt: Mark IDT tables __initconst const variables must use __initconst, not __initdata. Fix this up for the IDT tables, which got it consistently wrong. Fixes: 16bc18d895ce ("x86/idt: Move 32-bit idt_descr to C code") Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171222001821.2157-7-andi@firstfloor.org --- arch/x86/kernel/idt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d985cef3984f..56d99be3706a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -56,7 +56,7 @@ struct idt_data { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_idts[] = { +static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), SYSG(X86_TRAP_BP, int3), #ifdef CONFIG_X86_32 @@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = { * the traps which use them are reinitialized with IST after cpu_init() has * set up TSS. */ -static const __initdata struct idt_data def_idts[] = { +static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), @@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = { /* * The APIC and SMP idt entries */ -static const __initdata struct idt_data apic_idts[] = { +static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP INTG(RESCHEDULE_VECTOR, reschedule_interrupt), INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), @@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_pf_idts[] = { +static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, page_fault), }; @@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = { * Override for the debug_idt. Same as the default, but with interrupt * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ -static const __initdata struct idt_data dbg_idts[] = { +static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), INTG(X86_TRAP_BP, int3), }; @@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * The exceptions which use Interrupt stacks. They are setup after * cpu_init() when the TSS has been initialized. */ -static const __initdata struct idt_data ist_idts[] = { +static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), SISTG(X86_TRAP_BP, int3, DEBUG_STACK), -- cgit v1.2.3 From 30c7e5b123673d5e570e238dbada2fb68a87212c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Dec 2017 10:20:11 +0100 Subject: x86/tsc: Allow TSC calibration without PIT Zhang Rui reported that a Surface Pro 4 will fail to boot with lapic=notscdeadline. Part of the problem is that that machine doesn't have a PIT. If, for some reason, the TSC init has to fall back to TSC calibration, it relies on the PIT to be present. Allow TSC calibration to reliably fall back to HPET. The below results in an accurate TSC measurement when forced on a IVB: tsc: Unable to calibrate against PIT tsc: No reference (HPET/PMTIMER) available tsc: Unable to calibrate against PIT tsc: using HPET reference calibration tsc: Detected 2792.451 MHz processor Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: len.brown@intel.com Cc: rui.zhang@intel.com Link: https://lkml.kernel.org/r/20171222092243.333145937@infradead.org --- arch/x86/include/asm/i8259.h | 5 +++++ arch/x86/kernel/tsc.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index c8376b40e882..5cdcdbd4d892 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,11 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; +static inline bool has_legacy_pic(void) +{ + return legacy_pic != &null_legacy_pic; +} + static inline int nr_legacy_irqs(void) { return legacy_pic->nr_legacy_irqs; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e169e85db434..a2c9dd8bfc6f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -25,6 +25,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -363,6 +364,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) unsigned long tscmin, tscmax; int pitcnt; + if (!has_legacy_pic()) { + /* + * Relies on tsc_early_delay_calibrate() to have given us semi + * usable udelay(), wait for the same 50ms we would have with + * the PIT loop below. + */ + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + return ULONG_MAX; + } + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -487,6 +502,9 @@ static unsigned long quick_pit_calibrate(void) u64 tsc, delta; unsigned long d1, d2; + if (!has_legacy_pic()) + return 0; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); -- cgit v1.2.3 From 6d671e1b85c63e7a337ba76c1a154c091545cff8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Dec 2017 10:20:12 +0100 Subject: x86/time: Unconditionally register legacy timer interrupt Even without a PIC/PIT the legacy timer interrupt is required for HPET in legacy replacement mode. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: len.brown@intel.com Cc: rui.zhang@intel.com Link: https://lkml.kernel.org/r/20171222092243.382623763@infradead.org --- arch/x86/kernel/time.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 749d189f8cd4..774ebafa97c4 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -69,9 +69,12 @@ static struct irqaction irq0 = { static void __init setup_default_timer_irq(void) { - if (!nr_legacy_irqs()) - return; - setup_irq(0, &irq0); + /* + * Unconditionally register the legacy timer; even without legacy + * PIC/PIT we need this for the HPET0 in legacy replacement mode. + */ + if (setup_irq(0, &irq0)) + pr_info("Failed to register legacy timer interrupt\n"); } /* Default timer init function */ -- cgit v1.2.3 From aa83c45762a242acce9b35020363225a7b59d7c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Dec 2017 10:20:13 +0100 Subject: x86/tsc: Introduce early tsc clocksource Without TSC_KNOWN_FREQ the TSC clocksource is registered so late that the kernel first switches to the HPET. Using HPET on large CPU count machines is undesirable. Therefore register a tsc-early clocksource using the preliminary tsc_khz from quick calibration. Then when the final TSC calibration is done, it can switch to the tuned frequency. The only notably problem is that the real tsc clocksource must be marked with CLOCK_SOURCE_VALID_FOR_HRES, otherwise it will not be selected when unregistering tsc-early. tsc-early cannot be left registered, because then the clocksource code would fall back to it when we tsc clocksource is marked unstable later. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: len.brown@intel.com Cc: rui.zhang@intel.com Cc: Len Brown Link: https://lkml.kernel.org/r/20171222092243.431585460@infradead.org --- arch/x86/kernel/tsc.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a2c9dd8bfc6f..fb4302738410 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1006,8 +1006,6 @@ static void __init detect_art(void) /* clocksource code */ -static struct clocksource clocksource_tsc; - static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); @@ -1058,12 +1056,31 @@ static void tsc_cs_tick_stable(struct clocksource *cs) /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ +static struct clocksource clocksource_tsc_early = { + .name = "tsc-early", + .rating = 299, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, +}; + +/* + * Must mark VALID_FOR_HRES early such that when we unregister tsc_early + * this one will immediately take over. We will only register if TSC has + * been found good. + */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, @@ -1187,8 +1204,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) int cpu; /* Don't bother refining TSC on unstable systems */ - if (check_tsc_unstable()) - goto out; + if (tsc_unstable) + return; /* * Since the work is started early in boot, we may be @@ -1240,9 +1257,13 @@ static void tsc_refine_calibration_work(struct work_struct *work) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: + if (tsc_unstable) + return; + if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); + clocksource_unregister(&clocksource_tsc_early); } @@ -1251,13 +1272,11 @@ static int __init init_tsc_clocksource(void) if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; + if (check_tsc_unstable()) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1270,6 +1289,7 @@ static int __init init_tsc_clocksource(void) if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); + clocksource_unregister(&clocksource_tsc_early); return 0; } @@ -1374,9 +1394,12 @@ void __init tsc_init(void) check_system_tsc_reliable(); - if (unsynchronized_tsc()) + if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); + return; + } + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } -- cgit v1.2.3 From 32c9c801a853f181448ed4e8730168c556f9e05a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:43 +0100 Subject: x86/apic: Install an empty physflat_init_apic_ldr As the comment already stated, there is no need for setting up LDR (and DFR) in physflat mode as it remains unused (see SDM, 10.6.2.1). flat_init_apic_ldr only served as a placeholder for a nop operation so far, causing no harm. That will change when running over the Jailhouse hypervisor. Here we must not touch LDR in a way that destroys the mapping originally set up by the Linux root cell. Jailhouse enforces this setting in order to efficiently validate any IPI requests sent by a cell. Avoid a needless clash caused by flat_init_apic_ldr by installing a true nop handler. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/f9867d294cdae4d45ed89d3a2e6adb524f4f6794.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/apic/apic_flat_64.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 25a87028cb3f..4b5547789713 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -218,6 +218,15 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static void physflat_init_apic_ldr(void) +{ + /* + * LDR and DFR are not involved in physflat mode, rather: + * "In physical destination mode, the destination processor is + * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1) + */ +} + static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -251,8 +260,7 @@ static struct apic apic_physflat __ro_after_init = { .dest_logical = 0, .check_apicid_used = NULL, - /* not needed, but shouldn't hurt: */ - .init_apic_ldr = flat_init_apic_ldr, + .init_apic_ldr = physflat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, -- cgit v1.2.3 From e348caef8b4a161cc27bec8f7500b7e100370ef1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:44 +0100 Subject: x86/platform: Control warm reset setup via legacy feature flag Allow to turn off the setup of BIOS-managed warm reset via a new flag in x86_legacy_features. Besides the UV1, the upcoming jailhose guest support needs this switched off. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/44376558129d70a2c1527959811371ef4b82e829.1511770314.git.jan.kiszka@siemens.com --- arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/platform-quirks.c | 1 + arch/x86/kernel/smpboot.c | 4 ++-- 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index aa4747569e23..fc2f082ac635 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -212,6 +212,7 @@ enum x86_legacy_i8042_state { struct x86_legacy_features { enum x86_legacy_i8042_state i8042; int rtc; + int warm_reset; int no_vga; int reserve_bios_regions; struct x86_legacy_devices devices; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e1b8e8bf6b3c..6de35fc8fb3a 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -316,6 +316,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) } else if (!strcmp(oem_table_id, "UVH")) { /* Only UV1 systems: */ uv_system_type = UV_NON_UNIQUE_APIC; + x86_platform.legacy.warm_reset = 0; __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); uv_set_apicid_hibit(); uv_apic = 1; diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 39a59299bfa0..235fe6008ac8 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -9,6 +9,7 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; x86_platform.legacy.rtc = 1; + x86_platform.legacy.warm_reset = 1; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ed556d50d7ed..9adcae1b135c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -934,7 +934,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * the targeted processor. */ - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + if (x86_platform.legacy.warm_reset) { pr_debug("Setting warm reset code and vector.\n"); @@ -1006,7 +1006,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* mark "stuck" area as not stuck */ *trampoline_status = 0; - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + if (x86_platform.legacy.warm_reset) { /* * Cleanup possible dangling ends... */ -- cgit v1.2.3 From a09c5ec00a120dae52eceef3eebff93ed729bb43 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:45 +0100 Subject: x86: Introduce and use MP IRQ trigger and polarity defines MP_IRQDIR_* constants pointed in the right direction but remained unused so far: It's cleaner to use symbolic values for the IRQ flags in the MP config table. That also saves some comments. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/60809926663a1d38e2a5db47d020d6e2e7a70019.1511770314.git.jan.kiszka@siemens.com --- arch/x86/include/asm/mpspec_def.h | 14 +++++++++++--- arch/x86/kernel/apic/io_apic.c | 20 ++++++++++---------- arch/x86/kernel/mpparse.c | 23 ++++++++++++++--------- arch/x86/platform/intel-mid/sfi.c | 5 ++--- 4 files changed, 37 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index a6bec8028480..6fb923a34309 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -128,9 +128,17 @@ enum mp_irq_source_types { mp_ExtINT = 3 }; -#define MP_IRQDIR_DEFAULT 0 -#define MP_IRQDIR_HIGH 1 -#define MP_IRQDIR_LOW 3 +#define MP_IRQPOL_DEFAULT 0x0 +#define MP_IRQPOL_ACTIVE_HIGH 0x1 +#define MP_IRQPOL_RESERVED 0x2 +#define MP_IRQPOL_ACTIVE_LOW 0x3 +#define MP_IRQPOL_MASK 0x3 + +#define MP_IRQTRIG_DEFAULT 0x0 +#define MP_IRQTRIG_EDGE 0x4 +#define MP_IRQTRIG_RESERVED 0x8 +#define MP_IRQTRIG_LEVEL 0xc +#define MP_IRQTRIG_MASK 0xc #define MP_APIC_ALL 0xFF diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8a7963421460..8ad2e410974f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -800,18 +800,18 @@ static int irq_polarity(int idx) /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 0x03) { - case 0: + switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { + case MP_IRQPOL_DEFAULT: /* conforms to spec, ie. bus-type dependent polarity */ if (test_bit(bus, mp_bus_not_pci)) return default_ISA_polarity(idx); else return default_PCI_polarity(idx); - case 1: + case MP_IRQPOL_ACTIVE_HIGH: return IOAPIC_POL_HIGH; - case 2: + case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); - case 3: + case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_POL_LOW; } @@ -845,8 +845,8 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { - case 0: + switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { + case MP_IRQTRIG_DEFAULT: /* conforms to spec, ie. bus-type dependent trigger mode */ if (test_bit(bus, mp_bus_not_pci)) trigger = default_ISA_trigger(idx); @@ -854,11 +854,11 @@ static int irq_trigger(int idx) trigger = default_PCI_trigger(idx); /* Take EISA into account */ return eisa_irq_trigger(idx, bus, trigger); - case 1: + case MP_IRQTRIG_EDGE: return IOAPIC_EDGE; - case 2: + case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); - case 3: + case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_LEVEL; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3a4b12809ab5..27d0a1712663 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -281,7 +281,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) int ELCR_fallback = 0; intsrc.type = MP_INTSRC; - intsrc.irqflag = 0; /* conforming */ + intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; intsrc.srcbus = 0; intsrc.dstapic = mpc_ioapic_id(0); @@ -324,10 +324,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) * copy that information over to the MP table in the * irqflag field (level sensitive, active high polarity). */ - if (ELCR_trigger(i)) - intsrc.irqflag = 13; - else - intsrc.irqflag = 0; + if (ELCR_trigger(i)) { + intsrc.irqflag = MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_HIGH; + } else { + intsrc.irqflag = MP_IRQTRIG_DEFAULT | + MP_IRQPOL_DEFAULT; + } } intsrc.srcbusirq = i; @@ -419,7 +422,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) construct_ioapic_table(mpc_default_type); lintsrc.type = MP_LINTSRC; - lintsrc.irqflag = 0; /* conforming */ + lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; lintsrc.srcbusid = 0; lintsrc.srcbusirq = 0; lintsrc.destapic = MP_APIC_ALL; @@ -664,7 +667,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m) if (m->irqtype != mp_INT) return 0; - if (m->irqflag != 0x0f) + if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW)) return 0; /* not legacy */ @@ -673,7 +676,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m) if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].irqflag != 0x0f) + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) continue; if (mp_irqs[i].srcbus != m->srcbus) @@ -784,7 +788,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].irqflag != 0x0f) + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) continue; if (nr_m_spare > 0) { diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 19b43e3a9f0f..7be1e1fe9ae3 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -96,8 +96,7 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pentry->freq_hz, pentry->irq); mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; - /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; + mp_irq.irqflag = MP_IRQTRIG_EDGE | MP_IRQPOL_ACTIVE_HIGH; mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; @@ -168,7 +167,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) totallen, (u32)pentry->phys_addr, pentry->irq); mp_irq.type = MP_INTSRC; mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0xf; /* level trigger and active low */ + mp_irq.irqflag = MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW; mp_irq.srcbus = MP_BUS_ISA; mp_irq.srcbusirq = pentry->irq; /* IRQ */ mp_irq.dstapic = MP_APIC_ALL; -- cgit v1.2.3 From 4a362601baa6fff92b576d85199f1948cec2fb3b Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:46 +0100 Subject: x86/jailhouse: Add infrastructure for running in non-root cell The Jailhouse hypervisor is able to statically partition a multicore system into multiple so-called cells. Linux is used as boot loader and continues to run in the root cell after Jailhouse is enabled. Linux can also run in non-root cells. Jailhouse does not emulate usual x86 devices. It also provides no complex ACPI but basic platform information that the boot loader forwards via setup data. This adds the infrastructure to detect when running in a non-root cell so that the platform can be configured as required in succeeding steps. Support is limited to x86-64 so far, primarily because no boot loader stub exists for i386 and, thus, we wouldn't be able to test the 32-bit path. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/7f823d077b38b1a70c526b40b403f85688c137d3.1511770314.git.jan.kiszka@siemens.com --- arch/x86/Kconfig | 8 ++++ arch/x86/include/asm/hypervisor.h | 1 + arch/x86/include/asm/jailhouse_para.h | 26 +++++++++++++ arch/x86/include/uapi/asm/bootparam.h | 22 +++++++++++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/cpu/hypervisor.c | 4 ++ arch/x86/kernel/jailhouse.c | 73 +++++++++++++++++++++++++++++++++++ 7 files changed, 136 insertions(+) create mode 100644 arch/x86/include/asm/jailhouse_para.h create mode 100644 arch/x86/kernel/jailhouse.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ff4e9cd99854..fbea8d15fcfb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -796,6 +796,14 @@ config PARAVIRT_TIME_ACCOUNTING config PARAVIRT_CLOCK bool +config JAILHOUSE_GUEST + bool "Jailhouse non-root cell support" + depends on X86_64 + ---help--- + This option allows to run Linux as guest in a Jailhouse non-root + cell. You can leave this option disabled if you only want to start + Jailhouse and run Linux afterwards in the root cell. + endif #HYPERVISOR_GUEST config NO_BOOTMEM diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 96aa6b9884dc..8c5aaba6633f 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -28,6 +28,7 @@ enum x86_hypervisor_type { X86_HYPER_XEN_PV, X86_HYPER_XEN_HVM, X86_HYPER_KVM, + X86_HYPER_JAILHOUSE, }; #ifdef CONFIG_HYPERVISOR_GUEST diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h new file mode 100644 index 000000000000..875b54376689 --- /dev/null +++ b/arch/x86/include/asm/jailhouse_para.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL2.0 */ + +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + * Jan Kiszka + */ + +#ifndef _ASM_X86_JAILHOUSE_PARA_H +#define _ASM_X86_JAILHOUSE_PARA_H + +#include + +#ifdef CONFIG_JAILHOUSE_GUEST +bool jailhouse_paravirt(void); +#else +static inline bool jailhouse_paravirt(void) +{ + return false; +} +#endif + +#endif /* _ASM_X86_JAILHOUSE_PARA_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index afdd5ae0fcc4..aebf60357758 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -9,6 +9,7 @@ #define SETUP_PCI 3 #define SETUP_EFI 4 #define SETUP_APPLE_PROPERTIES 5 +#define SETUP_JAILHOUSE 6 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -126,6 +127,27 @@ struct boot_e820_entry { __u32 type; } __attribute__((packed)); +/* + * Smallest compatible version of jailhouse_setup_data required by this kernel. + */ +#define JAILHOUSE_SETUP_REQUIRED_VERSION 1 + +/* + * The boot loader is passing platform information via this Jailhouse-specific + * setup data structure. + */ +struct jailhouse_setup_data { + u16 version; + u16 compatible_version; + u16 pm_timer_address; + u16 num_cpus; + u64 pci_mmconfig_base; + u32 tsc_khz; + u32 apic_khz; + u8 standard_ioapic; + u8 cpu_ids[255]; +} __attribute__((packed)); + /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 81bb565f4497..aed9296dccd3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -112,6 +112,8 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o + obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bea8d3e24f50..479ca4728de0 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -31,6 +31,7 @@ extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; +extern const struct hypervisor_x86 x86_hyper_jailhouse; static const __initconst struct hypervisor_x86 * const hypervisors[] = { @@ -45,6 +46,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_KVM_GUEST &x86_hyper_kvm, #endif +#ifdef CONFIG_JAILHOUSE_GUEST + &x86_hyper_jailhouse, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c new file mode 100644 index 000000000000..1186b8909595 --- /dev/null +++ b/arch/x86/kernel/jailhouse.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL2.0 +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + * Jan Kiszka + */ + +#include +#include +#include +#include + +static __initdata struct jailhouse_setup_data setup_data; + +static uint32_t jailhouse_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0 || + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); +} + +static uint32_t __init jailhouse_detect(void) +{ + return jailhouse_cpuid_base(); +} + +static void __init jailhouse_init_platform(void) +{ + u64 pa_data = boot_params.hdr.setup_data; + struct setup_data header; + void *mapping; + + while (pa_data) { + mapping = early_memremap(pa_data, sizeof(header)); + memcpy(&header, mapping, sizeof(header)); + early_memunmap(mapping, sizeof(header)); + + if (header.type == SETUP_JAILHOUSE && + header.len >= sizeof(setup_data)) { + pa_data += offsetof(struct setup_data, data); + + mapping = early_memremap(pa_data, sizeof(setup_data)); + memcpy(&setup_data, mapping, sizeof(setup_data)); + early_memunmap(mapping, sizeof(setup_data)); + + break; + } + + pa_data = header.next; + } + + if (!pa_data) + panic("Jailhouse: No valid setup data found"); + + if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) + panic("Jailhouse: Unsupported setup data structure"); +} + +bool jailhouse_paravirt(void) +{ + return jailhouse_cpuid_base() != 0; +} + +const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { + .name = "Jailhouse", + .detect = jailhouse_detect, + .init.init_platform = jailhouse_init_platform, +}; -- cgit v1.2.3 From 11c8dc419bbc7b5acef812043feefc53c45ef558 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:47 +0100 Subject: x86/jailhouse: Enable APIC and SMP support Register the APIC which Jailhouse always exposes at 0xfee00000 if in xAPIC mode or via MSRs as x2APIC. The latter is only available if it was already activated because there is no support for switching its mode during runtime. Jailhouse requires the APIC to be operated in phys-flat mode. Ensure that this mode is selected by Linux. The available CPUs are taken from the setup data structure that the loader filled and registered with the kernel. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/8b2255da0a9856c530293a67aa9d6addfe102a2b.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/apic/apic_flat_64.c | 4 +++- arch/x86/kernel/jailhouse.c | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 4b5547789713..fcce5a784c71 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -239,7 +240,8 @@ static void physflat_send_IPI_all(int vector) static int physflat_probe(void) { - if (apic == &apic_physflat || num_possible_cpus() > 8) + if (apic == &apic_physflat || num_possible_cpus() > 8 || + jailhouse_paravirt()) return 1; return 0; diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 1186b8909595..57f49963d8dc 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -29,12 +30,43 @@ static uint32_t __init jailhouse_detect(void) return jailhouse_cpuid_base(); } +static void __init jailhouse_get_smp_config(unsigned int early) +{ + unsigned int cpu; + + if (x2apic_enabled()) { + /* + * We do not have access to IR inside Jailhouse non-root cells. + * So we have to run in physical mode. + */ + x2apic_phys = 1; + + /* + * This will trigger the switch to apic_x2apic_phys. + * Empty OEM IDs ensure that only this APIC driver picks up + * the call. + */ + default_acpi_madt_oem_check("", ""); + } + + register_lapic_address(0xfee00000); + + for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { + generic_processor_info(setup_data.cpu_ids[cpu], + boot_cpu_apic_version); + } + + smp_found_config = 1; +} + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; struct setup_data header; void *mapping; + x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); memcpy(&header, mapping, sizeof(header)); @@ -66,8 +98,18 @@ bool jailhouse_paravirt(void) return jailhouse_cpuid_base() != 0; } +static bool jailhouse_x2apic_available(void) +{ + /* + * The x2APIC is only available if the root cell enabled it. Jailhouse + * does not support switching between xAPIC and x2APIC. + */ + return x2apic_enabled(); +} + const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { .name = "Jailhouse", .detect = jailhouse_detect, .init.init_platform = jailhouse_init_platform, + .init.x2apic_available = jailhouse_x2apic_available, }; -- cgit v1.2.3 From 87e65d05bb0a18e00655a58159790bc8d38e219e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:48 +0100 Subject: x86/jailhouse: Enable PMTIMER Jailhouse exposes the PMTIMER as only reference clock to all cells. Pick up its address from the setup data. Allow to enable the Linux support of it by relaxing its strict dependency on ACPI. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/6d5c3fadd801eb3fba9510e2d3db14a9c404a1a0.1511770314.git.jan.kiszka@siemens.com --- arch/x86/Kconfig | 1 + arch/x86/kernel/jailhouse.c | 4 ++++ drivers/acpi/Kconfig | 32 ++++++++++++++++---------------- 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fbea8d15fcfb..a936e29245d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -799,6 +799,7 @@ config PARAVIRT_CLOCK config JAILHOUSE_GUEST bool "Jailhouse non-root cell support" depends on X86_64 + select X86_PM_TIMER ---help--- This option allows to run Linux as guest in a Jailhouse non-root cell. You can leave this option disabled if you only want to start diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 57f49963d8dc..21c107770d67 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -8,6 +8,7 @@ * Jan Kiszka */ +#include #include #include #include @@ -91,6 +92,9 @@ static void __init jailhouse_init_platform(void) if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) panic("Jailhouse: Unsupported setup data structure"); + + pmtmr_ioport = setup_data.pm_timer_address; + pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); } bool jailhouse_paravirt(void) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 46505396869e..d650c5b6ec90 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -361,22 +361,6 @@ config ACPI_PCI_SLOT i.e., segment/bus/device/function tuples, with physical slots in the system. If you are unsure, say N. -config X86_PM_TIMER - bool "Power Management Timer Support" if EXPERT - depends on X86 - default y - help - The Power Management Timer is available on all ACPI-capable, - in most cases even if ACPI is unusable or blacklisted. - - This timing source is not affected by power management features - like aggressive processor idling, throttling, frequency and/or - voltage scaling, unlike the commonly used Time Stamp Counter - (TSC) timing source. - - You should nearly always say Y here because many modern - systems require this timer. - config ACPI_CONTAINER bool "Container and Module Devices" default (ACPI_HOTPLUG_MEMORY || ACPI_HOTPLUG_CPU) @@ -564,3 +548,19 @@ config TPS68470_PMIC_OPREGION using this, are probed. endif # ACPI + +config X86_PM_TIMER + bool "Power Management Timer Support" if EXPERT + depends on X86 && (ACPI || JAILHOUSE_GUEST) + default y + help + The Power Management Timer is available on all ACPI-capable, + in most cases even if ACPI is unusable or blacklisted. + + This timing source is not affected by power management features + like aggressive processor idling, throttling, frequency and/or + voltage scaling, unlike the commonly used Time Stamp Counter + (TSC) timing source. + + You should nearly always say Y here because many modern + systems require this timer. -- cgit v1.2.3 From e85eb632f651e70252bb18b292efaf6961164e32 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:49 +0100 Subject: x86/jailhouse: Set up timekeeping Get the precalibrated frequencies for the TSC and the APIC timer from the Jailhouse platform info and set the kernel values accordingly. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/b2557426332fc337a74d3141cb920f7dce9ad601.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 21c107770d67..34cf9d3e1751 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -16,6 +16,7 @@ #include static __initdata struct jailhouse_setup_data setup_data; +static unsigned int precalibrated_tsc_khz; static uint32_t jailhouse_cpuid_base(void) { @@ -31,6 +32,16 @@ static uint32_t __init jailhouse_detect(void) return jailhouse_cpuid_base(); } +static void __init jailhouse_timer_init(void) +{ + lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); +} + +static unsigned long jailhouse_get_tsc(void) +{ + return precalibrated_tsc_khz; +} + static void __init jailhouse_get_smp_config(unsigned int early) { unsigned int cpu; @@ -66,8 +77,12 @@ static void __init jailhouse_init_platform(void) struct setup_data header; void *mapping; + x86_init.timers.timer_init = jailhouse_timer_init; x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); memcpy(&header, mapping, sizeof(header)); @@ -95,6 +110,8 @@ static void __init jailhouse_init_platform(void) pmtmr_ioport = setup_data.pm_timer_address; pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); + + precalibrated_tsc_khz = setup_data.tsc_khz; } bool jailhouse_paravirt(void) -- cgit v1.2.3 From 0d7c1e22183b9ddaa0b3bf30ece6577741bc13b3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:50 +0100 Subject: x86/jailhouse: Avoid access of unsupported platform resources Non-root cells do not have CMOS access, thus the warm reset cannot be enabled. There is no RTC, thus also no wall clock. Furthermore, there are no ISA IRQs and no PIC. Also disable probing of i8042 devices that are typically blocked for non-root cells. In theory, access could also be granted to a non-root cell, provided the root cell is not using the devices. But there is no concrete scenario in sight, and disabling probing over Jailhouse allows to build generic kernels that keep CONFIG_SERIO enabled for use in normal systems. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/39b68cc2c496501c9d95e6f40e5d76e3053c3908.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 34cf9d3e1751..b9f116d62f81 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static __initdata struct jailhouse_setup_data setup_data; @@ -32,6 +33,11 @@ static uint32_t __init jailhouse_detect(void) return jailhouse_cpuid_base(); } +static void jailhouse_get_wallclock(struct timespec *now) +{ + memset(now, 0, sizeof(*now)); +} + static void __init jailhouse_timer_init(void) { lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); @@ -77,11 +83,18 @@ static void __init jailhouse_init_platform(void) struct setup_data header; void *mapping; + x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = jailhouse_timer_init; x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; x86_platform.calibrate_cpu = jailhouse_get_tsc; x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + + legacy_pic = &null_legacy_pic; while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); -- cgit v1.2.3 From 5ae4443010b83cce3d55ce5259870e542a7c9551 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:51 +0100 Subject: x86/jailhouse: Silence ACPI warning Jailhouse support does not depend on ACPI, and does not even use it. But if it should be enabled, avoid warning about its absence in the platform. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/939687007cbd7643b02fd330e8616e7e5944063f.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index b9f116d62f81..54469ef4c3c7 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -125,6 +125,12 @@ static void __init jailhouse_init_platform(void) pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); precalibrated_tsc_khz = setup_data.tsc_khz; + + /* + * Avoid that the kernel complains about missing ACPI tables - there + * are none in a non-root cell. + */ + disable_acpi(); } bool jailhouse_paravirt(void) -- cgit v1.2.3 From fd498076821739db38babe72602f7c227587cbb5 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:52 +0100 Subject: x86/jailhouse: Halt instead of failing to restart Jailhouse provides no guest-initiated restart. So, do not even try to. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/ef8a0ef95c2b17c21066e5f28ea56b58bf7eaa82.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 54469ef4c3c7..2b55672ca05f 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -10,10 +10,12 @@ #include #include +#include #include #include #include #include +#include #include static __initdata struct jailhouse_setup_data setup_data; @@ -77,6 +79,12 @@ static void __init jailhouse_get_smp_config(unsigned int early) smp_found_config = 1; } +static void jailhouse_no_restart(void) +{ + pr_notice("Jailhouse: Restart not supported, halting\n"); + machine_halt(); +} + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; @@ -96,6 +104,8 @@ static void __init jailhouse_init_platform(void) legacy_pic = &null_legacy_pic; + machine_ops.emergency_restart = jailhouse_no_restart; + while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); memcpy(&header, mapping, sizeof(header)); -- cgit v1.2.3 From cf878e169d37b596de41322291523951540984c1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:53 +0100 Subject: x86/jailhouse: Wire up IOAPIC for legacy UART ports The typical I/O interrupts in non-root cells are MSI-based. However, the platform UARTs do not support MSI. In order to run a non-root cell that shall use one of them, the standard IOAPIC must be registered and 1:1 routing for IRQ 3 and 4 set up. If an IOAPIC is not available, the boot loader clears standard_ioapic in the setup data, so registration is skipped. If the guest is not allowed to to use one of those pins, Jailhouse will simply ignore the access. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/90d942dda9d48a8046e00bb3c1bb6757c83227be.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 2b55672ca05f..01d5b06a42bc 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,15 @@ static unsigned long jailhouse_get_tsc(void) static void __init jailhouse_get_smp_config(unsigned int early) { + struct ioapic_domain_cfg ioapic_cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &mp_ioapic_irqdomain_ops, + }; + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + }; unsigned int cpu; if (x2apic_enabled()) { @@ -77,6 +87,17 @@ static void __init jailhouse_get_smp_config(unsigned int early) } smp_found_config = 1; + + if (setup_data.standard_ioapic) { + mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); + + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + mp_irq.srcbusirq = mp_irq.dstirq = 3; + mp_save_irq(&mp_irq); + + mp_irq.srcbusirq = mp_irq.dstirq = 4; + mp_save_irq(&mp_irq); + } } static void jailhouse_no_restart(void) -- cgit v1.2.3 From a0c01e4bb92d085462c293091a521cb9e7000371 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 27 Nov 2017 09:11:54 +0100 Subject: x86/jailhouse: Initialize PCI support With this change, PCI devices can be detected and used inside a non-root cell. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: jailhouse-dev@googlegroups.com Link: https://lkml.kernel.org/r/e8d19494b96b68a749bcac514795d864ad9c28c3.1511770314.git.jan.kiszka@siemens.com --- arch/x86/kernel/jailhouse.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 01d5b06a42bc..d6d5976a9b51 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,19 @@ static void jailhouse_no_restart(void) machine_halt(); } +static int __init jailhouse_pci_arch_init(void) +{ + pci_direct_init(1); + + /* + * There are no bridges on the virtual PCI root bus under Jailhouse, + * thus no other way to discover all devices than a full scan. + */ + pcibios_last_bus = 0xff; + + return 0; +} + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; @@ -115,6 +129,7 @@ static void __init jailhouse_init_platform(void) x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = jailhouse_timer_init; x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; x86_platform.calibrate_cpu = jailhouse_get_tsc; x86_platform.calibrate_tsc = jailhouse_get_tsc; @@ -157,6 +172,8 @@ static void __init jailhouse_init_platform(void) precalibrated_tsc_khz = setup_data.tsc_khz; + pci_probe = 0; + /* * Avoid that the kernel complains about missing ACPI tables - there * are none in a non-root cell. -- cgit v1.2.3 From b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 22:13:29 +0100 Subject: x86/retpoline: Remove compile time warning Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. Linus rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 974c61864978..504b1a4535ac 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -240,8 +240,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - else - $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) endif endif -- cgit v1.2.3 From 0d39e2669d7b0fefd2d8f9e7868ae669b364d9ba Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 10 Jan 2018 18:36:02 +0300 Subject: x86/kasan: Panic if there is not enough memory to boot Currently KASAN doesn't panic in case it don't have enough memory to boot. Instead, it crashes in some random place: kernel BUG at arch/x86/mm/physaddr.c:27! RIP: 0010:__phys_addr+0x268/0x276 Call Trace: kasan_populate_shadow+0x3f2/0x497 kasan_init+0x12e/0x2b2 setup_arch+0x2825/0x2a2c start_kernel+0xc8/0x15f4 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x72/0x75 secondary_startup_64+0xa5/0xb0 Use memblock_virt_alloc_try_nid() for allocations without failure fallback. It will panic with an out of memory message. Reported-by: kernel test robot Signed-off-by: Andrey Ryabinin Signed-off-by: Thomas Gleixner Acked-by: Dmitry Vyukov Cc: kasan-dev@googlegroups.com Cc: Alexander Potapenko Cc: lkp@01.org Link: https://lkml.kernel.org/r/20180110153602.18919-1-aryabinin@virtuozzo.com --- arch/x86/mm/kasan_init_64.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 47388f0c0e59..af6f2f9c6a26 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -21,10 +21,14 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static __init void *early_alloc(size_t size, int nid) +static __init void *early_alloc(size_t size, int nid, bool panic) { - return memblock_virt_alloc_try_nid_nopanic(size, size, - __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); + if (panic) + return memblock_virt_alloc_try_nid(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); + else + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); } static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, @@ -38,14 +42,14 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, if (boot_cpu_has(X86_FEATURE_PSE) && ((end - addr) == PMD_SIZE) && IS_ALIGNED(addr, PMD_SIZE)) { - p = early_alloc(PMD_SIZE, nid); + p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; else if (p) memblock_free(__pa(p), PMD_SIZE); } - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pmd_populate_kernel(&init_mm, pmd, p); } @@ -57,7 +61,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, if (!pte_none(*pte)) continue; - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); set_pte_at(&init_mm, addr, pte, entry); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -75,14 +79,14 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, if (boot_cpu_has(X86_FEATURE_GBPAGES) && ((end - addr) == PUD_SIZE) && IS_ALIGNED(addr, PUD_SIZE)) { - p = early_alloc(PUD_SIZE, nid); + p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; else if (p) memblock_free(__pa(p), PUD_SIZE); } - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pud_populate(&init_mm, pud, p); } @@ -101,7 +105,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, unsigned long next; if (p4d_none(*p4d)) { - void *p = early_alloc(PAGE_SIZE, nid); + void *p = early_alloc(PAGE_SIZE, nid, true); p4d_populate(&init_mm, p4d, p); } @@ -122,7 +126,7 @@ static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, unsigned long next; if (pgd_none(*pgd)) { - p = early_alloc(PAGE_SIZE, nid); + p = early_alloc(PAGE_SIZE, nid, true); pgd_populate(&init_mm, pgd, p); } -- cgit v1.2.3 From c995efd5a740d9cbafbf58bde4973e8b50b4d761 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: x86/retpoline: Fill RSB on context switch for affected CPUs On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 11 +++++++++++ arch/x86/entry/entry_64.S | 11 +++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a1f28a54f23a..60c4c342316c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -244,6 +244,17 @@ ENTRY(__switch_to_asm) movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popl %esi popl %edi diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 59874bc1aed2..d54a0ede61d1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -487,6 +487,17 @@ ENTRY(__switch_to_asm) movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popq %r15 popq %r14 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f275447862f4..aa09559b2c0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,6 +211,7 @@ #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e4dc26185aa7..390b3dc3d438 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -23,6 +23,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); @@ -155,6 +156,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -213,6 +231,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_PTI) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt -- cgit v1.2.3 From 28d437d550e1e39f805d99f9f8ac399c778827b7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Sat, 13 Jan 2018 17:27:30 -0600 Subject: x86/retpoline: Add LFENCE to the retpoline/RSB filling RSB macros The PAUSE instruction is currently used in the retpoline and RSB filling macros as a speculation trap. The use of PAUSE was originally suggested because it showed a very, very small difference in the amount of cycles/time used to execute the retpoline as compared to LFENCE. On AMD, the PAUSE instruction is not a serializing instruction, so the pause/jmp loop will use excess power as it is speculated over waiting for return to mispredict to the correct target. The RSB filling macro is applicable to AMD, and, if software is unable to verify that LFENCE is serializing on AMD (possible when running under a hypervisor), the generic retpoline support will be used and, so, is also applicable to AMD. Keep the current usage of PAUSE for Intel, but add an LFENCE instruction to the speculation trap for AMD. The same sequence has been adopted by GCC for the GCC generated retpolines. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Acked-by: David Woodhouse Acked-by: Arjan van de Ven Cc: Rik van Riel Cc: Andi Kleen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Kosina Cc: Dave Hansen Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Kees Cook Link: https://lkml.kernel.org/r/20180113232730.31060.36287.stgit@tlendack-t1.amdoffice.net --- arch/x86/include/asm/nospec-branch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 402a11c803c3..7b45d8424150 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,7 +11,7 @@ * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; jmp' loop to capture speculative execution. + * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to @@ -38,11 +38,13 @@ call 772f; \ 773: /* speculation trap */ \ pause; \ + lfence; \ jmp 773b; \ 772: \ call 774f; \ 775: /* speculation trap */ \ pause; \ + lfence; \ jmp 775b; \ 774: \ dec reg; \ @@ -73,6 +75,7 @@ call .Ldo_rop_\@ .Lspec_trap_\@: pause + lfence jmp .Lspec_trap_\@ .Ldo_rop_\@: mov \reg, (%_ASM_SP) @@ -165,6 +168,7 @@ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ + " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: addl $4, %%esp;\n" \ -- cgit v1.2.3 From be6d447e4f9c5cc6d48aabc3ec362b6a559c3fd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Jan 2018 10:24:34 +0100 Subject: x86/jailhouse: Hide x2apic code when CONFIG_X86_X2APIC=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x2apic_phys is not available when CONFIG_X86_X2APIC=n and the code is not optimized out resulting in a build fail: jailhouse.c: In function ‘jailhouse_get_smp_config’: jailhouse.c:73:3: error: ‘x2apic_phys’ undeclared (first use in this function) Fixes: 11c8dc419bbc ("x86/jailhouse: Enable APIC and SMP support") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Jan Kiszka Cc: jailhouse-dev@googlegroups.com --- arch/x86/kernel/jailhouse.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index d6d5976a9b51..7ade152133c7 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -52,6 +52,24 @@ static unsigned long jailhouse_get_tsc(void) return precalibrated_tsc_khz; } +static void __init jailhouse_x2apic_init(void) +{ +#ifdef CONFIG_X86_X2APIC + if (!x2apic_enabled()) + return; + /* + * We do not have access to IR inside Jailhouse non-root cells. So + * we have to run in physical mode. + */ + x2apic_phys = 1; + /* + * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs + * ensure that only this APIC driver picks up the call. + */ + default_acpi_madt_oem_check("", ""); +#endif +} + static void __init jailhouse_get_smp_config(unsigned int early) { struct ioapic_domain_cfg ioapic_cfg = { @@ -65,20 +83,7 @@ static void __init jailhouse_get_smp_config(unsigned int early) }; unsigned int cpu; - if (x2apic_enabled()) { - /* - * We do not have access to IR inside Jailhouse non-root cells. - * So we have to run in physical mode. - */ - x2apic_phys = 1; - - /* - * This will trigger the switch to apic_x2apic_phys. - * Empty OEM IDs ensure that only this APIC driver picks up - * the call. - */ - default_acpi_madt_oem_check("", ""); - } + jailhouse_x2apic_init(); register_lapic_address(0xfee00000); -- cgit v1.2.3 From abde587b61a3ddb2918385f95ef2b3ca37d5a017 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jan 2018 16:51:20 +0100 Subject: x86/jailhouse: Add PCI dependency Building jailhouse support without PCI results in a link error: arch/x86/kernel/jailhouse.o: In function `jailhouse_init_platform': jailhouse.c:(.init.text+0x235): undefined reference to `pci_probe' arch/x86/kernel/jailhouse.o: In function `jailhouse_pci_arch_init': jailhouse.c:(.init.text+0x265): undefined reference to `pci_direct_init' jailhouse.c:(.init.text+0x26c): undefined reference to `pcibios_last_bus' Add the missing Kconfig dependency. Fixes: a0c01e4bb92d ("x86/jailhouse: Initialize PCI support") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Jan Kiszka Link: https://lkml.kernel.org/r/20180115155150.51407-1-arnd@arndb.de --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a936e29245d0..390be2eb153d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -798,7 +798,7 @@ config PARAVIRT_CLOCK config JAILHOUSE_GUEST bool "Jailhouse non-root cell support" - depends on X86_64 + depends on X86_64 && PCI select X86_PM_TIMER ---help--- This option allows to run Linux as guest in a Jailhouse non-root -- cgit v1.2.3 From 1303880179e67c59e801429b7e5d0f6b21137d99 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jan 2018 13:25:56 -0600 Subject: x86/mm: Clean up register saving in the __enc_copy() assembly code Clean up the use of PUSH and POP and when registers are saved in the __enc_copy() assembly function in order to improve the readability of the code. Move parameter register saving into general purpose registers earlier in the code and move all the pushes to the beginning of the function with corresponding pops at the end. We do this to prepare fixes. Tested-by: Gabriel Craciunescu Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brijesh Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180110192556.6026.74187.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt_boot.S | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 730e6d541df1..de3688461145 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -103,20 +103,19 @@ ENTRY(__enc_copy) orq $X86_CR4_PGE, %rdx mov %rdx, %cr4 + push %r15 + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + /* Set the PAT register PA5 entry to write-protect */ - push %rcx movl $MSR_IA32_CR_PAT, %ecx rdmsr - push %rdx /* Save original PAT value */ + mov %rdx, %r15 /* Save original PAT value */ andl $0xffff00ff, %edx /* Clear PA5 */ orl $0x00000500, %edx /* Set PA5 to WP */ wrmsr - pop %rdx /* RDX contains original PAT value */ - pop %rcx - - movq %rcx, %r9 /* Save kernel length */ - movq %rdi, %r10 /* Save encrypted kernel address */ - movq %rsi, %r11 /* Save decrypted kernel address */ wbinvd /* Invalidate any cache entries */ @@ -138,12 +137,13 @@ ENTRY(__enc_copy) jnz 1b /* Kernel length not zero? */ /* Restore PAT register */ - push %rdx /* Save original PAT value */ movl $MSR_IA32_CR_PAT, %ecx rdmsr - pop %rdx /* Restore original PAT value */ + mov %r15, %rdx /* Restore original PAT value */ wrmsr + pop %r15 + ret .L__enc_copy_end: ENDPROC(__enc_copy) -- cgit v1.2.3 From bacf6b499e11760aef73a3bb5ce4e5eea74a3fd4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jan 2018 13:26:05 -0600 Subject: x86/mm: Use a struct to reduce parameters for SME PGD mapping In preparation for follow-on patches, combine the PGD mapping parameters into a struct to reduce the number of function arguments and allow for direct updating of the next pagetable mapping area pointer. Tested-by: Gabriel Craciunescu Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brijesh Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180110192605.6026.96206.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 90 ++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 391b13402e40..5a20696c5440 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -464,6 +464,14 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); } +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_val; + unsigned long vaddr; +}; + static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, unsigned long end) { @@ -486,15 +494,14 @@ static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, #define PUD_FLAGS _KERNPG_TABLE_NOENC #define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) -static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, - unsigned long vaddr, pmdval_t pmd_val) +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { pgd_t *pgd_p; p4d_t *p4d_p; pud_t *pud_p; pmd_t *pmd_p; - pgd_p = pgd_base + pgd_index(vaddr); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); if (native_pgd_val(*pgd_p)) { if (IS_ENABLED(CONFIG_X86_5LEVEL)) p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); @@ -504,15 +511,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, pgd_t pgd; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = pgtable_area; + p4d_p = ppd->pgtable_area; memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); } else { - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); } @@ -520,44 +527,41 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, } if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(vaddr); + p4d_p += p4d_index(ppd->vaddr); if (native_p4d_val(*p4d_p)) { pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); } else { p4d_t p4d; - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); native_set_p4d(p4d_p, p4d); } } - pud_p += pud_index(vaddr); + pud_p += pud_index(ppd->vaddr); if (native_pud_val(*pud_p)) { if (native_pud_val(*pud_p) & _PAGE_PSE) - goto out; + return; pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); } else { pud_t pud; - pmd_p = pgtable_area; + pmd_p = ppd->pgtable_area; memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); native_set_pud(pud_p, pud); } - pmd_p += pmd_index(vaddr); + pmd_p += pmd_index(ppd->vaddr); if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(pmd_val)); - -out: - return pgtable_area; + native_set_pmd(pmd_p, native_make_pmd(ppd->pmd_val)); } static unsigned long __init sme_pgtable_calc(unsigned long len) @@ -615,11 +619,10 @@ void __init sme_encrypt_kernel(void) unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; unsigned long kernel_start, kernel_end, kernel_len; + struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; unsigned long paddr, pmd_flags; unsigned long decrypted_base; - void *pgtable_area; - pgd_t *pgd; if (!sme_active()) return; @@ -683,18 +686,18 @@ void __init sme_encrypt_kernel(void) * pagetables and when the new encrypted and decrypted kernel * mappings are populated. */ - pgtable_area = (void *)execute_end; + ppd.pgtable_area = (void *)execute_end; /* * Make sure the current pagetable structure has entries for * addressing the workarea. */ - pgd = (pgd_t *)native_read_cr3_pa(); + ppd.pgd = (pgd_t *)native_read_cr3_pa(); paddr = workarea_start; while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); + ppd.pmd_val = paddr + PMD_FLAGS; + ppd.vaddr = paddr; + sme_populate_pgd_large(&ppd); paddr += PMD_PAGE_SIZE; } @@ -708,17 +711,17 @@ void __init sme_encrypt_kernel(void) * populated with new PUDs and PMDs as the encrypted and decrypted * kernel mappings are created. */ - pgd = pgtable_area; - memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); - pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; /* Add encrypted kernel (identity) mappings */ pmd_flags = PMD_FLAGS | _PAGE_ENC; paddr = kernel_start; while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + pmd_flags); + ppd.pmd_val = paddr + pmd_flags; + ppd.vaddr = paddr; + sme_populate_pgd_large(&ppd); paddr += PMD_PAGE_SIZE; } @@ -736,9 +739,9 @@ void __init sme_encrypt_kernel(void) pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); paddr = kernel_start; while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + pmd_flags); + ppd.pmd_val = paddr + pmd_flags; + ppd.vaddr = paddr + decrypted_base; + sme_populate_pgd_large(&ppd); paddr += PMD_PAGE_SIZE; } @@ -746,30 +749,29 @@ void __init sme_encrypt_kernel(void) /* Add decrypted workarea mappings to both kernel mappings */ paddr = workarea_start; while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); + ppd.pmd_val = paddr + PMD_FLAGS; + ppd.vaddr = paddr; + sme_populate_pgd_large(&ppd); - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + PMD_FLAGS); + ppd.vaddr = paddr + decrypted_base; + sme_populate_pgd_large(&ppd); paddr += PMD_PAGE_SIZE; } /* Perform the encryption */ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)pgd); + kernel_len, workarea_start, (unsigned long)ppd.pgd); /* * At this point we are running encrypted. Remove the mappings for * the decrypted areas - all that is needed for this is to remove * the PGD entry/entries. */ - sme_clear_pgd(pgd, kernel_start + decrypted_base, + sme_clear_pgd(ppd.pgd, kernel_start + decrypted_base, kernel_end + decrypted_base); - sme_clear_pgd(pgd, workarea_start + decrypted_base, + sme_clear_pgd(ppd.pgd, workarea_start + decrypted_base, workarea_end + decrypted_base); /* Flush the TLB - no globals so cr3 is enough */ -- cgit v1.2.3 From 2b5d00b6c2cdd94f6d6a494a6f6c0c0fc7b8e711 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jan 2018 13:26:16 -0600 Subject: x86/mm: Centralize PMD flags in sme_encrypt_kernel() In preparation for encrypting more than just the kernel during early boot processing, centralize the use of the PMD flag settings based on the type of mapping desired. When 4KB aligned encryption is added, this will allow either PTE flags or large page PMD flags to be used without requiring the caller to adjust. Tested-by: Gabriel Craciunescu Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brijesh Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180110192615.6026.14767.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 133 +++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 56 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 5a20696c5440..35f38caa1fa3 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -468,31 +468,39 @@ struct sme_populate_pgd_data { void *pgtable_area; pgd_t *pgd; - pmdval_t pmd_val; + pmdval_t pmd_flags; + unsigned long paddr; + unsigned long vaddr; + unsigned long vaddr_end; }; -static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, - unsigned long end) +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; - pgd_start = start & PGDIR_MASK; - pgd_end = end & PGDIR_MASK; + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); - pgd_size *= sizeof(pgd_t); + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - pgd_p = pgd_base + pgd_index(start); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); memset(pgd_p, 0, pgd_size); } -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) { @@ -561,7 +569,35 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) pmd_p += pmd_index(ppd->vaddr); if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(ppd->pmd_val)); + native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags) +{ + ppd->pmd_flags = pmd_flags; + + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP); } static unsigned long __init sme_pgtable_calc(unsigned long len) @@ -621,7 +657,6 @@ void __init sme_encrypt_kernel(void) unsigned long kernel_start, kernel_end, kernel_len; struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; - unsigned long paddr, pmd_flags; unsigned long decrypted_base; if (!sme_active()) @@ -693,14 +728,10 @@ void __init sme_encrypt_kernel(void) * addressing the workarea. */ ppd.pgd = (pgd_t *)native_read_cr3_pa(); - paddr = workarea_start; - while (paddr < workarea_end) { - ppd.pmd_val = paddr + PMD_FLAGS; - ppd.vaddr = paddr; - sme_populate_pgd_large(&ppd); - - paddr += PMD_PAGE_SIZE; - } + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); @@ -715,17 +746,6 @@ void __init sme_encrypt_kernel(void) memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; - /* Add encrypted kernel (identity) mappings */ - pmd_flags = PMD_FLAGS | _PAGE_ENC; - paddr = kernel_start; - while (paddr < kernel_end) { - ppd.pmd_val = paddr + pmd_flags; - ppd.vaddr = paddr; - sme_populate_pgd_large(&ppd); - - paddr += PMD_PAGE_SIZE; - } - /* * A different PGD index/entry must be used to get different * pagetable entries for the decrypted mapping. Choose the next @@ -735,29 +755,28 @@ void __init sme_encrypt_kernel(void) decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); decrypted_base <<= PGDIR_SHIFT; + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + /* Add decrypted, write-protected kernel (non-identity) mappings */ - pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); - paddr = kernel_start; - while (paddr < kernel_end) { - ppd.pmd_val = paddr + pmd_flags; - ppd.vaddr = paddr + decrypted_base; - sme_populate_pgd_large(&ppd); - - paddr += PMD_PAGE_SIZE; - } + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); /* Add decrypted workarea mappings to both kernel mappings */ - paddr = workarea_start; - while (paddr < workarea_end) { - ppd.pmd_val = paddr + PMD_FLAGS; - ppd.vaddr = paddr; - sme_populate_pgd_large(&ppd); - - ppd.vaddr = paddr + decrypted_base; - sme_populate_pgd_large(&ppd); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); - paddr += PMD_PAGE_SIZE; - } + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); /* Perform the encryption */ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, @@ -768,11 +787,13 @@ void __init sme_encrypt_kernel(void) * the decrypted areas - all that is needed for this is to remove * the PGD entry/entries. */ - sme_clear_pgd(ppd.pgd, kernel_start + decrypted_base, - kernel_end + decrypted_base); + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); - sme_clear_pgd(ppd.pgd, workarea_start + decrypted_base, - workarea_end + decrypted_base); + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); -- cgit v1.2.3 From cc5f01e28d6c60f274fd1e33b245f679f79f543c Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jan 2018 13:26:26 -0600 Subject: x86/mm: Prepare sme_encrypt_kernel() for PAGE aligned encryption In preparation for encrypting more than just the kernel, the encryption support in sme_encrypt_kernel() needs to support 4KB page aligned encryption instead of just 2MB large page aligned encryption. Update the routines that populate the PGD to support non-2MB aligned addresses. This is done by creating PTE page tables for the start and end portion of the address range that fall outside of the 2MB alignment. This results in, at most, two extra pages to hold the PTE entries for each mapping of a range. Tested-by: Gabriel Craciunescu Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brijesh Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180110192626.6026.75387.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/mm/mem_encrypt.c | 123 +++++++++++++++++++++++++++++++++++------ arch/x86/mm/mem_encrypt_boot.S | 20 +++++-- 2 files changed, 121 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 35f38caa1fa3..e74a1722d438 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -469,6 +469,7 @@ struct sme_populate_pgd_data { pgd_t *pgd; pmdval_t pmd_flags; + pteval_t pte_flags; unsigned long paddr; unsigned long vaddr; @@ -493,6 +494,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) #define PGD_FLAGS _KERNPG_TABLE_NOENC #define P4D_FLAGS _KERNPG_TABLE_NOENC #define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC #define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) @@ -502,7 +504,15 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) #define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) -static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd_p; p4d_t *p4d_p; @@ -553,7 +563,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) pud_p += pud_index(ppd->vaddr); if (native_pud_val(*pud_p)) { if (native_pud_val(*pud_p) & _PAGE_PSE) - return; + return NULL; pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); } else { @@ -567,16 +577,55 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) native_set_pud(pud_p, pud); } + return pmd_p; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + pmd_p += pmd_index(ppd->vaddr); if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); } -static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, - pmdval_t pmd_flags) +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) { - ppd->pmd_flags = pmd_flags; + pmd_t *pmd_p; + pte_t *pte_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); + if (native_pmd_val(*pmd_p)) { + if (native_pmd_val(*pmd_p) & _PAGE_PSE) + return; + + pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); + } else { + pmd_t pmd; + pte_p = ppd->pgtable_area; + memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; + + pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); + native_set_pmd(pmd_p, pmd); + } + + pte_p += pte_index(ppd->vaddr); + if (!native_pte_val(*pte_p)) + native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ while (ppd->vaddr < ppd->vaddr_end) { sme_populate_pgd_large(ppd); @@ -585,33 +634,71 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, } } +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) { - __sme_map_range(ppd, PMD_FLAGS_ENC); + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); } static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { - __sme_map_range(ppd, PMD_FLAGS_DEC); + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); } static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) { - __sme_map_range(ppd, PMD_FLAGS_DEC_WP); + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } static unsigned long __init sme_pgtable_calc(unsigned long len) { - unsigned long p4d_size, pud_size, pmd_size; + unsigned long p4d_size, pud_size, pmd_size, pte_size; unsigned long total; /* * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. That mappings will be covered by 2MB - * PMD entries so we can conservatively calculate the required + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required * number of P4D, PUD and PMD structures needed to perform the - * mappings. Incrementing the count for each covers the case where - * the addresses cross entries. + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. */ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; @@ -625,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) } pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - total = p4d_size + pud_size + pmd_size; + total = p4d_size + pud_size + pmd_size + pte_size; /* * Now calculate the added pagetable structures needed to populate @@ -709,10 +797,13 @@ void __init sme_encrypt_kernel(void) /* * The total workarea includes the executable encryption area and - * the pagetable area. + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. */ workarea_len = execute_len + pgtable_area_len; - workarea_end = workarea_start + workarea_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); /* * Set the address to the start of where newly created pagetable diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index de3688461145..23a8a9e411ea 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -104,6 +104,7 @@ ENTRY(__enc_copy) mov %rdx, %cr4 push %r15 + push %r12 movq %rcx, %r9 /* Save kernel length */ movq %rdi, %r10 /* Save encrypted kernel address */ @@ -119,21 +120,27 @@ ENTRY(__enc_copy) wbinvd /* Invalidate any cache entries */ - /* Copy/encrypt 2MB at a time */ + /* Copy/encrypt up to 2MB at a time */ + movq $PMD_PAGE_SIZE, %r12 1: + cmpq %r12, %r9 + jnb 2f + movq %r9, %r12 + +2: movq %r11, %rsi /* Source - decrypted kernel */ movq %r8, %rdi /* Dest - intermediate copy buffer */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r12, %rcx rep movsb movq %r8, %rsi /* Source - intermediate copy buffer */ movq %r10, %rdi /* Dest - encrypted kernel */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r12, %rcx rep movsb - addq $PMD_PAGE_SIZE, %r11 - addq $PMD_PAGE_SIZE, %r10 - subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + addq %r12, %r11 + addq %r12, %r10 + subq %r12, %r9 /* Kernel length decrement */ jnz 1b /* Kernel length not zero? */ /* Restore PAT register */ @@ -142,6 +149,7 @@ ENTRY(__enc_copy) mov %r15, %rdx /* Restore original PAT value */ wrmsr + pop %r12 pop %r15 ret -- cgit v1.2.3 From 107cd2532181b96c549e8f224cdcca8631c3076b Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 10 Jan 2018 13:26:34 -0600 Subject: x86/mm: Encrypt the initrd earlier for BSP microcode update Currently the BSP microcode update code examines the initrd very early in the boot process. If SME is active, the initrd is treated as being encrypted but it has not been encrypted (in place) yet. Update the early boot code that encrypts the kernel to also encrypt the initrd so that early BSP microcode updates work. Tested-by: Gabriel Craciunescu Signed-off-by: Tom Lendacky Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brijesh Singh Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180110192634.6026.10452.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 4 +-- arch/x86/kernel/head64.c | 4 +-- arch/x86/kernel/setup.c | 10 ------ arch/x86/mm/mem_encrypt.c | 66 +++++++++++++++++++++++++++++++++----- arch/x86/mm/mem_encrypt_boot.S | 46 +++++++++++++------------- 5 files changed, 85 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c9459a4c3c68..22c5f3e6f820 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -39,7 +39,7 @@ void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sme_encrypt_kernel(void); +void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); @@ -67,7 +67,7 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } static inline bool sme_active(void) { return false; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a5d757b9cfd..7ba5d819ebe3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr, p = fixup_pointer(&phys_base, physaddr); *p += load_delta - sme_get_me_mask(); - /* Encrypt the kernel (if SME is active) */ - sme_encrypt_kernel(); + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); /* * Return the SME encryption mask (if SME is active) to be used as a diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 145810b0edf6..68d7ab81c62f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -364,16 +364,6 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - /* - * If SME is active, this memory will be marked encrypted by the - * kernel when it is accessed (including relocation). However, the - * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. For SEV the - * ramdisk will already be encrypted, so only do this for SME. - */ - if (sme_active()) - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); - initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index e74a1722d438..3ef362f598e3 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -738,11 +738,12 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return total; } -void __init sme_encrypt_kernel(void) +void __init sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; unsigned long decrypted_base; @@ -751,14 +752,15 @@ void __init sme_encrypt_kernel(void) return; /* - * Prepare for encrypting the kernel by building new pagetables with - * the necessary attributes needed to encrypt the kernel in place. + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. * * One range of virtual addresses will map the memory occupied - * by the kernel as encrypted. + * by the kernel and initrd as encrypted. * * Another range of virtual addresses will map the memory occupied - * by the kernel as decrypted and write-protected. + * by the kernel and initrd as decrypted and write-protected. * * The use of write-protect attribute will prevent any of the * memory from being cached. @@ -769,6 +771,20 @@ void __init sme_encrypt_kernel(void) kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); kernel_len = kernel_end - kernel_start; + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + /* Set the encryption workarea to be immediately after the kernel */ workarea_start = kernel_end; @@ -791,6 +807,8 @@ void __init sme_encrypt_kernel(void) */ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; /* PUDs and PMDs needed in the current pagetables for the workarea */ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); @@ -829,9 +847,9 @@ void __init sme_encrypt_kernel(void) /* * A new pagetable structure is being built to allow for the kernel - * to be encrypted. It starts with an empty PGD that will then be - * populated with new PUDs and PMDs as the encrypted and decrypted - * kernel mappings are created. + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. */ ppd.pgd = ppd.pgtable_area; memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); @@ -844,6 +862,12 @@ void __init sme_encrypt_kernel(void) * the base of the mapping. */ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } decrypted_base <<= PGDIR_SHIFT; /* Add encrypted kernel (identity) mappings */ @@ -858,6 +882,21 @@ void __init sme_encrypt_kernel(void) ppd.vaddr_end = kernel_end + decrypted_base; sme_map_range_decrypted_wp(&ppd); + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + /* Add decrypted workarea mappings to both kernel mappings */ ppd.paddr = workarea_start; ppd.vaddr = workarea_start; @@ -873,6 +912,11 @@ void __init sme_encrypt_kernel(void) sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, kernel_len, workarea_start, (unsigned long)ppd.pgd); + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + /* * At this point we are running encrypted. Remove the mappings for * the decrypted areas - all that is needed for this is to remove @@ -882,6 +926,12 @@ void __init sme_encrypt_kernel(void) ppd.vaddr_end = kernel_end + decrypted_base; sme_clear_pgd(&ppd); + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + ppd.vaddr = workarea_start + decrypted_base; ppd.vaddr_end = workarea_end + decrypted_base; sme_clear_pgd(&ppd); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 23a8a9e411ea..01f682cf77a8 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute) /* * Entry parameters: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping - * RDX - length of kernel + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - length to encrypt * RCX - virtual address of the encryption workarea, including: * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) @@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute) addq $PAGE_SIZE, %rax /* Workarea encryption routine */ push %r12 - movq %rdi, %r10 /* Encrypted kernel */ - movq %rsi, %r11 /* Decrypted kernel */ - movq %rdx, %r12 /* Kernel length */ + movq %rdi, %r10 /* Encrypted area */ + movq %rsi, %r11 /* Decrypted area */ + movq %rdx, %r12 /* Area length */ /* Copy encryption routine into the workarea */ movq %rax, %rdi /* Workarea encryption routine */ @@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute) rep movsb /* Setup registers for call */ - movq %r10, %rdi /* Encrypted kernel */ - movq %r11, %rsi /* Decrypted kernel */ + movq %r10, %rdi /* Encrypted area */ + movq %r11, %rsi /* Decrypted area */ movq %r8, %rdx /* Pagetables used for encryption */ - movq %r12, %rcx /* Kernel length */ + movq %r12, %rcx /* Area length */ movq %rax, %r8 /* Workarea encryption routine */ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ @@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute) ENTRY(__enc_copy) /* - * Routine used to encrypt kernel. + * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since * the kernel will be encrypted during the process. So this * routine is defined here and then copied to an area outside @@ -79,19 +79,19 @@ ENTRY(__enc_copy) * during execution. * * On entry the registers must be: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping * RDX - address of the pagetables to use for encryption - * RCX - length of kernel + * RCX - length of area * R8 - intermediate copy buffer * * RAX - points to this routine * - * The kernel will be encrypted by copying from the non-encrypted - * kernel space to an intermediate buffer and then copying from the - * intermediate buffer back to the encrypted kernel space. The physical - * addresses of the two kernel space mappings are the same which - * results in the kernel being encrypted "in place". + * The area will be encrypted by copying from the non-encrypted + * memory space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted memory space. The physical + * addresses of the two mappings are the same which results in the area + * being encrypted "in place". */ /* Enable the new page tables */ mov %rdx, %cr3 @@ -106,9 +106,9 @@ ENTRY(__enc_copy) push %r15 push %r12 - movq %rcx, %r9 /* Save kernel length */ - movq %rdi, %r10 /* Save encrypted kernel address */ - movq %rsi, %r11 /* Save decrypted kernel address */ + movq %rcx, %r9 /* Save area length */ + movq %rdi, %r10 /* Save encrypted area address */ + movq %rsi, %r11 /* Save decrypted area address */ /* Set the PAT register PA5 entry to write-protect */ movl $MSR_IA32_CR_PAT, %ecx @@ -128,13 +128,13 @@ ENTRY(__enc_copy) movq %r9, %r12 2: - movq %r11, %rsi /* Source - decrypted kernel */ + movq %r11, %rsi /* Source - decrypted area */ movq %r8, %rdi /* Dest - intermediate copy buffer */ movq %r12, %rcx rep movsb movq %r8, %rsi /* Source - intermediate copy buffer */ - movq %r10, %rdi /* Dest - encrypted kernel */ + movq %r10, %rdi /* Dest - encrypted area */ movq %r12, %rcx rep movsb -- cgit v1.2.3 From 673aa20c55a138621d1340d343cd6b07c1cb4e92 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:39:59 -0600 Subject: x86/platform/UV: Update uv_mmrs.h to prepare for UV4A fixes Regenerate uv_mmrs.h file to accommodate fixes to UV4A MMRs. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-2-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_mmrs.h | 615 +++++++++++++++++++++++++++++++++----- 1 file changed, 533 insertions(+), 82 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 548d684a7960..f113e278ffff 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -3031,6 +3031,41 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long v; @@ -3042,6 +3077,46 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; + struct uv1h_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s1; + struct uvxh_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } sx; + struct uv2h_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s2; + struct uv3h_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s3; + struct uv4h_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -3064,6 +3139,41 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long v; @@ -3075,6 +3185,46 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; + struct uv1h_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s1; + struct uvxh_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } sx; + struct uv2h_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s2; + struct uv3h_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s3; + struct uv4h_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -3097,6 +3247,41 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long v; @@ -3108,6 +3293,46 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; + struct uv1h_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s1; + struct uvxh_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } sx; + struct uv2h_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s2; + struct uv3h_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s3; + struct uv4h_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -3126,6 +3351,21 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long v; @@ -3134,6 +3374,31 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; + struct uv1h_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s1; + struct uvxh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } sx; + struct uv2h_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s2; + struct uv3h_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s3; + struct uv4h_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s4; }; /* ========================================================================= */ @@ -3152,6 +3417,21 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long v; @@ -3160,6 +3440,31 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; + struct uv1h_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s1; + struct uvxh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } sx; + struct uv2h_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s2; + struct uv3h_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s3; + struct uv4h_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s4; }; /* ========================================================================= */ @@ -3178,6 +3483,21 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long v; @@ -3186,6 +3506,31 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; + struct uv1h_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s1; + struct uvxh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } sx; + struct uv2h_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s2; + struct uv3h_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s3; + struct uv4h_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s4; }; /* ========================================================================= */ @@ -3383,6 +3728,106 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { } s4; }; +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR") +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR") +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x483000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR) + + + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL + + +union uvh_rh_gam_mmioh_overlay_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; + struct uv4h_rh_gam_mmioh_overlay_config0_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR) + + + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL + + +union uvh_rh_gam_mmioh_overlay_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; + struct uv4h_rh_gam_mmioh_overlay_config1_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; +}; + /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ @@ -3437,6 +3882,94 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { } s2; }; +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR") +#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR") +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x483800UL +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR) + +#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH") +#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH") +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH) + + + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL + +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL + + +union uvh_rh_gam_mmioh_redirect_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; + struct uv4h_rh_gam_mmioh_redirect_config0_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s4; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR") +#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR") +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x484800UL +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR) + +#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH") +#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH") +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH) + + + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL + +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 +#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL + + +union uvh_rh_gam_mmioh_redirect_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; + struct uv4h_rh_gam_mmioh_redirect_config1_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s4; +}; + /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ @@ -4137,88 +4670,6 @@ union uv3h_gr0_gam_gr_config_u { } s3; }; -/* ========================================================================= */ -/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ -/* ========================================================================= */ -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL - -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL - -union uv3h_rh_gam_mmioh_overlay_config0_mmr_u { - unsigned long v; - struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s { - unsigned long rsvd_0_25:26; - unsigned long base:20; /* RW */ - unsigned long m_io:6; /* RW */ - unsigned long n_io:4; - unsigned long rsvd_56_62:7; - unsigned long enable:1; /* RW */ - } s3; -}; - -/* ========================================================================= */ -/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ -/* ========================================================================= */ -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL - -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL - -union uv3h_rh_gam_mmioh_overlay_config1_mmr_u { - unsigned long v; - struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s { - unsigned long rsvd_0_25:26; - unsigned long base:20; /* RW */ - unsigned long m_io:6; /* RW */ - unsigned long n_io:4; - unsigned long rsvd_56_62:7; - unsigned long enable:1; /* RW */ - } s3; -}; - -/* ========================================================================= */ -/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ -/* ========================================================================= */ -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 - -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL - -union uv3h_rh_gam_mmioh_redirect_config0_mmr_u { - unsigned long v; - struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s { - unsigned long nasid:15; /* RW */ - unsigned long rsvd_15_63:49; - } s3; -}; - -/* ========================================================================= */ -/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ -/* ========================================================================= */ -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 - -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 -#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL - -union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { - unsigned long v; - struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s { - unsigned long nasid:15; /* RW */ - unsigned long rsvd_15_63:49; - } s3; -}; - /* ========================================================================= */ /* UV4H_LB_PROC_INTD_QUEUE_FIRST */ /* ========================================================================= */ -- cgit v1.2.3 From 62807106c3219d2d6ddbfc778a5ee7e6ba38e58f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:40:00 -0600 Subject: x86/platform/UV: Fix UV4A support on new Intel Processors Upcoming Intel CascadeLake and IceLake processors have some architecture changes that required fixes in the UV4 HUB bringing that chip to revision 2. The nomenclature for that new chip is "UV4A". This patch fixes the references for the expanded MMR definitions in the previous (automated) patch. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-3-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6de35fc8fb3a..ebb7d264bcac 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -768,6 +768,7 @@ static __init void map_gru_high(int max_pnode) return; } + /* Only UV3 has distributed GRU mode */ if (is_uv3_hub() && gru.s3.mode) { map_gru_distributed(gru.v); return; @@ -817,17 +818,20 @@ static __initdata struct mmioh_config mmiohs[] = { /* UV3 & UV4 have identical MMIOH overlay configs */ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) { - union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; + union uvh_rh_gam_mmioh_overlay_config0_mmr_u overlay; unsigned long mmr; unsigned long base; + unsigned long m_overlay; int i, n, shift, m_io, max_io; int nasid, lnasid, fi, li; char *id; id = mmiohs[index].id; overlay.v = uv_read_local_mmr(mmiohs[index].overlay); + m_overlay = mmiohs[index].overlay; - pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); + pr_info("UV: %s overlay 0x%lx(@0x%lx) base:0x%x m_io:%d\n", + id, overlay.v, m_overlay, overlay.s3.base, overlay.s3.m_io); if (!overlay.s3.enable) { pr_info("UV: %s disabled\n", id); return; @@ -844,10 +848,14 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { - union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; + union uvh_rh_gam_mmioh_redirect_config0_mmr_u redirect; + unsigned long m_redirect = mmr + i * 8; redirect.v = uv_read_local_mmr(mmr + i * 8); nasid = redirect.s3.nasid; + printk_once(KERN_INFO + "UV: %s redirect 0x%lx(@0x%lx) 0x%04x\n", + id, redirect.v, m_redirect, nasid); /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) nasid = -1; -- cgit v1.2.3 From 8078d1951da228e20dc36f83306845a565f51345 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:40:01 -0600 Subject: x86/platform/UV: Add references to access fixed UV4A HUB MMRs Add references to enable access to fixed UV4A (rev2) HUB MMRs. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-4-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 14 ++++++++++++++ arch/x86/include/asm/uv/uv_mmrs.h | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 2 ++ 3 files changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 036e26d63d9a..44cf6d6deb7a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -241,6 +241,7 @@ static inline int uv_hub_info_check(int version) #define UV2_HUB_REVISION_BASE 3 #define UV3_HUB_REVISION_BASE 5 #define UV4_HUB_REVISION_BASE 7 +#define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ #ifdef UV1_HUB_IS_SUPPORTED static inline int is_uv1_hub(void) @@ -280,6 +281,19 @@ static inline int is_uv3_hub(void) } #endif +/* First test "is UV4A", then "is UV4" */ +#ifdef UV4A_HUB_IS_SUPPORTED +static inline int is_uv4a_hub(void) +{ + return (uv_hub_info->hub_revision >= UV4A_HUB_REVISION_BASE); +} +#else +static inline int is_uv4a_hub(void) +{ + return 0; +} +#endif + #ifdef UV4_HUB_IS_SUPPORTED static inline int is_uv4_hub(void) { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index f113e278ffff..b3afccc2b92e 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -99,6 +99,7 @@ #define UV2_HUB_IS_SUPPORTED 1 #define UV3_HUB_IS_SUPPORTED 1 #define UV4_HUB_IS_SUPPORTED 1 +#define UV4A_HUB_IS_SUPPORTED 1 /* Error function to catch undefined references */ extern unsigned long uv_undefined(char *str); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ebb7d264bcac..2ddc140c23fe 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -137,6 +137,8 @@ static int __init early_get_pnodeid(void) case UV3_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; + + /* Update: UV4A has only a modified revision to indicate HUB fixes */ case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ -- cgit v1.2.3 From ecce47e0bde6faa3256740280754bfd06a1a4efa Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:40:02 -0600 Subject: x86/platform/UV: Fix GAM MMR changes in UV4A Intel processor changes necessitated UV4 HUB Global Address Memory (GAM) fixes to accommodate support for those processors. This patch deals with the updated address range change from 46 to 52 bits in UV4A. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-5-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_mmrs.h | 86 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index b3afccc2b92e..30db549885e2 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -3743,7 +3743,6 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR) - #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 @@ -3758,6 +3757,30 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 52 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x000ffffffc000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x03f0000000000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT) + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK) + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK) union uvh_rh_gam_mmioh_overlay_config0_mmr_u { unsigned long v; @@ -3777,6 +3800,14 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s4; + struct uv4ah_rh_gam_mmioh_overlay_config0_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } s4a; }; /* ========================================================================= */ @@ -3784,8 +3815,8 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { /* ========================================================================= */ #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") -#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL -#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1603000UL +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x483000UL #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ @@ -3793,7 +3824,6 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR) - #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 @@ -3808,6 +3838,24 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 52 +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x000ffffffc000000UL +#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x03f0000000000000UL + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT) + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK) + +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) union uvh_rh_gam_mmioh_overlay_config1_mmr_u { unsigned long v; @@ -3827,6 +3875,14 @@ union uvh_rh_gam_mmioh_overlay_config1_mmr_u { unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s4; + struct uv4ah_rh_gam_mmioh_overlay_config1_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:26; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long undef_62:1; /* Undefined */ + unsigned long enable:1; /* RW */ + } s4a; }; /* ========================================================================= */ @@ -3907,13 +3963,18 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH) - #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000000fffUL + +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK) union uvh_rh_gam_mmioh_redirect_config0_mmr_u { unsigned long v; @@ -3925,6 +3986,10 @@ union uvh_rh_gam_mmioh_redirect_config0_mmr_u { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; } s4; + struct uv4ah_rh_gam_mmioh_redirect_config0_mmr_s { + unsigned long nasid:12; /* RW */ + unsigned long rsvd_12_63:52; + } s4a; }; /* ========================================================================= */ @@ -3951,13 +4016,18 @@ union uvh_rh_gam_mmioh_redirect_config0_mmr_u { /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH) - #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000000fffUL + +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK ( \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \ + is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK) union uvh_rh_gam_mmioh_redirect_config1_mmr_u { unsigned long v; @@ -3969,6 +4039,10 @@ union uvh_rh_gam_mmioh_redirect_config1_mmr_u { unsigned long nasid:15; /* RW */ unsigned long rsvd_15_63:49; } s4; + struct uv4ah_rh_gam_mmioh_redirect_config1_mmr_s { + unsigned long nasid:12; /* RW */ + unsigned long rsvd_12_63:52; + } s4a; }; /* ========================================================================= */ -- cgit v1.2.3 From 09c3ae12b2bf6dc2837d89c1017bf151af610a1f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:40:03 -0600 Subject: x86/platform/UV: Fix GAM MMR references in the UV x2apic code Along with the fixes in UV4A (rev2) MMRs, the code to access those MMRs also was modified by the fixes. UV3, UV4, and UV4A no longer have compatible setups for Global Address Memory (GAM). Correct the new mistakes. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-6-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 83 +++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2ddc140c23fe..46b675aaf20b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -794,70 +794,61 @@ static __init void map_mmr_high(int max_pnode) pr_info("UV: MMR disabled\n"); } -/* - * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY - * and REDIRECT MMR regs are exactly the same on UV3. - */ -struct mmioh_config { - unsigned long overlay; - unsigned long redirect; - char *id; -}; - -static __initdata struct mmioh_config mmiohs[] = { - { - UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, - UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, - "MMIOH0" - }, - { - UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, - UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, - "MMIOH1" - }, -}; - -/* UV3 & UV4 have identical MMIOH overlay configs */ -static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ +static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) { - union uvh_rh_gam_mmioh_overlay_config0_mmr_u overlay; + unsigned long overlay; unsigned long mmr; unsigned long base; + unsigned long nasid_mask; unsigned long m_overlay; int i, n, shift, m_io, max_io; int nasid, lnasid, fi, li; char *id; - id = mmiohs[index].id; - overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - m_overlay = mmiohs[index].overlay; - - pr_info("UV: %s overlay 0x%lx(@0x%lx) base:0x%x m_io:%d\n", - id, overlay.v, m_overlay, overlay.s3.base, overlay.s3.m_io); - if (!overlay.s3.enable) { + if (index == 0) { + id = "MMIOH0"; + m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; + overlay = uv_read_local_mmr(m_overlay); + base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; + m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) + >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; + shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; + } else { + id = "MMIOH1"; + m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; + overlay = uv_read_local_mmr(m_overlay); + base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; + m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) + >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; + shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; + } + pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); + if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) { pr_info("UV: %s disabled\n", id); return; } - shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; - base = (unsigned long)overlay.s3.base; - m_io = overlay.s3.m_io; - mmr = mmiohs[index].redirect; - n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; /* Convert to NASID: */ min_pnode *= 2; max_pnode *= 2; max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { - union uvh_rh_gam_mmioh_redirect_config0_mmr_u redirect; unsigned long m_redirect = mmr + i * 8; + unsigned long redirect = uv_read_local_mmr(m_redirect); + + nasid = redirect & nasid_mask; + if (i == 0) + pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", + id, redirect, m_redirect, nasid); - redirect.v = uv_read_local_mmr(mmr + i * 8); - nasid = redirect.s3.nasid; - printk_once(KERN_INFO - "UV: %s redirect 0x%lx(@0x%lx) 0x%04x\n", - id, redirect.v, m_redirect, nasid); /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) nasid = -1; @@ -905,8 +896,8 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) if (is_uv3_hub() || is_uv4_hub()) { /* Map both MMIOH regions: */ - map_mmioh_high_uv3(0, min_pnode, max_pnode); - map_mmioh_high_uv3(1, min_pnode, max_pnode); + map_mmioh_high_uv34(0, min_pnode, max_pnode); + map_mmioh_high_uv34(1, min_pnode, max_pnode); return; } -- cgit v1.2.3 From a631a0a7a3caf6a9924856f3dcfe256e747f7467 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 8 Jan 2018 13:40:04 -0600 Subject: x86/platform/UV: Fix UV4A BAU MMRs Fixes to accommodate Intel Processor changes for UV4A broadcast assist unit (BAU) MMRs. Signed-off-by: Mike Travis Acked-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440405-20880-7-git-send-email-mike.travis@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_mmrs.h | 59 +++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 30db549885e2..ecb9ddef128f 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -39,9 +39,11 @@ * #define UV2Hxxx b * #define UV3Hxxx c * #define UV4Hxxx d + * #define UV4AHxxx e * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : * (is_uv2_hub() ? UV2Hxxx : * (is_uv3_hub() ? UV3Hxxx : + * (is_uv4a_hub() ? UV4AHxxx : * UV4Hxxx)) * * If the MMR exists on all hub types > 1 but have different addresses, the @@ -49,8 +51,10 @@ * #define UV2Hxxx b * #define UV3Hxxx c * #define UV4Hxxx d + * #define UV4AHxxx e * #define UVHxxx (is_uv2_hub() ? UV2Hxxx : * (is_uv3_hub() ? UV3Hxxx : + * (is_uv4a_hub() ? UV4AHxxx : * UV4Hxxx)) * * union uvh_xxx { @@ -63,6 +67,7 @@ * } s2; * struct uv3h_xxx_s { # Full UV3 definition (*) * } s3; + * (NOTE: No struct uv4ah_xxx_s members exist) * struct uv4h_xxx_s { # Full UV4 definition (*) * } s4; * }; @@ -2780,35 +2785,47 @@ union uvh_lb_bau_sb_activation_status_1_u { /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 #define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL - +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 #define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 #define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL - - -union uvh_lb_bau_sb_descriptor_base_u { - unsigned long v; - struct uvh_lb_bau_sb_descriptor_base_s { - unsigned long rsvd_0_11:12; - unsigned long rsvd_12_48:37; - unsigned long node_id:14; /* RW */ - unsigned long rsvd_63:1; - } s; - struct uv4h_lb_bau_sb_descriptor_base_s { - unsigned long rsvd_0_11:12; - unsigned long page_address:34; /* RW */ - unsigned long rsvd_46_48:3; - unsigned long node_id:14; /* RW */ - unsigned long rsvd_63:1; - } s4; -}; +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 53 +#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000ffffffffff000UL +#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0xffe0000000000000UL + +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ + is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT) + +#define UVH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ + is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK) + +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ + is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK) /* ========================================================================= */ /* UVH_NODE_ID */ -- cgit v1.2.3 From 1da2fd61d956a01ead87173a8367e5c664617f7b Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Mon, 8 Jan 2018 13:43:12 -0600 Subject: x86/platform/uv/BAU: Replace hard-coded values with MMR definitions Replaces hard-coded node ID shift for the descriptor base MMR to fix initialization on UV4A while maintaining support for previous architectures. Signed-off-by: Andrew Banman Acked-by: Mike Travis Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1515440592-44060-1-git-send-email-abanman@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/platform/uv/tlb_uv.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 7cac79802ad2..7803114aa140 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -48,7 +48,6 @@ #define UV2_NET_ENDPOINT_INTD 0x28 #define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \ UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD) -#define UV_DESC_PSHIFT 49 #define UV_PAYLOADQ_GNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" #define UV_BAU_BASENAME "sgi_uv/bau_tunables" diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 8538a6723171..c2e9285d1bf1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1751,7 +1751,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) uv1 = 1; /* the 14-bit pnode */ - write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); + write_mmr_descriptor_base(pnode, + (n << UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT | m)); /* * Initializing all 8 (ITEMS_PER_DESC) descriptors for each * cpu even though we only use the first one; one descriptor can -- cgit v1.2.3 From 955999c9023290da18230b57df1f04187a43a4c0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 27 Nov 2017 09:02:22 +0100 Subject: m68k/defconfig: Update defconfigs for v4.15-rc1 Signed-off-by: Geert Uytterhoeven --- arch/m68k/configs/amiga_defconfig | 4 +++- arch/m68k/configs/apollo_defconfig | 4 +++- arch/m68k/configs/atari_defconfig | 4 +++- arch/m68k/configs/bvme6000_defconfig | 4 +++- arch/m68k/configs/hp300_defconfig | 4 +++- arch/m68k/configs/mac_defconfig | 4 +++- arch/m68k/configs/multi_defconfig | 4 +++- arch/m68k/configs/mvme147_defconfig | 4 +++- arch/m68k/configs/mvme16x_defconfig | 4 +++- arch/m68k/configs/q40_defconfig | 4 +++- arch/m68k/configs/sun3_defconfig | 4 +++- arch/m68k/configs/sun3x_defconfig | 4 +++- 12 files changed, 36 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 5b5fa9831b4d..e0b285e1e75f 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -454,7 +454,6 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_CIRRUS=y CONFIG_FB_AMIGA=y @@ -595,6 +594,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -624,6 +624,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -653,3 +654,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 72a7764b74ed..3281026a3e15 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -422,7 +422,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -554,6 +553,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -583,6 +583,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -612,3 +613,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 884b43a2f0d9..e943fad480cf 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -437,7 +437,6 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_ATARI=y CONFIG_FRAMEBUFFER_CONSOLE=y @@ -576,6 +575,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -605,6 +605,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -634,3 +635,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index fcfa60d31499..700c2310c336 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -420,7 +420,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -546,6 +545,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -575,6 +575,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -604,3 +605,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 9d597bbbbbfe..271d57fa4301 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -425,7 +425,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -556,6 +555,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -585,6 +585,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -614,3 +615,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 45da20d1286c..88761b867975 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -447,7 +447,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_VALKYRIE=y CONFIG_FB_MAC=y @@ -578,6 +577,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -607,6 +607,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -636,3 +637,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index fda880c10861..7cb35dadf03b 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -504,7 +504,6 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FB_CIRRUS=y CONFIG_FB_AMIGA=y @@ -658,6 +657,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -687,6 +687,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -716,3 +717,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 7d5e4863efec..b139d7b68393 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -420,7 +420,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -546,6 +545,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -575,6 +575,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -604,3 +605,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 7763b71a9c49..398346138769 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -420,7 +420,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_HID=m CONFIG_HIDRAW=y CONFIG_UHID=m @@ -546,6 +545,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -575,6 +575,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -604,3 +605,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 17eaebfa3e19..14c608326f6d 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -437,7 +437,6 @@ CONFIG_PPS_CLIENT_LDISC=m CONFIG_PPS_CLIENT_PARPORT=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -569,6 +568,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -598,6 +598,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -627,3 +628,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index d1cb7a04ae1d..97dec0bf52f1 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -419,7 +419,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -548,6 +547,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -576,6 +576,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -605,3 +606,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index ea3a331c62d5..56df28d6d91d 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -419,7 +419,6 @@ CONFIG_NTP_PPS=y CONFIG_PPS_CLIENT_LDISC=m CONFIG_PTP_1588_CLOCK=m # CONFIG_HWMON is not set -# CONFIG_RC_CORE is not set CONFIG_FB=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y @@ -548,6 +547,7 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m +CONFIG_TEST_FIND_BIT=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m CONFIG_TEST_UDELAY=m @@ -577,6 +577,7 @@ CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -606,3 +607,4 @@ CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set CONFIG_CRC32_SELFTEST=m CONFIG_XZ_DEC_TEST=m +CONFIG_STRING_SELFTEST=m -- cgit v1.2.3 From b87eaec27eca3def6c8ed617e3b1bac08d7bc715 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sat, 13 Jan 2018 17:37:13 -0500 Subject: nubus: Add expansion_type values for various Mac models Add an expansion slot attribute to allow drivers to properly handle cards like Comm Slot cards and PDS cards without declaration ROMs. This clarifies the logic for the Centris 610 model which has no Comm Slot but has an optional on-board SONIC device. Cc: "David S. Miller" Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/macintosh.h | 9 ++- arch/m68k/mac/config.c | 110 +++++++++++++------------------- drivers/net/ethernet/natsemi/macsonic.c | 8 +-- 3 files changed, 54 insertions(+), 73 deletions(-) (limited to 'arch') diff --git a/arch/m68k/include/asm/macintosh.h b/arch/m68k/include/asm/macintosh.h index f42c27400dbc..9b840c03ebb7 100644 --- a/arch/m68k/include/asm/macintosh.h +++ b/arch/m68k/include/asm/macintosh.h @@ -33,7 +33,7 @@ struct mac_model char ide_type; char scc_type; char ether_type; - char nubus_type; + char expansion_type; char floppy_type; }; @@ -73,8 +73,11 @@ struct mac_model #define MAC_ETHER_SONIC 1 #define MAC_ETHER_MACE 2 -#define MAC_NO_NUBUS 0 -#define MAC_NUBUS 1 +#define MAC_EXP_NONE 0 +#define MAC_EXP_PDS 1 /* Accepts only a PDS card */ +#define MAC_EXP_NUBUS 2 /* Accepts only NuBus card(s) */ +#define MAC_EXP_PDS_NUBUS 3 /* Accepts PDS card and/or NuBus card(s) */ +#define MAC_EXP_PDS_COMM 4 /* Accepts PDS card or Comm Slot card */ #define MAC_FLOPPY_IWM 0 #define MAC_FLOPPY_SWIM_ADDR1 1 diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index 16cd5cea5207..d3d435248a24 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -212,7 +212,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_II, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_IWM, }, @@ -227,7 +227,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_II, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_IWM, }, { .ident = MAC_MODEL_IIX, @@ -236,7 +236,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_II, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_IICX, @@ -245,7 +245,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_II, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_SE30, @@ -254,7 +254,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_II, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -272,7 +272,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_IIFX, @@ -281,7 +281,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_IIFX, .scc_type = MAC_SCC_IOP, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_IOP, }, { .ident = MAC_MODEL_IISI, @@ -290,7 +290,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_IIVI, @@ -299,7 +299,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_IIVX, @@ -308,7 +308,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -323,7 +323,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_CCL, @@ -332,7 +331,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_CCLII, @@ -341,7 +340,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -356,7 +355,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_LCII, @@ -365,7 +364,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_LCIII, @@ -374,7 +373,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -395,7 +394,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q605_ACC, @@ -404,7 +403,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q610, @@ -414,7 +413,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q630, @@ -424,8 +423,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .ide_type = MAC_IDE_QUADRA, .scc_type = MAC_SCC_QUADRA, - .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_COMM, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q650, @@ -435,7 +433,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, /* The Q700 does have a NS Sonic */ @@ -447,7 +445,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA2, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q800, @@ -457,7 +455,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_Q840, @@ -467,7 +465,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA3, .scc_type = MAC_SCC_PSC, .ether_type = MAC_ETHER_MACE, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_AV, }, { .ident = MAC_MODEL_Q900, @@ -477,7 +475,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA2, .scc_type = MAC_SCC_IOP, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_IOP, }, { .ident = MAC_MODEL_Q950, @@ -487,7 +485,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA2, .scc_type = MAC_SCC_IOP, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_IOP, }, @@ -502,7 +500,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_P475, @@ -511,7 +509,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_P475F, @@ -520,7 +518,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_P520, @@ -529,7 +527,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_P550, @@ -538,7 +536,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, /* These have the comm slot, and therefore possibly SONIC ethernet */ @@ -549,8 +547,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_II, - .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_COMM, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_P588, @@ -560,8 +557,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .ide_type = MAC_IDE_QUADRA, .scc_type = MAC_SCC_II, - .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_COMM, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_TV, @@ -570,7 +566,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_P600, @@ -579,7 +574,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -596,7 +591,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_C650, @@ -606,7 +601,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR1, }, { .ident = MAC_MODEL_C660, @@ -616,7 +611,7 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_QUADRA3, .scc_type = MAC_SCC_PSC, .ether_type = MAC_ETHER_MACE, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_PDS_NUBUS, .floppy_type = MAC_FLOPPY_AV, }, @@ -633,7 +628,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB145, @@ -642,7 +636,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB150, @@ -652,7 +645,6 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_OLD, .ide_type = MAC_IDE_PB, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB160, @@ -661,7 +653,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB165, @@ -670,7 +661,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB165C, @@ -679,7 +669,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB170, @@ -688,7 +677,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB180, @@ -697,7 +685,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB180C, @@ -706,7 +693,6 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_QUADRA, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB190, @@ -716,7 +702,6 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_LATE, .ide_type = MAC_IDE_BABOON, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB520, @@ -726,7 +711,6 @@ static struct mac_model mac_data_table[] = { .scsi_type = MAC_SCSI_LATE, .scc_type = MAC_SCC_QUADRA, .ether_type = MAC_ETHER_SONIC, - .nubus_type = MAC_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -743,7 +727,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB230, @@ -752,7 +736,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB250, @@ -761,7 +745,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB270C, @@ -770,7 +754,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB280, @@ -779,7 +763,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, { .ident = MAC_MODEL_PB280C, @@ -788,7 +772,7 @@ static struct mac_model mac_data_table[] = { .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_DUO, .scc_type = MAC_SCC_QUADRA, - .nubus_type = MAC_NUBUS, + .expansion_type = MAC_EXP_NUBUS, .floppy_type = MAC_FLOPPY_SWIM_ADDR2, }, @@ -1100,14 +1084,12 @@ int __init mac_platform_init(void) * Ethernet device */ - switch (macintosh_config->ether_type) { - case MAC_ETHER_SONIC: + if (macintosh_config->ether_type == MAC_ETHER_SONIC || + macintosh_config->expansion_type == MAC_EXP_PDS_COMM) platform_device_register_simple("macsonic", -1, NULL, 0); - break; - case MAC_ETHER_MACE: + + if (macintosh_config->ether_type == MAC_ETHER_MACE) platform_device_register_simple("macmace", -1, NULL, 0); - break; - } return 0; } diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c index 313fe5e0184b..b922ab5cedea 100644 --- a/drivers/net/ethernet/natsemi/macsonic.c +++ b/drivers/net/ethernet/natsemi/macsonic.c @@ -311,7 +311,7 @@ static int mac_onboard_sonic_probe(struct net_device *dev) { struct sonic_local* lp = netdev_priv(dev); int sr; - int commslot = 0; + bool commslot = macintosh_config->expansion_type == MAC_EXP_PDS_COMM; if (!MACH_IS_MAC) return -ENODEV; @@ -322,10 +322,7 @@ static int mac_onboard_sonic_probe(struct net_device *dev) Ethernet (BTW, the Ethernet *is* always at the same address, and nothing else lives there, at least if Apple's documentation is to be believed) */ - if (macintosh_config->ident == MAC_MODEL_Q630 || - macintosh_config->ident == MAC_MODEL_P588 || - macintosh_config->ident == MAC_MODEL_P575 || - macintosh_config->ident == MAC_MODEL_C610) { + if (commslot || macintosh_config->ident == MAC_MODEL_C610) { int card_present; card_present = hwreg_present((void*)ONBOARD_SONIC_REGISTERS); @@ -333,7 +330,6 @@ static int mac_onboard_sonic_probe(struct net_device *dev) printk("none.\n"); return -ENODEV; } - commslot = 1; } printk("yes\n"); -- cgit v1.2.3 From 317b749e37789ecb3366f304dab905a97e650b41 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sat, 13 Jan 2018 17:44:31 -0500 Subject: m68k/mac: Fix race conditions in OSS interrupt dispatch The interrupt dispatch algorithm used in the OSS driver seems to be subject to race conditions: an IRQ flag could be lost if asserted between the MOV instructions from and to the interrupt flag register. But testing shows that the write to the flag register has no effect, so rewrite the algorithm without the theoretical race condition. There is a second theoretical race condition here. When oss_irq() is called with say, IPL == 2 it will invoke the SCSI interrupt handler. The SCSI IRQ is then cleared by the mac_scsi driver. If SCSI and NuBus IRQs are now asserted together, oss_irq() will be invoked with IPL == 3 and the mac_scsi interrupt handler can be re-entered. This re-entrance issue is not limited to SCSI and could affect NuBus and ADB drivers too. Fix it by splitting up oss_irq() into separate handlers for each IPL. No-one seems to know how OSS irq flags can be cleared, if at all, so add a comment to this effect (actually reinstate one I previously removed). Testing showed that a slot IRQ with no handler can remain asserted (in this case a Radius video card) without causing problems for other IRQs. Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Geert Uytterhoeven --- arch/m68k/mac/oss.c | 67 +++++++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/m68k/mac/oss.c b/arch/m68k/mac/oss.c index 3f81892527ad..921e6c092f2c 100644 --- a/arch/m68k/mac/oss.c +++ b/arch/m68k/mac/oss.c @@ -53,56 +53,41 @@ void __init oss_init(void) } /* - * Handle miscellaneous OSS interrupts. + * Handle OSS interrupts. + * XXX how do you clear a pending IRQ? is it even necessary? */ -static void oss_irq(struct irq_desc *desc) +static void oss_iopism_irq(struct irq_desc *desc) { - int events = oss->irq_pending & - (OSS_IP_IOPSCC | OSS_IP_SCSI | OSS_IP_IOPISM); - - if (events & OSS_IP_IOPSCC) { - oss->irq_pending &= ~OSS_IP_IOPSCC; - generic_handle_irq(IRQ_MAC_SCC); - } - - if (events & OSS_IP_SCSI) { - oss->irq_pending &= ~OSS_IP_SCSI; - generic_handle_irq(IRQ_MAC_SCSI); - } - - if (events & OSS_IP_IOPISM) { - oss->irq_pending &= ~OSS_IP_IOPISM; - generic_handle_irq(IRQ_MAC_ADB); - } + generic_handle_irq(IRQ_MAC_ADB); } -/* - * Nubus IRQ handler, OSS style - * - * Unlike the VIA/RBV this is on its own autovector interrupt level. - */ +static void oss_scsi_irq(struct irq_desc *desc) +{ + generic_handle_irq(IRQ_MAC_SCSI); +} static void oss_nubus_irq(struct irq_desc *desc) { - int events, irq_bit, i; + u16 events, irq_bit; + int irq_num; events = oss->irq_pending & OSS_IP_NUBUS; - if (!events) - return; - - /* There are only six slots on the OSS, not seven */ - - i = 6; - irq_bit = 0x40; + irq_num = NUBUS_SOURCE_BASE + 5; + irq_bit = OSS_IP_NUBUS5; do { - --i; - irq_bit >>= 1; if (events & irq_bit) { - oss->irq_pending &= ~irq_bit; - generic_handle_irq(NUBUS_SOURCE_BASE + i); + events &= ~irq_bit; + generic_handle_irq(irq_num); } - } while(events & (irq_bit - 1)); + --irq_num; + irq_bit >>= 1; + } while (events); +} + +static void oss_iopscc_irq(struct irq_desc *desc) +{ + generic_handle_irq(IRQ_MAC_SCC); } /* @@ -122,14 +107,14 @@ static void oss_nubus_irq(struct irq_desc *desc) void __init oss_register_interrupts(void) { - irq_set_chained_handler(OSS_IRQLEV_IOPISM, oss_irq); - irq_set_chained_handler(OSS_IRQLEV_SCSI, oss_irq); + irq_set_chained_handler(OSS_IRQLEV_IOPISM, oss_iopism_irq); + irq_set_chained_handler(OSS_IRQLEV_SCSI, oss_scsi_irq); irq_set_chained_handler(OSS_IRQLEV_NUBUS, oss_nubus_irq); - irq_set_chained_handler(OSS_IRQLEV_IOPSCC, oss_irq); + irq_set_chained_handler(OSS_IRQLEV_IOPSCC, oss_iopscc_irq); irq_set_chained_handler(OSS_IRQLEV_VIA1, via1_irq); /* OSS_VIA1 gets enabled here because it has no machspec interrupt. */ - oss->irq_level[OSS_VIA1] = IRQ_AUTO_6; + oss->irq_level[OSS_VIA1] = OSS_IRQLEV_VIA1; } /* -- cgit v1.2.3 From acfb3b883f6d6a4b5d27ad7fdded11f6a09ae6dd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Jan 2018 10:23:47 +0000 Subject: arm64: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls KVM doesn't follow the SMCCC when it comes to unimplemented calls, and inject an UNDEF instead of returning an error. Since firmware calls are now used for security mitigation, they are becoming more common, and the undef is counter productive. Instead, let's follow the SMCCC which states that -1 must be returned to the caller when getting an unknown function number. Cc: Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/handle_exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 304203fa9e33..e60494f1eef9 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -45,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_psci_call(vcpu); if (ret < 0) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } @@ -54,7 +54,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } -- cgit v1.2.3 From 838cda3697073982acd276ac43387b2a0aed04b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 16 Jan 2018 10:43:17 +0100 Subject: x86/PCI: Enable AMD 64-bit window on resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reenable the 64-bit window during resume. Fixes: fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") Reported-by: Tom St Denis Signed-off-by: Christian König Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index f6a26e3cb476..54ef19e90705 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -662,11 +662,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); */ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) { + static const char *name = "PCI Bus 0000:00"; + struct resource *res, *conflict; u32 base, limit, high; struct pci_dev *other; - struct resource *res; unsigned i; - int r; if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) return; @@ -707,21 +707,26 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) * Allocate a 256GB window directly below the 0xfd00000000 hardware * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). */ - res->name = "PCI Bus 0000:00"; + res->name = name; res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_WINDOW; res->start = 0xbd00000000ull; res->end = 0xfd00000000ull - 1; - r = request_resource(&iomem_resource, res); - if (r) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { kfree(res); - return; - } + if (conflict->name != name) + return; - dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", - res); - add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + /* We are resuming from suspend; just reenable the window */ + res = conflict; + } else { + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + pci_bus_add_resource(dev->bus, res, 0); + } base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; @@ -733,13 +738,16 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev) pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high); pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit); pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base); - - pci_bus_add_resource(dev->bus, res, 0); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); #endif -- cgit v1.2.3 From a2284d912bfc865cdca4c00488e08a3550f9a405 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Jan 2018 03:46:08 +0100 Subject: bpf, arm64: fix stack_depth tracking in combination with tail calls Using dynamic stack_depth tracking in arm64 JIT is currently broken in combination with tail calls. In prologue, we cache ctx->stack_size and adjust SP reg for setting up function call stack, and tearing it down again in epilogue. Problem is that when doing a tail call, the cached ctx->stack_size might not be the same. One way to fix the problem with minimal overhead is to re-adjust SP in emit_bpf_tail_call() and properly adjust it to the current program's ctx->stack_size. Tested on Cavium ThunderX ARMv8. Fixes: f1c9eed7f437 ("bpf, arm64: take advantage of stack_depth tracking") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ba38d403abb2..bb32f7f6dd0f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -148,7 +148,8 @@ static inline int epilogue_offset(const struct jit_ctx *ctx) /* Stack must be multiples of 16B */ #define STACK_ALIGN(sz) (((sz) + 15) & ~15) -#define PROLOGUE_OFFSET 8 +/* Tail call offset to jump into */ +#define PROLOGUE_OFFSET 7 static int build_prologue(struct jit_ctx *ctx) { @@ -200,19 +201,19 @@ static int build_prologue(struct jit_ctx *ctx) /* Initialize tail_call_cnt */ emit(A64_MOVZ(1, tcc, 0, 0), ctx); - /* 4 byte extra for skb_copy_bits buffer */ - ctx->stack_size = prog->aux->stack_depth + 4; - ctx->stack_size = STACK_ALIGN(ctx->stack_size); - - /* Set up function call stack */ - emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); - cur_offset = ctx->idx - idx0; if (cur_offset != PROLOGUE_OFFSET) { pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n", cur_offset, PROLOGUE_OFFSET); return -1; } + + /* 4 byte extra for skb_copy_bits buffer */ + ctx->stack_size = prog->aux->stack_depth + 4; + ctx->stack_size = STACK_ALIGN(ctx->stack_size); + + /* Set up function call stack */ + emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); return 0; } @@ -260,11 +261,12 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_LDR64(prg, tmp, prg), ctx); emit(A64_CBZ(1, prg, jmp_offset), ctx); - /* goto *(prog->bpf_func + prologue_size); */ + /* goto *(prog->bpf_func + prologue_offset); */ off = offsetof(struct bpf_prog, bpf_func); emit_a64_mov_i64(tmp, off, ctx); emit(A64_LDR64(tmp, prg, tmp), ctx); emit(A64_ADD_I(1, tmp, tmp, sizeof(u32) * PROLOGUE_OFFSET), ctx); + emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); emit(A64_BR(tmp), ctx); /* out: */ -- cgit v1.2.3 From 0d83620fd18e5b8d79d390e482583b379a6a986d Mon Sep 17 00:00:00 2001 From: Michael Cree Date: Wed, 3 Jan 2018 21:58:00 +1300 Subject: alpha: extend memset16 to EV6 optimised routines Commit 92ce4c3ea7c4, "alpha: add support for memset16", renamed the function memsetw() to be memset16() but neglected to do this for the EV6 optimised version, thus when building a kernel optimised for EV6 (or later) link errors result. This extends the memset16 support to EV6. Signed-off-by: Michael Cree Signed-off-by: Matt Turner --- arch/alpha/lib/ev6-memset.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S index 316a99aa9efe..1cfcfbbea6f0 100644 --- a/arch/alpha/lib/ev6-memset.S +++ b/arch/alpha/lib/ev6-memset.S @@ -18,7 +18,7 @@ * The algorithm for the leading and trailing quadwords remains the same, * however the loop has been unrolled to enable better memory throughput, * and the code has been replicated for each of the entry points: __memset - * and __memsetw to permit better scheduling to eliminate the stalling + * and __memset16 to permit better scheduling to eliminate the stalling * encountered during the mask replication. * A future enhancement might be to put in a byte store loop for really * small (say < 32 bytes) memset()s. Whether or not that change would be @@ -34,7 +34,7 @@ .globl memset .globl __memset .globl ___memset - .globl __memsetw + .globl __memset16 .globl __constant_c_memset .ent ___memset @@ -415,9 +415,9 @@ end: * to mask stalls. Note that entry point names also had to change */ .align 5 - .ent __memsetw + .ent __memset16 -__memsetw: +__memset16: .frame $30,0,$26,0 .prologue 0 @@ -596,8 +596,8 @@ end_w: nop ret $31,($26),1 # L0 : - .end __memsetw - EXPORT_SYMBOL(__memsetw) + .end __memset16 + EXPORT_SYMBOL(__memset16) memset = ___memset __memset = ___memset -- cgit v1.2.3 From 4fdec2034b7540dda461c6ba33325dfcff345c64 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 16 Jan 2018 16:42:25 +0100 Subject: x86/cpufeature: Move processor tracing out of scattered features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Processor tracing is already enumerated in word 9 (CPUID[7,0].EBX), so do not duplicate it in the scattered features word. Besides being more tidy, this will be useful for KVM when it presents processor tracing to the guests. KVM selects host features that are supported by both the host kernel (depending on command line options, CPU errata, or whatever) and KVM. Whenever a full feature word exists, KVM's code is written in the expectation that the CPUID bit number matches the X86_FEATURE_* bit number, but this is not the case for X86_FEATURE_INTEL_PT. Signed-off-by: Paolo Bonzini Cc: Borislav Petkov Cc: Linus Torvalds Cc: Luwei Kang Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1516117345-34561-1-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index aa09559b2c0b..25b9375c1484 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -206,7 +206,6 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ @@ -246,6 +245,7 @@ #define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46..d0e69769abfd 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,7 +21,6 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, -- cgit v1.2.3 From d47924417319e3b6a728c0b690f183e75bc2a702 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jan 2018 19:59:59 +0100 Subject: x86/intel_rdt/cqm: Prevent use after free intel_rdt_iffline_cpu() -> domain_remove_cpu() frees memory first and then proceeds accessing it. BUG: KASAN: use-after-free in find_first_bit+0x1f/0x80 Read of size 8 at addr ffff883ff7c1e780 by task cpuhp/31/195 find_first_bit+0x1f/0x80 has_busy_rmid+0x47/0x70 intel_rdt_offline_cpu+0x4b4/0x510 Freed by task 195: kfree+0x94/0x1a0 intel_rdt_offline_cpu+0x17d/0x510 Do the teardown first and then free memory. Fixes: 24247aeeabe9 ("x86/intel_rdt/cqm: Improve limbo list processing") Reported-by: Joseph Salisbury Signed-off-by: Thomas Gleixner Cc: Ravi Shankar Cc: Peter Zilstra Cc: Stephane Eranian Cc: Vikas Shivappa Cc: Andi Kleen Cc: "Roderick W. Smith" Cc: 1733662@bugs.launchpad.net Cc: Fenghua Yu Cc: Tony Luck Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801161957510.2366@nanos --- arch/x86/kernel/cpu/intel_rdt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 88dcf8479013..99442370de40 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -525,10 +525,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); - kfree(d->ctrl_val); - kfree(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); list_del(&d->list); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -545,6 +541,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); kfree(d); return; } -- cgit v1.2.3 From 45d55e7bac4028af93f5fa324e69958a0b868e96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jan 2018 12:20:18 +0100 Subject: x86/apic/vector: Fix off by one in error path Keith reported the following warning: WARNING: CPU: 28 PID: 1420 at kernel/irq/matrix.c:222 irq_matrix_remove_managed+0x10f/0x120 x86_vector_free_irqs+0xa1/0x180 x86_vector_alloc_irqs+0x1e4/0x3a0 msi_domain_alloc+0x62/0x130 The reason for this is that if the vector allocation fails the error handling code tries to free the failed vector as well, which causes the above imbalance warning to trigger. Adjust the error path to handle this correctly. Fixes: b5dc8e6c21e7 ("x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors") Reported-by: Keith Busch Signed-off-by: Thomas Gleixner Tested-by: Keith Busch Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801161217300.1823@nanos --- arch/x86/kernel/apic/vector.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f8b03bb8e725..3cc471beb50b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -542,14 +542,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, err = assign_irq_vector_policy(irqd, info); trace_vector_setup(virq + i, false, err); - if (err) + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } -- cgit v1.2.3 From fd6e440f20b1a4304553775fc55938848ff617c9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 21:20:05 +1100 Subject: powerpc/64s: Wire up cpu_show_meltdown() The recent commit 87590ce6e373 ("sysfs/cpu: Add vulnerability folder") added a generic folder and set of files for reporting information on CPU vulnerabilities. One of those was for meltdown: /sys/devices/system/cpu/vulnerabilities/meltdown This commit wires up that file for 64-bit Book3S powerpc. For now we default to "Vulnerable" unless the RFI flush is enabled. That may not actually be true on all hardware, further patches will refine the reporting based on the CPU/platform etc. But for now we default to being pessimists. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c51e6ce42e7a..2ed525a44734 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -166,6 +166,7 @@ config PPC select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_SMP_IDLE_THREAD diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 491be4179ddd..624d2a62d05d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -901,4 +901,12 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) if (!no_rfi_flush) rfi_flush_enable(enable); } + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (rfi_flush) + return sprintf(buf, "Mitigation: RFI Flush\n"); + + return sprintf(buf, "Vulnerable\n"); +} #endif /* CONFIG_PPC_BOOK3S_64 */ -- cgit v1.2.3 From 236003e6b5443c45c18e613d2b0d776a9f87540e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 22:17:18 +1100 Subject: powerpc/64s: Allow control of RFI flush via debugfs Expose the state of the RFI flush (enabled/disabled) via debugfs, and allow it to be enabled/disabled at runtime. eg: $ cat /sys/kernel/debug/powerpc/rfi_flush 1 $ echo 0 > /sys/kernel/debug/powerpc/rfi_flush $ cat /sys/kernel/debug/powerpc/rfi_flush 0 Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/setup_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 624d2a62d05d..e67413f4a8f0 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -902,6 +903,35 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) rfi_flush_enable(enable); } +#ifdef CONFIG_DEBUG_FS +static int rfi_flush_set(void *data, u64 val) +{ + if (val == 1) + rfi_flush_enable(true); + else if (val == 0) + rfi_flush_enable(false); + else + return -EINVAL; + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); +#endif + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { if (rfi_flush) -- cgit v1.2.3 From 1b689a95ce7427075f9ac9fb4aea1af530742b7f Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 15 Jan 2018 14:30:03 +0100 Subject: powerpc/pseries: include linux/types.h in asm/hvcall.h Commit 6e032b350cd1 ("powerpc/powernv: Check device-tree for RFI flush settings") uses u64 in asm/hvcall.h without including linux/types.h This breaks hvcall.h users that do not include the header themselves. Fixes: 6e032b350cd1 ("powerpc/powernv: Check device-tree for RFI flush settings") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hvcall.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index f0461618bf7b..eca3f9c68907 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -353,6 +353,7 @@ #define PROC_TABLE_GTSE 0x01 #ifndef __ASSEMBLY__ +#include /** * plpar_hcall_norets: - Make a pseries hypervisor call with no return arguments -- cgit v1.2.3 From 37b95951c58fdf08dc10afa9d02066ed9f176fb5 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Tue, 16 Jan 2018 17:34:07 +0800 Subject: KVM/x86: Fix wrong macro references of X86_CR0_PG_BIT and X86_CR4_PAE_BIT in kvm_valid_sregs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_valid_sregs() should use X86_CR0_PG and X86_CR4_PAE to check bit status rather than X86_CR0_PG_BIT and X86_CR4_PAE_BIT. This patch is to fix it. Fixes: f29810335965a(KVM/x86: Check input paging mode when cs.l is set) Reported-by: Jeremi Piotrowski Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Tianyu Lan Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1cec2c62a0b0..c53298dfbf50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7496,13 +7496,13 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG_BIT)) { + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. */ - if (!(sregs->cr4 & X86_CR4_PAE_BIT) + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return -EINVAL; } else { -- cgit v1.2.3 From e9062481824384f00299971f923fecf6b3668001 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 11:35:15 +0000 Subject: ARM: net: bpf: avoid 'bx' instruction on non-Thumb capable CPUs Avoid the 'bx' instruction on CPUs that have no support for Thumb and thus do not implement this instruction by moving the generation of this opcode to a separate function that selects between: bx reg and mov pc, reg according to the capabilities of the CPU. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index c199990e12b6..4efb3743a89e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -285,16 +285,20 @@ static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx) emit_mov_i_no8m(rd, val, ctx); } -static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) +static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx) { - ctx->seen |= SEEN_CALL; -#if __LINUX_ARM_ARCH__ < 5 - emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); - if (elf_hwcap & HWCAP_THUMB) emit(ARM_BX(tgt_reg), ctx); else emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); +} + +static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) +{ + ctx->seen |= SEEN_CALL; +#if __LINUX_ARM_ARCH__ < 5 + emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); + emit_bx_r(tgt_reg, ctx); #else emit(ARM_BLX_R(tgt_reg), ctx); #endif @@ -997,7 +1001,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit_a32_mov_i(tmp2[1], off, false, ctx); emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx); emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); - emit(ARM_BX(tmp[1]), ctx); + emit_bx_r(tmp[1], ctx); /* out: */ if (out_offset == -1) @@ -1166,7 +1170,7 @@ static void build_epilogue(struct jit_ctx *ctx) emit(ARM_POP(reg_set), ctx); /* Return back to the callee function */ if (!(ctx->seen & SEEN_CALL)) - emit(ARM_BX(ARM_LR), ctx); + emit_bx_r(ARM_LR, ctx); #endif } -- cgit v1.2.3 From f4483f2cc1fdc03488c8a1452e545545ae5bda93 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 11:39:54 +0000 Subject: ARM: net: bpf: fix tail call jumps When a tail call fails, it is documented that the tail call should continue execution at the following instruction. An example tail call sequence is: 12: (85) call bpf_tail_call#12 13: (b7) r0 = 0 14: (95) exit The ARM assembler for the tail call in this case ends up branching to instruction 14 instead of instruction 13, resulting in the BPF filter returning a non-zero value: 178: ldr r8, [sp, #588] ; insn 12 17c: ldr r6, [r8, r6] 180: ldr r8, [sp, #580] 184: cmp r8, r6 188: bcs 0x1e8 18c: ldr r6, [sp, #524] 190: ldr r7, [sp, #528] 194: cmp r7, #0 198: cmpeq r6, #32 19c: bhi 0x1e8 1a0: adds r6, r6, #1 1a4: adc r7, r7, #0 1a8: str r6, [sp, #524] 1ac: str r7, [sp, #528] 1b0: mov r6, #104 1b4: ldr r8, [sp, #588] 1b8: add r6, r8, r6 1bc: ldr r8, [sp, #580] 1c0: lsl r7, r8, #2 1c4: ldr r6, [r6, r7] 1c8: cmp r6, #0 1cc: beq 0x1e8 1d0: mov r8, #32 1d4: ldr r6, [r6, r8] 1d8: add r6, r6, #44 1dc: bx r6 1e0: mov r0, #0 ; insn 13 1e4: mov r1, #0 1e8: add sp, sp, #596 ; insn 14 1ec: pop {r4, r5, r6, r7, r8, sl, pc} For other sequences, the tail call could end up branching midway through the following BPF instructions, or maybe off the end of the function, leading to unknown behaviours. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 4efb3743a89e..ce36d2cab50c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -949,7 +949,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) const u8 *tcc = bpf2a32[TCALL_CNT]; const int idx0 = ctx->idx; #define cur_offset (ctx->idx - idx0) -#define jmp_offset (out_offset - (cur_offset)) +#define jmp_offset (out_offset - (cur_offset) - 2) u32 off, lo, hi; /* if (index >= array->map.max_entries) -- cgit v1.2.3 From d1220efd23484c72c82d5471f05daeb35b5d1916 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 16:10:07 +0000 Subject: ARM: net: bpf: fix stack alignment As per 2dede2d8e925 ("ARM EABI: stack pointer must be 64-bit aligned after a CPU exception") the stack should be aligned to a 64-bit boundary on EABI systems. Ensure that the eBPF JIT appropraitely aligns the stack. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index ce36d2cab50c..d00a0eb0386e 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -179,8 +179,13 @@ static void jit_fill_hole(void *area, unsigned int size) *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); } -/* Stack must be multiples of 16 Bytes */ -#define STACK_ALIGN(sz) (((sz) + 3) & ~3) +#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) +/* EABI requires the stack to be aligned to 64-bit boundaries */ +#define STACK_ALIGNMENT 8 +#else +/* Stack must be aligned to 32-bit boundaries */ +#define STACK_ALIGNMENT 4 +#endif /* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, @@ -194,7 +199,7 @@ static void jit_fill_hole(void *area, unsigned int size) + SCRATCH_SIZE + \ + 4 /* extra for skb_copy_bits buffer */) -#define STACK_SIZE STACK_ALIGN(_STACK_SIZE) +#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT) /* Get the offset of eBPF REGISTERs stored on scratch space. */ #define STACK_VAR(off) (STACK_SIZE-off-4) -- cgit v1.2.3 From 70ec3a6c2c11e4b0e107a65de943a082f9aff351 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 21:26:14 +0000 Subject: ARM: net: bpf: move stack documentation Move the stack documentation towards the top of the file, where it's relevant for things like the register layout. Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index d00a0eb0386e..e90229d58c77 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -27,6 +27,27 @@ int bpf_jit_enable __read_mostly; +/* + * eBPF prog stack layout + * + * high + * original ARM_SP => +-----+ eBPF prologue + * |FP/LR| + * current ARM_FP => +-----+ + * | ... | callee saved registers + * eBPF fp register => +-----+ <= (BPF_FP) + * | ... | eBPF JIT scratch space + * | | eBPF prog stack + * +-----+ + * |RSVD | JIT scratchpad + * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE) + * | | + * | ... | Function call stack + * | | + * +-----+ + * low + */ + #define STACK_OFFSET(k) (k) #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ @@ -1091,27 +1112,6 @@ static void build_prologue(struct jit_ctx *ctx) u16 reg_set = 0; - /* - * eBPF prog stack layout - * - * high - * original ARM_SP => +-----+ eBPF prologue - * |FP/LR| - * current ARM_FP => +-----+ - * | ... | callee saved registers - * eBPF fp register => +-----+ <= (BPF_FP) - * | ... | eBPF JIT scratch space - * | | eBPF prog stack - * +-----+ - * |RSVD | JIT scratchpad - * current A64_SP => +-----+ <= (BPF_FP - STACK_SIZE) - * | | - * | ... | Function call stack - * | | - * +-----+ - * low - */ - /* Save callee saved registers. */ reg_set |= (1< Date: Sat, 13 Jan 2018 22:51:27 +0000 Subject: ARM: net: bpf: correct stack layout documentation The stack layout documentation incorrectly suggests that the BPF JIT scratch space starts immediately below BPF_FP. This is not correct, so let's fix the documentation to reflect reality. Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index e90229d58c77..dcb3181e85f3 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -28,24 +28,43 @@ int bpf_jit_enable __read_mostly; /* - * eBPF prog stack layout + * eBPF prog stack layout: * * high - * original ARM_SP => +-----+ eBPF prologue - * |FP/LR| - * current ARM_FP => +-----+ - * | ... | callee saved registers - * eBPF fp register => +-----+ <= (BPF_FP) + * original ARM_SP => +-----+ + * | | callee saved registers + * +-----+ <= (BPF_FP + SCRATCH_SIZE) * | ... | eBPF JIT scratch space - * | | eBPF prog stack + * eBPF fp register => +-----+ + * (BPF_FP) | ... | eBPF prog stack * +-----+ * |RSVD | JIT scratchpad - * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE) + * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) * | | * | ... | Function call stack * | | * +-----+ * low + * + * The callee saved registers depends on whether frame pointers are enabled. + * With frame pointers (to be compliant with the ABI): + * + * high + * original ARM_SP => +------------------+ \ + * | pc | | + * current ARM_FP => +------------------+ } callee saved registers + * |r4-r8,r10,fp,ip,lr| | + * +------------------+ / + * low + * + * Without frame pointers: + * + * high + * original ARM_SP => +------------------+ + * | lr | (optional) + * | r4-r8,r10 | callee saved registers + * +------------------+ + * low */ #define STACK_OFFSET(k) (k) -- cgit v1.2.3 From 02088d9b392f605c892894b46aa8c83e3abd0115 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 22:38:18 +0000 Subject: ARM: net: bpf: fix register saving When an eBPF program tail-calls another eBPF program, it enters it after the prologue to avoid having complex stack manipulations. This can lead to kernel oopses, and similar. Resolve this by always using a fixed stack layout, a CPU register frame pointer, and using this when reloading registers before returning. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 80 +++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 58 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index dcb3181e85f3..95bb3f896c8f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -61,20 +61,24 @@ int bpf_jit_enable __read_mostly; * * high * original ARM_SP => +------------------+ - * | lr | (optional) - * | r4-r8,r10 | callee saved registers - * +------------------+ + * | r4-r8,r10,fp,lr | callee saved registers + * current ARM_FP => +------------------+ * low + * + * When popping registers off the stack at the end of a BPF function, we + * reference them via the current ARM_FP register. */ +#define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ + 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R10 | \ + 1 << ARM_FP) +#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) +#define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) #define STACK_OFFSET(k) (k) #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ #define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ -/* Flags used for JIT optimization */ -#define SEEN_CALL (1 << 0) - #define FLAG_IMM_OVERFLOW (1 << 0) /* @@ -135,7 +139,6 @@ static const u8 bpf2a32[][2] = { * idx : index of current last JITed instruction. * prologue_bytes : bytes used in prologue. * epilogue_offset : offset of epilogue starting. - * seen : bit mask used for JIT optimization. * offsets : array of eBPF instruction offsets in * JITed code. * target : final JITed code. @@ -150,7 +153,6 @@ struct jit_ctx { unsigned int idx; unsigned int prologue_bytes; unsigned int epilogue_offset; - u32 seen; u32 flags; u32 *offsets; u32 *target; @@ -340,7 +342,6 @@ static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx) static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) { - ctx->seen |= SEEN_CALL; #if __LINUX_ARM_ARCH__ < 5 emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); emit_bx_r(tgt_reg, ctx); @@ -403,7 +404,6 @@ static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) } /* Call appropriate function */ - ctx->seen |= SEEN_CALL; emit_mov_i(ARM_IP, op == BPF_DIV ? (u32)jit_udiv32 : (u32)jit_mod32, ctx); emit_blx_r(ARM_IP, ctx); @@ -669,8 +669,6 @@ static inline void emit_a32_lsh_r64(const u8 dst[], const u8 src[], bool dstk, /* Do LSH operation */ emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx); emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx); @@ -705,8 +703,6 @@ static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk, /* Do the ARSH operation */ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); _emit(ARM_COND_MI, ARM_B(0), ctx); @@ -741,8 +737,6 @@ static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk, /* Do LSH operation */ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx); emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx); @@ -877,8 +871,6 @@ static inline void emit_a32_mul_r64(const u8 dst[], const u8 src[], bool dstk, /* Do Multiplication */ emit(ARM_MUL(ARM_IP, rd, rn), ctx); emit(ARM_MUL(ARM_LR, rm, rt), ctx); - /* As we are using ARM_LR */ - ctx->seen |= SEEN_CALL; emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx); @@ -955,7 +947,6 @@ static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, const u8 rn, struct jit_ctx *ctx, u8 op) { switch (op) { case BPF_JSET: - ctx->seen |= SEEN_CALL; emit(ARM_AND_R(ARM_IP, rt, rn), ctx); emit(ARM_AND_R(ARM_LR, rd, rm), ctx); emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); @@ -1119,33 +1110,22 @@ static void build_prologue(struct jit_ctx *ctx) const u8 r2 = bpf2a32[BPF_REG_1][1]; const u8 r3 = bpf2a32[BPF_REG_1][0]; const u8 r4 = bpf2a32[BPF_REG_6][1]; - const u8 r5 = bpf2a32[BPF_REG_6][0]; - const u8 r6 = bpf2a32[TMP_REG_1][1]; - const u8 r7 = bpf2a32[TMP_REG_1][0]; - const u8 r8 = bpf2a32[TMP_REG_2][1]; - const u8 r10 = bpf2a32[TMP_REG_2][0]; const u8 fplo = bpf2a32[BPF_REG_FP][1]; const u8 fphi = bpf2a32[BPF_REG_FP][0]; - const u8 sp = ARM_SP; const u8 *tcc = bpf2a32[TCALL_CNT]; - u16 reg_set = 0; - /* Save callee saved registers. */ - reg_set |= (1<seen & SEEN_CALL) - reg_set |= (1<stack_size = imm8m(STACK_SIZE); @@ -1168,33 +1148,19 @@ static void build_prologue(struct jit_ctx *ctx) /* end of prologue */ } +/* restore callee saved registers. */ static void build_epilogue(struct jit_ctx *ctx) { - const u8 r4 = bpf2a32[BPF_REG_6][1]; - const u8 r5 = bpf2a32[BPF_REG_6][0]; - const u8 r6 = bpf2a32[TMP_REG_1][1]; - const u8 r7 = bpf2a32[TMP_REG_1][0]; - const u8 r8 = bpf2a32[TMP_REG_2][1]; - const u8 r10 = bpf2a32[TMP_REG_2][0]; - u16 reg_set = 0; - - /* unwind function call stack */ - emit(ARM_ADD_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); - - /* restore callee saved registers. */ - reg_set |= (1<seen & SEEN_CALL) - reg_set |= (1<seen & SEEN_CALL)) - emit_bx_r(ARM_LR, ctx); + emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx); + emit(ARM_POP(CALLEE_POP_MASK), ctx); #endif } @@ -1422,8 +1388,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit_rev32(rt, rt, ctx); goto emit_bswap_uxt; case 64: - /* Because of the usage of ARM_LR */ - ctx->seen |= SEEN_CALL; emit_rev32(ARM_LR, rt, ctx); emit_rev32(rt, rd, ctx); emit(ARM_MOV_R(rd, ARM_LR), ctx); -- cgit v1.2.3 From ec19e02b343db991d2d1610c409efefebf4e2ca9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 21:06:16 +0000 Subject: ARM: net: bpf: fix LDX instructions When the source and destination register are identical, our JIT does not generate correct code, which leads to kernel oopses. Fix this by (a) generating more efficient code, and (b) making use of the temporary earlier if we will overwrite the address register. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 61 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 95bb3f896c8f..715e7250de86 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -913,33 +913,53 @@ static inline void emit_str_r(const u8 dst, const u8 src, bool dstk, } /* dst = *(size*)(src + off) */ -static inline void emit_ldx_r(const u8 dst, const u8 src, bool dstk, - const s32 off, struct jit_ctx *ctx, const u8 sz){ +static inline void emit_ldx_r(const u8 dst[], const u8 src, bool dstk, + s32 off, struct jit_ctx *ctx, const u8 sz){ const u8 *tmp = bpf2a32[TMP_REG_1]; - u8 rd = dstk ? tmp[1] : dst; + const u8 *rd = dstk ? tmp : dst; u8 rm = src; + s32 off_max; - if (off) { + if (sz == BPF_H) + off_max = 0xff; + else + off_max = 0xfff; + + if (off < 0 || off > off_max) { emit_a32_mov_i(tmp[0], off, false, ctx); emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); rm = tmp[0]; + off = 0; + } else if (rd[1] == rm) { + emit(ARM_MOV_R(tmp[0], rm), ctx); + rm = tmp[0]; } switch (sz) { - case BPF_W: - /* Load a Word */ - emit(ARM_LDR_I(rd, rm, 0), ctx); + case BPF_B: + /* Load a Byte */ + emit(ARM_LDRB_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); break; case BPF_H: /* Load a HalfWord */ - emit(ARM_LDRH_I(rd, rm, 0), ctx); + emit(ARM_LDRH_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); break; - case BPF_B: - /* Load a Byte */ - emit(ARM_LDRB_I(rd, rm, 0), ctx); + case BPF_W: + /* Load a Word */ + emit(ARM_LDR_I(rd[1], rm, off), ctx); + emit_a32_mov_i(dst[0], 0, dstk, ctx); + break; + case BPF_DW: + /* Load a Double Word */ + emit(ARM_LDR_I(rd[1], rm, off), ctx); + emit(ARM_LDR_I(rd[0], rm, off + 4), ctx); break; } if (dstk) - emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx); + emit(ARM_STR_I(rd[1], ARM_SP, STACK_VAR(dst[1])), ctx); + if (dstk && sz == BPF_DW) + emit(ARM_STR_I(rd[0], ARM_SP, STACK_VAR(dst[0])), ctx); } /* Arithmatic Operation */ @@ -1440,22 +1460,7 @@ exit: rn = sstk ? tmp2[1] : src_lo; if (sstk) emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx); - switch (BPF_SIZE(code)) { - case BPF_W: - /* Load a Word */ - case BPF_H: - /* Load a Half-Word */ - case BPF_B: - /* Load a Byte */ - emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_SIZE(code)); - emit_a32_mov_i(dst_hi, 0, dstk, ctx); - break; - case BPF_DW: - /* Load a double word */ - emit_ldx_r(dst_lo, rn, dstk, off, ctx, BPF_W); - emit_ldx_r(dst_hi, rn, dstk, off+4, ctx, BPF_W); - break; - } + emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code)); break; /* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */ case BPF_LD | BPF_ABS | BPF_W: -- cgit v1.2.3 From 091f02483df7b56615b524491f404e574c5e0668 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 13 Jan 2018 12:11:26 +0000 Subject: ARM: net: bpf: clarify tail_call index As per 90caccdd8cc0 ("bpf: fix bpf_tail_call() x64 JIT"), the index used for array lookup is defined to be 32-bit wide. Update a misleading comment that suggests it is 64-bit wide. Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler") Signed-off-by: Russell King --- arch/arm/net/bpf_jit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 715e7250de86..323a4df59a6c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1016,7 +1016,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit_a32_mov_i(tmp[1], off, false, ctx); emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx); emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx); - /* index (64 bit) */ + /* index is 32-bit for arrays */ emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx); /* index >= array->map.max_entries */ emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx); -- cgit v1.2.3 From a511e7935378ef1f321456a90beae2a2632d3d83 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 20 Dec 2017 14:57:21 -0800 Subject: x86/intel_rdt: Enumerate L2 Code and Data Prioritization (CDP) feature L2 Code and Data Prioritization (CDP) is enumerated in CPUID(EAX=0x10, ECX=0x2):ECX.bit2 Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: Vikas" Cc: Sai Praneeth" Cc: Reinette" Link: https://lkml.kernel.org/r/1513810644-78015-4-git-send-email-fenghua.yu@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 25b9375c1484..67bbfaa1448b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -206,6 +206,7 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d0e69769abfd..df4d8f7595a5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, -- cgit v1.2.3 From def10853930a82456ab862a3a8292a3a16c386e7 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 20 Dec 2017 14:57:22 -0800 Subject: x86/intel_rdt: Add two new resources for L2 Code and Data Prioritization (CDP) L2 data and L2 code are added as new resources in rdt_resources_all[] and data in the resources are configured. When L2 CDP is enabled, the schemata will have the two resources in this format: L2DATA:l2id0=xxxx;l2id1=xxxx;.... L2CODE:l2id0=xxxx;l2id1=xxxx;.... xxxx represent CBM (Cache Bit Mask) values in the schemata, similar to all others (L2 CAT/L3 CAT/L3 CDP). Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: Vikas" Cc: Sai Praneeth" Cc: Reinette" Link: https://lkml.kernel.org/r/1513810644-78015-5-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 66 ++++++++++++++++++++++++++++++++++------- arch/x86/kernel/cpu/intel_rdt.h | 2 ++ 2 files changed, 58 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 99442370de40..5202da08fd6f 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = { .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2DATA] = + { + .rid = RDT_RESOURCE_L2DATA, + .name = "L2DATA", + .domains = domain_init(RDT_RESOURCE_L2DATA), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L2CODE] = + { + .rid = RDT_RESOURCE_L2CODE, + .name = "L2CODE", + .domains = domain_init(RDT_RESOURCE_L2CODE), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, [RDT_RESOURCE_MBA] = { .rid = RDT_RESOURCE_MBA, @@ -259,15 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->alloc_enabled = true; } -static void rdt_get_cdp_l3_config(int type) +static void rdt_get_cdp_config(int level, int type) { - struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r_l = &rdt_resources_all[level]; struct rdt_resource *r = &rdt_resources_all[type]; - r->num_closid = r_l3->num_closid / 2; - r->cache.cbm_len = r_l3->cache.cbm_len; - r->default_ctrl = r_l3->default_ctrl; - r->cache.shareable_bits = r_l3->cache.shareable_bits; + r->num_closid = r_l->num_closid / 2; + r->cache.cbm_len = r_l->cache.cbm_len; + r->default_ctrl = r_l->default_ctrl; + r->cache.shareable_bits = r_l->cache.shareable_bits; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; /* @@ -277,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type) r->alloc_enabled = false; } +static void rdt_get_cdp_l3_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); +} + +static void rdt_get_cdp_l2_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); +} + static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -729,15 +775,15 @@ static __init bool get_rdt_alloc_resources(void) if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) + rdt_get_cdp_l3_config(); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + if (rdt_cpu_has(X86_FEATURE_CDP_L2)) + rdt_get_cdp_l2_config(); ret = true; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 3397244984f5..19ffc5a7c116 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -357,6 +357,8 @@ enum { RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, + RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE, RDT_RESOURCE_MBA, /* Must be the last */ -- cgit v1.2.3 From 99adde9b370de8e07ef76630c6f60dbf586cdf0e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 20 Dec 2017 14:57:23 -0800 Subject: x86/intel_rdt: Enable L2 CDP in MSR IA32_L2_QOS_CFG Bit 0 in MSR IA32_L2_QOS_CFG (0xc82) is L2 CDP enable bit. By default, the bit is zero, i.e. L2 CAT is enabled, and L2 CDP is disabled. When the resctrl mount parameter "cdpl2" is given, the bit is set to 1 and L2 CDP is enabled. In L2 CDP mode, the L2 CAT mask MSRs are re-mapped into interleaved pairs of mask MSRs for code (referenced by an odd CLOSID) and data (referenced by an even CLOSID). Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: Vikas" Cc: Sai Praneeth" Cc: Reinette" Link: https://lkml.kernel.org/r/1513810644-78015-6-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel_rdt.h | 3 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 117 ++++++++++++++++++++++++------- 2 files changed, 94 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 19ffc5a7c116..3fd7a70ee04a 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -7,12 +7,15 @@ #include #define IA32_L3_QOS_CFG 0xc81 +#define IA32_L2_QOS_CFG 0xc82 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 #define IA32_MBA_THRTL_BASE 0xd50 #define L3_QOS_CDP_ENABLE 0x01ULL +#define L2_QOS_CDP_ENABLE 0x01ULL + /* * Event IDs are used to program IA32_QM_EVTSEL before reading event * counter from IA32_QM_CTR diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 64c5ff97ee0d..bdab7d2f51af 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -990,6 +990,7 @@ out_destroy: kernfs_remove(kn); return ret; } + static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -997,8 +998,17 @@ static void l3_qos_cfg_update(void *arg) wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } -static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +static void l2_qos_cfg_update(void *arg) { + bool *enable = arg; + + wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); +} + +static int set_cache_qos_cfg(int level, bool enable) +{ + void (*update)(void *arg); + struct rdt_resource *r_l; cpumask_var_t cpu_mask; struct rdt_domain *d; int cpu; @@ -1006,16 +1016,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - list_for_each_entry(d, &r->domains, list) { + if (level == RDT_RESOURCE_L3) + update = l3_qos_cfg_update; + else if (level == RDT_RESOURCE_L2) + update = l2_qos_cfg_update; + else + return -EINVAL; + + r_l = &rdt_resources_all[level]; + list_for_each_entry(d, &r_l->domains, list) { /* Pick one CPU from each domain instance to update MSR */ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - l3_qos_cfg_update(&enable); + update(&enable); /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + smp_call_function_many(cpu_mask, update, &enable, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -1023,52 +1041,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) return 0; } -static int cdp_enable(void) +static int cdp_enable(int level, int data_type, int code_type) { - struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; - struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; - struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; + struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; + struct rdt_resource *r_l = &rdt_resources_all[level]; int ret; - if (!r_l3->alloc_capable || !r_l3data->alloc_capable || - !r_l3code->alloc_capable) + if (!r_l->alloc_capable || !r_ldata->alloc_capable || + !r_lcode->alloc_capable) return -EINVAL; - ret = set_l3_qos_cfg(r_l3, true); + ret = set_cache_qos_cfg(level, true); if (!ret) { - r_l3->alloc_enabled = false; - r_l3data->alloc_enabled = true; - r_l3code->alloc_enabled = true; + r_l->alloc_enabled = false; + r_ldata->alloc_enabled = true; + r_lcode->alloc_enabled = true; } return ret; } -static void cdp_disable(void) +static int cdpl3_enable(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE); +} + +static int cdpl2_enable(void) +{ + return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE); +} + +static void cdp_disable(int level, int data_type, int code_type) +{ + struct rdt_resource *r = &rdt_resources_all[level]; r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; - set_l3_qos_cfg(r, false); + if (rdt_resources_all[data_type].alloc_enabled) { + rdt_resources_all[data_type].alloc_enabled = false; + rdt_resources_all[code_type].alloc_enabled = false; + set_cache_qos_cfg(level, false); } } +static void cdpl3_disable(void) +{ + cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); +} + +static void cdpl2_disable(void) +{ + cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); +} + +static void cdp_disable_all(void) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + cdpl3_disable(); + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + cdpl2_disable(); +} + static int parse_rdtgroupfs_options(char *data) { char *token, *o = data; int ret = 0; while ((token = strsep(&o, ",")) != NULL) { - if (!*token) - return -EINVAL; + if (!*token) { + ret = -EINVAL; + goto out; + } - if (!strcmp(token, "cdp")) - ret = cdp_enable(); + if (!strcmp(token, "cdp")) { + ret = cdpl3_enable(); + if (ret) + goto out; + } else if (!strcmp(token, "cdpl2")) { + ret = cdpl2_enable(); + if (ret) + goto out; + } else { + ret = -EINVAL; + goto out; + } } + return 0; + +out: + pr_err("Invalid mount option \"%s\"\n", token); + return ret; } @@ -1223,7 +1288,7 @@ out_mongrp: out_info: kernfs_remove(kn_info); out_cdp: - cdp_disable(); + cdp_disable_all(); out: rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); @@ -1383,7 +1448,7 @@ static void rdt_kill_sb(struct super_block *sb) /*Put everything back to default values. */ for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); - cdp_disable(); + cdp_disable_all(); rmdir_all_sub(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); -- cgit v1.2.3 From 31516de306c0c9235156cdc7acb976ea21f1f646 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 20 Dec 2017 14:57:24 -0800 Subject: x86/intel_rdt: Add command line parameter to control L2_CDP L2 CDP can be controlled by kernel parameter "rdt=". If "rdt=l2cdp", L2 CDP is turned on. If "rdt=!l2cdp", L2 CDP is turned off. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: Vikas" Cc: Sai Praneeth" Cc: Reinette" Link: https://lkml.kernel.org/r/1513810644-78015-7-git-send-email-fenghua.yu@intel.com --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- arch/x86/kernel/cpu/intel_rdt.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 46b26bfee27b..fde058ca8419 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3682,7 +3682,8 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: - cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba. + cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, + mba. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5202da08fd6f..410629f10ad3 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -691,6 +691,7 @@ enum { RDT_FLAG_L3_CAT, RDT_FLAG_L3_CDP, RDT_FLAG_L2_CAT, + RDT_FLAG_L2_CDP, RDT_FLAG_MBA, }; @@ -713,6 +714,7 @@ static struct rdt_options rdt_options[] __initdata = { RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) -- cgit v1.2.3 From f23d74f6c66c3697e032550eeef3f640391a3a7d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 17 Jan 2018 17:41:41 -0600 Subject: x86/mm: Rework wbinvd, hlt operation in stop_this_cpu() Some issues have been reported with the for loop in stop_this_cpu() that issues the 'wbinvd; hlt' sequence. Reverting this sequence to halt() has been shown to resolve the issue. However, the wbinvd is needed when running with SME. The reason for the wbinvd is to prevent cache flush races between encrypted and non-encrypted entries that have the same physical address. This can occur when kexec'ing from memory encryption active to inactive or vice-versa. The important thing is to not have outside of kernel text memory references (such as stack usage), so the usage of the native_*() functions is needed since these expand as inline asm sequences. So instead of reverting the change, rework the sequence. Move the wbinvd instruction outside of the for loop as native_wbinvd() and make its execution conditional on X86_FEATURE_SME. In the for loop, change the asm 'wbinvd; hlt' sequence back to a halt sequence but use the native_halt() call. Fixes: bba4ed011a52 ("x86/mm, kexec: Allow kexec to be used with SME") Reported-by: Dave Young Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Tested-by: Dave Young Cc: Juergen Gross Cc: Tony Luck Cc: Yu Chen Cc: Baoquan He Cc: Linus Torvalds Cc: kexec@lists.infradead.org Cc: ebiederm@redhat.com Cc: Borislav Petkov Cc: Rui Zhang Cc: Arjan van de Ven Cc: Boris Ostrovsky Cc: Dan Williams Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180117234141.21184.44067.stgit@tlendack-t1.amdoffice.net --- arch/x86/kernel/process.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 832a6acd730f..cb368c2a22ab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -380,19 +380,24 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + /* + * Use wbinvd on processors that support SME. This provides support + * for performing a successful kexec when going from SME inactive + * to SME active (or vice-versa). The cache must be cleared so that + * if there are entries with the same physical address, both with and + * without the encryption bit, they don't race each other when flushed + * and potentially end up with the wrong entry being committed to + * memory. + */ + if (boot_cpu_has(X86_FEATURE_SME)) + native_wbinvd(); for (;;) { /* - * Use wbinvd followed by hlt to stop the processor. This - * provides support for kexec on a processor that supports - * SME. With kexec, going from SME inactive to SME active - * requires clearing cache entries so that addresses without - * the encryption bit set don't corrupt the same physical - * address that has the encryption bit set when caches are - * flushed. To achieve this a wbinvd is performed followed by - * a hlt. Even if the processor is not in the kexec/SME - * scenario this only adds a wbinvd to a halting processor. + * Use native_halt() so that memory contents don't change + * (stack usage and variables) after possibly issuing the + * native_wbinvd() above. */ - asm volatile("wbinvd; hlt" : : : "memory"); + native_halt(); } } -- cgit v1.2.3 From f35764e74f0e45e1d89ca9ed9c8299f5e746a4d1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 15 Jan 2018 20:54:35 +0000 Subject: MIPS: Fix undefined reference to physical_memsize Since commit d41e6858ba58 ("MIPS: Kconfig: Set default MIPS system type as generic") switched the default platform to the "generic" platform, allmodconfig has been failing with the following linker error (among other errors): arch/mips/kernel/vpe-mt.o In function `vpe_run': (.text+0x59c): undefined reference to `physical_memsize' The Lantiq platform already worked around the same issue in commit 9050d50e2244 ("MIPS: lantiq: Set physical_memsize") by declaring physical_memsize with the initial value of 0 (on the assumption that the actual memory size will be hard-coded in the loaded VPE firmware), and the Malta platform already provided physical_memsize. Since all other platforms will fail to link with the VPE loader enabled, only allow Lantiq and Malta platforms to enable it, by way of a SYS_SUPPORTS_VPE_LOADER which is selected by those two platforms and which MIPS_VPE_LOADER depends on. SYS_SUPPORTS_MULTITHREADING is now a dependency of SYS_SUPPORTS_VPE_LOADER so that Kconfig emits a warning if SYS_SUPPORTS_VPE_LOADER is selected without SYS_SUPPORTS_MULTITHREADING. Fixes: d41e6858ba58 ("MIPS: Kconfig: Set default MIPS system type as generic") Signed-off-by: James Hogan Cc: Ralf Baechle Cc: John Crispin Cc: Hauke Mehrtens Cc: Paul Burton Cc: Matt Redfearn Cc: Guenter Roeck Cc: linux-mips@linux-mips.org Tested-by: Guenter Roeck Patchwork: https://patchwork.linux-mips.org/patch/18453/ --- arch/mips/Kconfig | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 659e0079487f..8e0b3702f1c0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -390,6 +390,7 @@ config LANTIQ select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_MIPS16 select SYS_SUPPORTS_MULTITHREADING + select SYS_SUPPORTS_VPE_LOADER select SYS_HAS_EARLY_PRINTK select GPIOLIB select SWAP_IO_SPACE @@ -517,6 +518,7 @@ config MIPS_MALTA select SYS_SUPPORTS_MIPS16 select SYS_SUPPORTS_MULTITHREADING select SYS_SUPPORTS_SMARTMIPS + select SYS_SUPPORTS_VPE_LOADER select SYS_SUPPORTS_ZBOOT select SYS_SUPPORTS_RELOCATABLE select USE_OF @@ -2282,9 +2284,16 @@ config MIPSR2_TO_R6_EMULATOR The only reason this is a build-time option is to save ~14K from the final kernel image. +config SYS_SUPPORTS_VPE_LOADER + bool + depends on SYS_SUPPORTS_MULTITHREADING + help + Indicates that the platform supports the VPE loader, and provides + physical_memsize. + config MIPS_VPE_LOADER bool "VPE loader support." - depends on SYS_SUPPORTS_MULTITHREADING && MODULES + depends on SYS_SUPPORTS_VPE_LOADER && MODULES select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI select MIPS_MT -- cgit v1.2.3 From 3214d01f139b7544e870fc0b7fcce8da13c1cb51 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jan 2018 16:06:47 +1100 Subject: KVM: PPC: Book3S: Provide information about hardware/firmware CVE workarounds This adds a new ioctl, KVM_PPC_GET_CPU_CHAR, that gives userspace information about the underlying machine's level of vulnerability to the recently announced vulnerabilities CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754, and whether the machine provides instructions to assist software to work around the vulnerabilities. The ioctl returns two u64 words describing characteristics of the CPU and required software behaviour respectively, plus two mask words which indicate which bits have been filled in by the kernel, for extensibility. The bit definitions are the same as for the new H_GET_CPU_CHARACTERISTICS hypercall. There is also a new capability, KVM_CAP_PPC_GET_CPU_CHAR, which indicates whether the new ioctl is available. Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 46 +++++++++++++ arch/powerpc/include/uapi/asm/kvm.h | 25 +++++++ arch/powerpc/kvm/powerpc.c | 131 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 3 + 4 files changed, 205 insertions(+) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 57d3ee9e4bde..fc3ae951bc07 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3403,6 +3403,52 @@ invalid, if invalid pages are written to (e.g. after the end of memory) or if no page table is present for the addresses (e.g. when using hugepages). +4.108 KVM_PPC_GET_CPU_CHAR + +Capability: KVM_CAP_PPC_GET_CPU_CHAR +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_cpu_char (out) +Returns: 0 on successful completion + -EFAULT if struct kvm_ppc_cpu_char cannot be written + +This ioctl gives userspace information about certain characteristics +of the CPU relating to speculative execution of instructions and +possible information leakage resulting from speculative execution (see +CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is +returned in struct kvm_ppc_cpu_char, which looks like this: + +struct kvm_ppc_cpu_char { + __u64 character; /* characteristics of the CPU */ + __u64 behaviour; /* recommended software behaviour */ + __u64 character_mask; /* valid bits in character */ + __u64 behaviour_mask; /* valid bits in behaviour */ +}; + +For extensibility, the character_mask and behaviour_mask fields +indicate which bits of character and behaviour have been filled in by +the kernel. If the set of defined bits is extended in future then +userspace will be able to tell whether it is running on a kernel that +knows about the new bits. + +The character field describes attributes of the CPU which can help +with preventing inadvertent information disclosure - specifically, +whether there is an instruction to flash-invalidate the L1 data cache +(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set +to a mode where entries can only be used by the thread that created +them, whether the bcctr[l] instruction prevents speculation, and +whether a speculation barrier instruction (ori 31,31,0) is provided. + +The behaviour field describes actions that software should take to +prevent inadvertent information disclosure, and thus describes which +vulnerabilities the hardware is subject to; specifically whether the +L1 data cache should be flushed when returning to user mode from the +kernel, and whether a speculation barrier should be placed between an +array bounds check and the array access. + +These fields use the same bit definitions as the new +H_GET_CPU_CHARACTERISTICS hypercall. + 5. The kvm_run structure ------------------------ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 61d6049f4c1e..637b7263cb86 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -443,6 +443,31 @@ struct kvm_ppc_rmmu_info { __u32 ap_encodings[8]; }; +/* For KVM_PPC_GET_CPU_CHAR */ +struct kvm_ppc_cpu_char { + __u64 character; /* characteristics of the CPU */ + __u64 behaviour; /* recommended software behaviour */ + __u64 character_mask; /* valid bits in character */ + __u64 behaviour_mask; /* valid bits in behaviour */ +}; + +/* + * Values for character and character_mask. + * These are identical to the values used by H_GET_CPU_CHARACTERISTICS. + */ +#define KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 (1ULL << 63) +#define KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED (1ULL << 62) +#define KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 (1ULL << 61) +#define KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 (1ULL << 60) +#define KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV (1ULL << 59) +#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58) +#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57) +#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56) + +#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63) +#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62) +#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61) + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1915e86cef6f..0a7c88786ec0 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,6 +39,10 @@ #include #include #include +#ifdef CONFIG_PPC_PSERIES +#include +#include +#endif #include "timing.h" #include "irq.h" @@ -548,6 +552,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_KVM_XICS case KVM_CAP_IRQ_XICS: #endif + case KVM_CAP_PPC_GET_CPU_CHAR: r = 1; break; @@ -1759,6 +1764,124 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return r; } +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * These functions check whether the underlying hardware is safe + * against attacks based on observing the effects of speculatively + * executed instructions, and whether it supplies instructions for + * use in workarounds. The information comes from firmware, either + * via the device tree on powernv platforms or from an hcall on + * pseries platforms. + */ +#ifdef CONFIG_PPC_PSERIES +static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + struct h_cpu_char_result c; + unsigned long rc; + + if (!machine_is(pseries)) + return -ENOTTY; + + rc = plpar_get_cpu_characteristics(&c); + if (rc == H_SUCCESS) { + cp->character = c.character; + cp->behaviour = c.behaviour; + cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | + KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | + KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | + KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | + KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | + KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | + KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | + KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + } + return 0; +} +#else +static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + return -ENOTTY; +} +#endif + +static inline bool have_fw_feat(struct device_node *fw_features, + const char *state, const char *name) +{ + struct device_node *np; + bool r = false; + + np = of_get_child_by_name(fw_features, name); + if (np) { + r = of_property_read_bool(np, state); + of_node_put(np); + } + return r; +} + +static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp) +{ + struct device_node *np, *fw_features; + int r; + + memset(cp, 0, sizeof(*cp)); + r = pseries_get_cpu_char(cp); + if (r != -ENOTTY) + return r; + + np = of_find_node_by_name(NULL, "ibm,opal"); + if (np) { + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + if (!fw_features) + return 0; + if (have_fw_feat(fw_features, "enabled", + "inst-spec-barrier-ori31,31,0")) + cp->character |= KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31; + if (have_fw_feat(fw_features, "enabled", + "fw-bcctrl-serialized")) + cp->character |= KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED; + if (have_fw_feat(fw_features, "enabled", + "inst-l1d-flush-ori30,30,0")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30; + if (have_fw_feat(fw_features, "enabled", + "inst-l1d-flush-trig2")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2; + if (have_fw_feat(fw_features, "enabled", + "fw-l1d-thread-split")) + cp->character |= KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV; + if (have_fw_feat(fw_features, "enabled", + "fw-count-cache-disabled")) + cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | + KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | + KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | + KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | + KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | + KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; + + if (have_fw_feat(fw_features, "enabled", + "speculation-policy-favor-security")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY; + if (!have_fw_feat(fw_features, "disabled", + "needs-l1d-flush-msr-pr-0-to-1")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR; + if (!have_fw_feat(fw_features, "disabled", + "needs-spec-barrier-for-bound-checks")) + cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | + KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | + KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; + + of_node_put(fw_features); + } + + return 0; +} +#endif + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1861,6 +1984,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_PPC_GET_CPU_CHAR: { + struct kvm_ppc_cpu_char cpuchar; + + r = kvmppc_get_cpu_char(&cpuchar); + if (r >= 0 && copy_to_user(argp, &cpuchar, sizeof(cpuchar))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 496e59a2738b..7a99b98cf88e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_SYNIC2 148 #define KVM_CAP_HYPERV_VP_INDEX 149 #define KVM_CAP_S390_AIS_MIGRATION 150 +#define KVM_CAP_PPC_GET_CPU_CHAR 151 #ifdef KVM_CAP_IRQ_ROUTING @@ -1261,6 +1262,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) +/* Available with KVM_CAP_PPC_GET_CPU_CHAR */ +#define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) -- cgit v1.2.3 From a912a7584ec39647fb032c1001eb69746f27b1d3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Jan 2018 19:34:08 +0200 Subject: x86/platform/intel-mid: Move PCI initialization to arch_init() ACPI redefines x86_init.pci.init when enabled. Though we still need special treatment for MID platforms. Move our specific callback to x86_init.pci.arch_init() and, by calling acpi_noirq_set(), take back a control over IRQ assignment. Signed-off-by: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: linux-acpi@vger.kernel.org Link: http://lkml.kernel.org/r/20180117173409.88136-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/pci/intel_mid_pci.c | 1 + arch/x86/platform/intel-mid/intel-mid.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 511921045312..43867bc85368 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -300,6 +300,7 @@ int __init intel_mid_pci_init(void) pci_root_ops = intel_mid_pci_ops; pci_soc_mode = 1; /* Continue with standard init */ + acpi_noirq_set(); return 1; } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 86676cec99a1..2c67bae6bb53 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -194,7 +194,7 @@ void __init x86_intel_mid_early_setup(void) x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; - x86_init.pci.init = intel_mid_pci_init; + x86_init.pci.arch_init = intel_mid_pci_init; x86_init.pci.fixup_irqs = x86_init_noop; legacy_pic = &null_legacy_pic; -- cgit v1.2.3 From c13e7f313da33d1488355440f1a10feb1897480a Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 19 Jan 2018 14:32:08 +0100 Subject: ARM: sunxi_defconfig: Enable CMA The DRM driver most notably, but also out of tree drivers (for now) like the VPU or GPU drivers, are quite big consumers of large, contiguous memory buffers. However, the sunxi_defconfig doesn't enable CMA in order to mitigate that, which makes them almost unusable. Enable it to make sure it somewhat works. Cc: Signed-off-by: Maxime Ripard Signed-off-by: Arnd Bergmann --- arch/arm/configs/sunxi_defconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig index 5caaf971fb50..df433abfcb02 100644 --- a/arch/arm/configs/sunxi_defconfig +++ b/arch/arm/configs/sunxi_defconfig @@ -10,6 +10,7 @@ CONFIG_SMP=y CONFIG_NR_CPUS=8 CONFIG_AEABI=y CONFIG_HIGHMEM=y +CONFIG_CMA=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y CONFIG_CPU_FREQ=y @@ -33,6 +34,7 @@ CONFIG_CAN_SUN4I=y # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y +CONFIG_DMA_CMA=y CONFIG_BLK_DEV_SD=y CONFIG_ATA=y CONFIG_AHCI_SUNXI=y -- cgit v1.2.3 From 6f41c34d69eb005e7848716bbcafc979b35037d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Jan 2018 16:28:26 +0100 Subject: x86/mce: Make machine check speculation protected The machine check idtentry uses an indirect branch directly from the low level code. This evades the speculation protection. Replace it by a direct call into C code and issue the indirect call there so the compiler can apply the proper speculation protection. Signed-off-by: Thomas Gleixner Reviewed-by:Borislav Petkov Reviewed-by: David Woodhouse Niced-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/traps.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d54a0ede61d1..63f4320602a3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1258,7 +1258,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 31051f35cbb7..3de69330e6c5 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -88,6 +88,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif +dotraplinkage void do_mce(struct pt_regs *, long); static inline int get_si_code(unsigned long condition) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3b413065c613..a9e898b71208 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1788,6 +1788,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: -- cgit v1.2.3 From 736e80a4213e9bbce40a7c050337047128b472ac Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:21 +0900 Subject: retpoline: Introduce start/end markers of indirect thunk Introduce start/end markers of __x86_indirect_thunk_* functions. To make it easy, consolidate .text.__x86.indirect_thunk.* sections to one .text.__x86.indirect_thunk section and put it in the end of kernel text section and adds __indirect_thunk_start/end so that other subsystem (e.g. kprobes) can identify it. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/151629206178.10241.6828804696410044771.stgit@devbox --- arch/x86/include/asm/nospec-branch.h | 3 +++ arch/x86/kernel/vmlinux.lds.S | 6 ++++++ arch/x86/lib/retpoline.S | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7b45d8424150..19ba5ad19c65 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -194,6 +194,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +extern char __indirect_thunk_start[]; +extern char __indirect_thunk_end[]; + /* * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1e413a9326aa..9b138a06c1a4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -124,6 +124,12 @@ SECTIONS ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); #endif +#ifdef CONFIG_RETPOLINE + __indirect_thunk_start = .; + *(.text.__x86.indirect_thunk) + __indirect_thunk_end = .; +#endif + /* End of text section */ _etext = .; } :text = 0x9090 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index cb45c6cb465f..d3415dc30f82 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -9,7 +9,7 @@ #include .macro THUNK reg - .section .text.__x86.indirect_thunk.\reg + .section .text.__x86.indirect_thunk ENTRY(__x86_indirect_thunk_\reg) CFI_STARTPROC -- cgit v1.2.3 From c1804a236894ecc942da7dc6c5abe209e56cba93 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:51 +0900 Subject: kprobes/x86: Blacklist indirect thunk functions for kprobes Mark __x86_indirect_thunk_* functions as blacklist for kprobes because those functions can be called from anywhere in the kernel including blacklist functions of kprobes. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/151629209111.10241.5444852823378068683.stgit@devbox --- arch/x86/lib/retpoline.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index d3415dc30f82..dfb2ba91b670 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -25,7 +25,8 @@ ENDPROC(__x86_indirect_thunk_\reg) * than one per register with the correct names. So we do it * the simple and nasty way... */ -#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) #define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) GENERATE_THUNK(_ASM_AX) -- cgit v1.2.3 From c86a32c09f8ced67971a2310e3b0dda4d1749007 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:15:20 +0900 Subject: kprobes/x86: Disable optimizing on the function jumps to indirect thunk Since indirect jump instructions will be replaced by jump to __x86_indirect_thunk_*, those jmp instruction must be treated as an indirect jump. Since optprobe prohibits to optimize probes in the function which uses an indirect jump, it also needs to find out the function which jump to __x86_indirect_thunk_* and disable optimization. Add a check that the jump target address is between the __indirect_thunk_start/end when optimizing kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/151629212062.10241.6991266100233002273.stgit@devbox --- arch/x86/kernel/kprobes/opt.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4f98aad38237..3668f28cf5fc 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "common.h" @@ -205,7 +206,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether insn is indirect jump */ -static int insn_is_indirect_jump(struct insn *insn) +static int __insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -239,6 +240,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) return (start <= target && target <= start + len); } +static int insn_is_indirect_jump(struct insn *insn) +{ + int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE + /* + * Jump to x86_indirect_thunk_* is treated as an indirect jump. + * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with + * older gcc may use indirect jump. So we add this check instead of + * replace indirect-jump check. + */ + if (!ret) + ret = insn_jump_into_range(insn, + (unsigned long)__indirect_thunk_start, + (unsigned long)__indirect_thunk_end - + (unsigned long)__indirect_thunk_start); +#endif + return ret; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { -- cgit v1.2.3 From 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Jan 2018 14:53:28 -0800 Subject: x86/retpoline: Optimize inline assembler for vmexit_fill_RSB The generated assembler for the C fill RSB inline asm operations has several issues: - The C code sets up the loop register, which is then immediately overwritten in __FILL_RETURN_BUFFER with the same value again. - The C code also passes in the iteration count in another register, which is not used at all. Remove these two unnecessary operations. Just rely on the single constant passed to the macro for the iterations. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: dave.hansen@intel.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180117225328.15414-1-andi@firstfloor.org --- arch/x86/include/asm/nospec-branch.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 19ba5ad19c65..4ad41087ce0e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -206,16 +206,17 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops = RSB_CLEAR_LOOPS / 2; + unsigned long loops; asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE ALTERNATIVE("jmp 910f", __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), X86_FEATURE_RETPOLINE) "910:" - : "=&r" (loops), ASM_CALL_CONSTRAINT - : "r" (loops) : "memory" ); + : "=r" (loops), ASM_CALL_CONSTRAINT + : : "memory" ); #endif } + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From 4b664e739f7743f91e1d12ebfb7a76307ebea702 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 18 Jan 2018 13:52:17 -0800 Subject: ia64: Rewrite atomic_add and atomic_sub Force __builtin_constant_p to evaluate whether the argument to atomic_add & atomic_sub is constant in the front-end before optimisations which can lead GCC to output a call to __bad_increment_for_ia64_fetch_and_add(). See GCC bugzilla 83653. Signed-off-by: Jakub Jelinek Signed-off-by: Matthew Wilcox Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/atomic.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 28e02c99be6d..762eeb0fcc1d 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -65,29 +65,30 @@ ia64_atomic_fetch_##op (int i, atomic_t *v) \ ATOMIC_OPS(add, +) ATOMIC_OPS(sub, -) -#define atomic_add_return(i,v) \ +#ifdef __OPTIMIZE__ +#define __ia64_atomic_const(i) __builtin_constant_p(i) ? \ + ((i) == 1 || (i) == 4 || (i) == 8 || (i) == 16 || \ + (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0 + +#define atomic_add_return(i, v) \ ({ \ - int __ia64_aar_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \ - || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \ - || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \ - || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \ - ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \ - : ia64_atomic_add(__ia64_aar_i, v); \ + int __i = (i); \ + static const int __ia64_atomic_p = __ia64_atomic_const(i); \ + __ia64_atomic_p ? ia64_fetch_and_add(__i, &(v)->counter) : \ + ia64_atomic_add(__i, v); \ }) -#define atomic_sub_return(i,v) \ +#define atomic_sub_return(i, v) \ ({ \ - int __ia64_asr_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \ - || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \ - || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \ - || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \ - ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \ - : ia64_atomic_sub(__ia64_asr_i, v); \ + int __i = (i); \ + static const int __ia64_atomic_p = __ia64_atomic_const(i); \ + __ia64_atomic_p ? ia64_fetch_and_add(-__i, &(v)->counter) : \ + ia64_atomic_sub(__i, v); \ }) +#else +#define atomic_add_return(i, v) ia64_atomic_add(i, v) +#define atomic_sub_return(i, v) ia64_atomic_sub(i, v) +#endif #define atomic_fetch_add(i,v) \ ({ \ -- cgit v1.2.3 From 11f19ec025dd421c54978c69e42d86758fa310de Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Jan 2018 11:06:17 +0100 Subject: x86/jailhouse: Set X86_FEATURE_TSC_KNOWN_FREQ Otherwise, Linux will not recognize precalibrated_tsc_khz and disable the tsc as clocksource. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: Jailhouse Link: https://lkml.kernel.org/r/975fbfc9-2a64-cc56-40d5-164992ec3916@siemens.com --- arch/x86/kernel/jailhouse.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 7ade152133c7..2b7ebbe9043d 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -176,6 +176,7 @@ static void __init jailhouse_init_platform(void) pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); precalibrated_tsc_khz = setup_data.tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); pci_probe = 0; -- cgit v1.2.3 From 3b42349d56c96e144401d2317d8eeb9937511423 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Jan 2018 11:06:30 +0100 Subject: x86/jailhouse: Respect pci=lastbus command line settings Limiting the scan width to the known last bus via the command line can accelerate the boot noteworthy. Signed-off-by: Jan Kiszka Signed-off-by: Thomas Gleixner Cc: Jailhouse Link: https://lkml.kernel.org/r/51f5fe62-ca8f-9286-5cdb-39df3fad78b4@siemens.com --- arch/x86/kernel/jailhouse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 2b7ebbe9043d..b68fd895235a 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -119,8 +119,10 @@ static int __init jailhouse_pci_arch_init(void) /* * There are no bridges on the virtual PCI root bus under Jailhouse, * thus no other way to discover all devices than a full scan. + * Respect any overrides via the command line, though. */ - pcibios_last_bus = 0xff; + if (pcibios_last_bus < 0) + pcibios_last_bus = 0xff; return 0; } -- cgit v1.2.3 From 35b3fde6203b932b2b1a5b53b3d8808abc9c4f60 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 17 Jan 2018 14:44:34 +0100 Subject: KVM: s390: wire up bpb feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new firmware interfaces for branch prediction behaviour changes are transparently available for the guest. Nevertheless, there is new state attached that should be migrated and properly resetted. Provide a mechanism for handling reset, migration and VSIE. Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck [Changed capability number to 152. - Radim] Signed-off-by: Radim Krčmář --- arch/s390/include/asm/kvm_host.h | 3 ++- arch/s390/include/uapi/asm/kvm.h | 5 ++++- arch/s390/kvm/kvm-s390.c | 12 ++++++++++++ arch/s390/kvm/vsie.c | 10 ++++++++++ include/uapi/linux/kvm.h | 1 + 5 files changed, 29 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e14f381757f6..c1b0a9ac1dc8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -207,7 +207,8 @@ struct kvm_s390_sie_block { __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ - __u8 reserved60; /* 0x0060 */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 #define ECB_SRSI 0x04 diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 38535a57fef8..4cdaa55fabfe 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -224,6 +224,7 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_RICCB (1UL << 7) #define KVM_SYNC_FPRS (1UL << 8) #define KVM_SYNC_GSCB (1UL << 9) +#define KVM_SYNC_BPBC (1UL << 10) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) @@ -247,7 +248,9 @@ struct kvm_sync_regs { }; __u8 reserved[512]; /* for future vector expansion */ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ - __u8 padding1[52]; /* riccb needs to be 64byte aligned */ + __u8 bpbc : 1; /* bp mode */ + __u8 reserved2 : 7; + __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ union { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2c93cbbcd15e..2598cf243b86 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -421,6 +421,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_GS: r = test_facility(133); break; + case KVM_CAP_S390_BPB: + r = test_facility(82); + break; default: r = 0; } @@ -2198,6 +2201,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; if (test_kvm_facility(vcpu->kvm, 133)) vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With @@ -2339,6 +2344,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) current->thread.fpu.fpc = 0; vcpu->arch.sie_block->gbea = 1; vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) @@ -3298,6 +3304,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; vcpu->arch.gs_enabled = 1; } + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) && + test_kvm_facility(vcpu->kvm, 82)) { + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; + } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); /* save host (userspace) fprs/vrs */ @@ -3344,6 +3355,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 5d6ae0326d9e..751348348477 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -223,6 +223,12 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy(scb_o->gcr, scb_s->gcr, 128); scb_o->pp = scb_s->pp; + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) { + scb_o->fpf &= ~FPF_BPBC; + scb_o->fpf |= scb_s->fpf & FPF_BPBC; + } + /* interrupt intercept */ switch (scb_s->icptcode) { case ICPT_PROGI: @@ -265,6 +271,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecb3 = 0; scb_s->ecd = 0; scb_s->fac = 0; + scb_s->fpf = 0; rc = prepare_cpuflags(vcpu, vsie_page); if (rc) @@ -324,6 +331,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= scb_o->ecb & ECB_TE; } + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) + scb_s->fpf |= scb_o->fpf & FPF_BPBC; /* SIMD */ if (test_kvm_facility(vcpu->kvm, 129)) { scb_s->eca |= scb_o->eca & ECA_VX; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7a99b98cf88e..8fb90a0819c3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -933,6 +933,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_HYPERV_VP_INDEX 149 #define KVM_CAP_S390_AIS_MIGRATION 150 #define KVM_CAP_PPC_GET_CPU_CHAR 151 +#define KVM_CAP_S390_BPB 152 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 86be89939d11a84800f66e2a283b915b704bf33d Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 16 Jan 2018 11:52:59 +0000 Subject: alpha/PCI: Fix noname IRQ level detection The conversion of the alpha architecture PCI host bridge legacy IRQ mapping/swizzling to the new PCI host bridge map/swizzle hooks carried out through: commit 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") implies that IRQ for devices are now allocated through pci_assign_irq() function in pci_device_probe() that is called when a driver matching a device is found in order to probe the device through the device driver. Alpha noname platforms required IRQ level programming to be executed in sio_fixup_irq_levels(), that is called in noname_init_pci(), a platform hook called within a subsys_initcall. In noname_init_pci(), present IRQs are detected through sio_collect_irq_levels() that check the struct pci_dev->irq number to detect if an IRQ has been allocated for the device. By the time sio_collect_irq_levels() is called, some devices may still have not a matching driver loaded to match them (eg loadable module) therefore their IRQ allocation is still pending - which means that sio_collect_irq_levels() does not programme the correct IRQ level for those devices, causing their IRQ handling to be broken when the device driver is actually loaded and the device is probed. Fix the issue by adding code in the noname map_irq() function (noname_map_irq()) that, whilst mapping/swizzling the IRQ line, it also ensures that the correct IRQ level programming is executed at platform level, fixing the issue. Fixes: 0e4c2eeb758a ("alpha/PCI: Replace pci_fixup_irqs() call with host bridge IRQ mapping hooks") Reported-by: Mikulas Patocka Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # 4.14 Cc: Bjorn Helgaas Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Mikulas Patocka Cc: Meelis Roos Signed-off-by: Matt Turner --- arch/alpha/kernel/sys_sio.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index 37bd6d9b8eb9..a6bdc1da47ad 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -102,6 +102,15 @@ sio_pci_route(void) alpha_mv.sys.sio.route_tab); } +static bool sio_pci_dev_irq_needs_level(const struct pci_dev *dev) +{ + if ((dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) && + (dev->class >> 8 != PCI_CLASS_BRIDGE_PCMCIA)) + return false; + + return true; +} + static unsigned int __init sio_collect_irq_levels(void) { @@ -110,8 +119,7 @@ sio_collect_irq_levels(void) /* Iterate through the devices, collecting IRQ levels. */ for_each_pci_dev(dev) { - if ((dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) && - (dev->class >> 8 != PCI_CLASS_BRIDGE_PCMCIA)) + if (!sio_pci_dev_irq_needs_level(dev)) continue; if (dev->irq) @@ -120,8 +128,7 @@ sio_collect_irq_levels(void) return level_bits; } -static void __init -sio_fixup_irq_levels(unsigned int level_bits) +static void __sio_fixup_irq_levels(unsigned int level_bits, bool reset) { unsigned int old_level_bits; @@ -139,12 +146,21 @@ sio_fixup_irq_levels(unsigned int level_bits) */ old_level_bits = inb(0x4d0) | (inb(0x4d1) << 8); - level_bits |= (old_level_bits & 0x71ff); + if (reset) + old_level_bits &= 0x71ff; + + level_bits |= old_level_bits; outb((level_bits >> 0) & 0xff, 0x4d0); outb((level_bits >> 8) & 0xff, 0x4d1); } +static inline void +sio_fixup_irq_levels(unsigned int level_bits) +{ + __sio_fixup_irq_levels(level_bits, true); +} + static inline int noname_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) { @@ -181,7 +197,14 @@ noname_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) const long min_idsel = 6, max_idsel = 14, irqs_per_slot = 5; int irq = COMMON_TABLE_LOOKUP, tmp; tmp = __kernel_extbl(alpha_mv.sys.sio.route_tab, irq); - return irq >= 0 ? tmp : -1; + + irq = irq >= 0 ? tmp : -1; + + /* Fixup IRQ level if an actual IRQ mapping is detected */ + if (sio_pci_dev_irq_needs_level(dev) && irq >= 0) + __sio_fixup_irq_levels(1 << irq, false); + + return irq; } static inline int -- cgit v1.2.3 From 91cfc88c66bf8ab95937606569670cf67fa73e09 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Sat, 20 Jan 2018 17:14:02 -0800 Subject: x86: Use __nostackprotect for sme_encrypt_kernel Commit bacf6b499e11 ("x86/mm: Use a struct to reduce parameters for SME PGD mapping") moved some parameters into a structure. The structure was large enough to trigger the stack protection canary in sme_encrypt_kernel which doesn't work this early, causing reboots. Mark sme_encrypt_kernel appropriately to not use the canary. Fixes: bacf6b499e11 ("x86/mm: Use a struct to reduce parameters for SME PGD mapping") Signed-off-by: Laura Abbott Cc: Tom Lendacky Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3ef362f598e3..e1d61e8500f9 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -738,7 +738,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return total; } -void __init sme_encrypt_kernel(struct boot_params *bp) +void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; -- cgit v1.2.3 From e2ac83d74a4d753cea88407e65136c84a0cb60b2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 22 Jan 2018 22:07:46 -0600 Subject: x86/ftrace: Fix ORC unwinding from ftrace handlers Steven Rostedt discovered that the ftrace stack tracer is broken when it's used with the ORC unwinder. The problem is that objtool is instructed by the Makefile to ignore the ftrace_64.S code, so it doesn't generate any ORC data for it. Fix it by making the asm code objtool-friendly: - Objtool doesn't like the fact that save_mcount_regs pushes RBP at the beginning, but it's never restored (directly, at least). So just skip the original RBP push, which is only needed for frame pointers anyway. - Annotate some functions as normal callable functions with ENTRY/ENDPROC. - Add an empty unwind hint to return_to_handler(). The return address isn't on the stack, so there's nothing ORC can do there. It will just punt in the unlikely case it tries to unwind from that code. With all that fixed, remove the OBJECT_FILES_NON_STANDARD Makefile annotation so objtool can read the file. Link: http://lkml.kernel.org/r/20180123040746.ih4ep3tk4pbjvg7c@treble Reported-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/Makefile | 5 ++++- arch/x86/kernel/ftrace_64.S | 24 +++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 81bb565f4497..7e2baf7304ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +endif + # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7cb8ba08beb9..ef61f540cf0a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -8,6 +8,7 @@ #include #include #include +#include .code64 .section .entry.text, "ax" @@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__) EXPORT_SYMBOL(mcount) #endif -/* All cases save the original rbp (8 bytes) */ #ifdef CONFIG_FRAME_POINTER # ifdef CC_USING_FENTRY /* Save parent and function stack frames (rip and rbp) */ @@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount) # endif #else /* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE 8 +# define MCOUNT_FRAME_SIZE 0 #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ @@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount) */ .macro save_mcount_regs added=0 - /* Always save the original rbp */ +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ pushq %rbp -#ifdef CONFIG_FRAME_POINTER /* * Stack traces will stop at the ftrace trampoline if the frame pointer * is not set up properly. If fentry is used, we need to save a frame @@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount) * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. */ +#ifdef CONFIG_FRAME_POINTER movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif movq %rdx, RBP(%rsp) /* Copy the parent address into %rsi (second parameter) */ @@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount) ENTRY(function_hook) retq -END(function_hook) +ENDPROC(function_hook) ENTRY(ftrace_caller) /* save_mcount_regs fills in first two parameters */ @@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call) /* This is weak to keep gas from relaxing the jumps */ WEAK(ftrace_stub) retq -END(ftrace_caller) +ENDPROC(ftrace_caller) ENTRY(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end) jmp ftrace_epilogue -END(ftrace_regs_caller) +ENDPROC(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -END(ftrace_graph_caller) +ENDPROC(ftrace_graph_caller) -GLOBAL(return_to_handler) +ENTRY(return_to_handler) + UNWIND_HINT_EMPTY subq $24, %rsp /* Save the return values */ @@ -330,4 +335,5 @@ GLOBAL(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi +END(return_to_handler) #endif -- cgit v1.2.3 From 6be7fa3c74d1e0cd50f2157b5c1524f152bf641e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Jan 2018 22:32:51 -0500 Subject: ftrace, orc, x86: Handle ftrace dynamically allocated trampolines The function tracer can create a dynamically allocated trampoline that is called by the function mcount or fentry hook that is used to call the function callback that is registered. The problem is that the orc undwinder will bail if it encounters one of these trampolines. This breaks the stack trace of function callbacks, which include the stack tracer and setting the stack trace for individual functions. Since these dynamic trampolines are basically copies of the static ftrace trampolines defined in ftrace_*.S, we do not need to create new orc entries for the dynamic trampolines. Finding the return address on the stack will be identical as the functions that were copied to create the dynamic trampolines. When encountering a ftrace dynamic trampoline, we can just use the orc entry of the ftrace static function that was copied for that trampoline. Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/unwind_orc.c | 48 +++++++++++++++++++++++++++++++++++++++++++- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 29 +++++++++++++++----------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..1f9188f5357c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the undwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entrie, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip) __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2bab81951ced..3319df9727aa 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -332,6 +332,8 @@ extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr); + bool is_ftrace_trampoline(unsigned long addr); /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { -- cgit v1.2.3 From 1df37383a8aeabb9b418698f0bcdffea01f4b1b2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 22 Jan 2018 17:09:34 -0500 Subject: x86/retpoline: Remove the esp/rsp thunk It doesn't make sense to have an indirect call thunk with esp/rsp as retpoline code won't work correctly with the stack pointer register. Removing it will help compiler writers to catch error in case such a thunk call is emitted incorrectly. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Suggested-by: Jeff Law Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Tom Lendacky Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com --- arch/x86/include/asm/asm-prototypes.h | 1 - arch/x86/lib/retpoline.S | 1 - 2 files changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 0927cdc4f946..1908214b9125 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -38,5 +38,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -INDIRECT_THUNK(sp) #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index dfb2ba91b670..c909961e678a 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -36,7 +36,6 @@ GENERATE_THUNK(_ASM_DX) GENERATE_THUNK(_ASM_SI) GENERATE_THUNK(_ASM_DI) GENERATE_THUNK(_ASM_BP) -GENERATE_THUNK(_ASM_SP) #ifdef CONFIG_64BIT GENERATE_THUNK(r8) GENERATE_THUNK(r9) -- cgit v1.2.3 From c5baa1be8f559d5f33c412d00cc1c86762a8bbbf Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 17 Jan 2018 14:26:47 +0000 Subject: irqdomain: Kill CONFIG_IRQ_DOMAIN_DEBUG CONFIG_IRQ_DOMAIN_DEBUG is similar to CONFIG_GENERIC_IRQ_DEBUGFS, just with less information. Spring cleanup time. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Yang Shunyong Link: https://lkml.kernel.org/r/20180117142647.23622-1-marc.zyngier@arm.com --- Documentation/IRQ-domain.txt | 36 +-------- arch/arm/configs/aspeed_g4_defconfig | 1 - arch/arm/configs/aspeed_g5_defconfig | 1 - arch/arm/configs/hisi_defconfig | 1 - arch/arm/configs/multi_v7_defconfig | 1 - arch/arm/configs/mvebu_v7_defconfig | 1 - arch/arm/configs/pxa_defconfig | 1 - arch/arm/configs/sama5_defconfig | 1 - arch/arm/configs/tegra_defconfig | 1 - arch/arm/configs/vt8500_v6_v7_defconfig | 1 - arch/powerpc/configs/fsl-emb-nonhw.config | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/xtensa/configs/audio_kc705_defconfig | 1 - arch/xtensa/configs/cadence_csp_defconfig | 1 - arch/xtensa/configs/generic_kc705_defconfig | 1 - arch/xtensa/configs/nommu_kc705_defconfig | 1 - arch/xtensa/configs/smp_lx200_defconfig | 1 - kernel/irq/Kconfig | 10 --- kernel/irq/irqdomain.c | 118 ---------------------------- 21 files changed, 2 insertions(+), 180 deletions(-) (limited to 'arch') diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt index 4a1cd7645d85..507775cce753 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/IRQ-domain.txt @@ -265,37 +265,5 @@ support other architectures, such as ARM, ARM64 etc. === Debugging === -If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on -CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in -your debugfs mount point, called irq_domain_mapping. This file -contains a live snapshot of all the IRQ domains in the system: - - name mapped linear-max direct-max devtree-node - pl061 8 8 0 /smb/gpio@e0080000 - pl061 8 8 0 /smb/gpio@e1050000 - pMSI 0 0 0 /interrupt-controller@e1101000/v2m@e0080000 - MSI 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 - GICv2m 37 0 0 /interrupt-controller@e1101000/v2m@e0080000 - GICv2 448 448 0 /interrupt-controller@e1101000 - -it also iterates over the interrupts to display their mapping in the -domains, and makes the domain stacking visible: - - -irq hwirq chip name chip data active type domain - 1 0x00019 GICv2 0xffff00000916bfd8 * LINEAR GICv2 - 2 0x0001d GICv2 0xffff00000916bfd8 LINEAR GICv2 - 3 0x0001e GICv2 0xffff00000916bfd8 * LINEAR GICv2 - 4 0x0001b GICv2 0xffff00000916bfd8 * LINEAR GICv2 - 5 0x0001a GICv2 0xffff00000916bfd8 LINEAR GICv2 -[...] - 96 0x81808 MSI 0x (null) RADIX MSI - 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m - 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2 - 97 0x08800 MSI 0x (null) * RADIX MSI - 97+ 0x00064 GICv2m 0xffff8003ee116980 * RADIX GICv2m - 97+ 0x00064 GICv2 0xffff00000916bfd8 * LINEAR GICv2 - -Here, interrupts 1-5 are only using a single domain, while 96 and 97 -are build out of a stack of three domain, each level performing a -particular function. +Most of the internals of the IRQ subsystem are exposed in debugfs by +turning CONFIG_GENERIC_IRQ_DEBUGFS on. diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig index d23b9d56a88b..95946dee9c77 100644 --- a/arch/arm/configs/aspeed_g4_defconfig +++ b/arch/arm/configs/aspeed_g4_defconfig @@ -1,7 +1,6 @@ CONFIG_KERNEL_XZ=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index c0ad7b82086b..8c7ea033cdc2 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -1,7 +1,6 @@ CONFIG_KERNEL_XZ=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 diff --git a/arch/arm/configs/hisi_defconfig b/arch/arm/configs/hisi_defconfig index b2e340b272ee..74d611e41e02 100644 --- a/arch/arm/configs/hisi_defconfig +++ b/arch/arm/configs/hisi_defconfig @@ -1,4 +1,3 @@ -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 61509c4b769f..b659244902cd 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_CGROUPS=y diff --git a/arch/arm/configs/mvebu_v7_defconfig b/arch/arm/configs/mvebu_v7_defconfig index 69553704f2dc..ee61be093633 100644 --- a/arch/arm/configs/mvebu_v7_defconfig +++ b/arch/arm/configs/mvebu_v7_defconfig @@ -1,6 +1,5 @@ CONFIG_SYSVIPC=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index 830e817a028a..837d0c9c8b0e 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BSD_PROCESS_ACCT=y diff --git a/arch/arm/configs/sama5_defconfig b/arch/arm/configs/sama5_defconfig index 6529cb43e0fd..2080025556b5 100644 --- a/arch/arm/configs/sama5_defconfig +++ b/arch/arm/configs/sama5_defconfig @@ -2,7 +2,6 @@ # CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 diff --git a/arch/arm/configs/tegra_defconfig b/arch/arm/configs/tegra_defconfig index 6678f2929356..c819be04187e 100644 --- a/arch/arm/configs/tegra_defconfig +++ b/arch/arm/configs/tegra_defconfig @@ -1,5 +1,4 @@ CONFIG_SYSVIPC=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y diff --git a/arch/arm/configs/vt8500_v6_v7_defconfig b/arch/arm/configs/vt8500_v6_v7_defconfig index 1bfaa7bfc392..9b85326ba287 100644 --- a/arch/arm/configs/vt8500_v6_v7_defconfig +++ b/arch/arm/configs/vt8500_v6_v7_defconfig @@ -1,4 +1,3 @@ -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config index cc49c95494da..e0567dc41968 100644 --- a/arch/powerpc/configs/fsl-emb-nonhw.config +++ b/arch/powerpc/configs/fsl-emb-nonhw.config @@ -71,7 +71,6 @@ CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y CONFIG_IP_SCTP=m CONFIG_IPV6=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_ISO9660_FS=m CONFIG_JFFS2_FS_DEBUG=1 CONFIG_JFFS2_FS=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 4891bbed6258..73dab7a37386 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -4,7 +4,6 @@ CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_TASKSTATS=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 6ddca80c52c3..5033e630afea 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -1,7 +1,6 @@ CONFIG_PPC64=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_TASKSTATS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index bde2cd1005a2..0dd5cf7b566d 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -3,7 +3,6 @@ CONFIG_NR_CPUS=2048 CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_TASKSTATS=y diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig index 8d16925765cb..2bf964df37ba 100644 --- a/arch/xtensa/configs/audio_kc705_defconfig +++ b/arch/xtensa/configs/audio_kc705_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IRQ_TIME_ACCOUNTING=y diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig index f2d3094aa1d1..3221b7053fa3 100644 --- a/arch/xtensa/configs/cadence_csp_defconfig +++ b/arch/xtensa/configs/cadence_csp_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_USELIB=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IRQ_TIME_ACCOUNTING=y diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig index 744adeaf2945..985fa8546e4e 100644 --- a/arch/xtensa/configs/generic_kc705_defconfig +++ b/arch/xtensa/configs/generic_kc705_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IRQ_TIME_ACCOUNTING=y diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index 78c2529d0459..624f9b3a3878 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IRQ_TIME_ACCOUNTING=y diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig index 14e3ca353ac8..11fed6c06a7c 100644 --- a/arch/xtensa/configs/smp_lx200_defconfig +++ b/arch/xtensa/configs/smp_lx200_defconfig @@ -1,7 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_FHANDLE=y -CONFIG_IRQ_DOMAIN_DEBUG=y CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IRQ_TIME_ACCOUNTING=y diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 89e355866450..6fc87ccda1d7 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool -config IRQ_DOMAIN_DEBUG - bool "Expose hardware/virtual IRQ mapping via debugfs" - depends on IRQ_DOMAIN && DEBUG_FS - help - This option will show the mapping relationship between hardware irq - numbers and Linux irq numbers. The mapping is exposed via debugfs - in the file "irq_domain_mapping". - - If you don't know what this means you don't need it. - # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 62068ad46930..e6a9c36470ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_find_mapping); -#ifdef CONFIG_IRQ_DOMAIN_DEBUG -static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) -{ - struct irq_domain *domain; - struct irq_data *data; - - domain = desc->irq_data.domain; - data = &desc->irq_data; - - while (domain) { - unsigned int irq = data->irq; - unsigned long hwirq = data->hwirq; - struct irq_chip *chip; - bool direct; - - if (data == &desc->irq_data) - seq_printf(m, "%5d ", irq); - else - seq_printf(m, "%5d+ ", irq); - seq_printf(m, "0x%05lx ", hwirq); - - chip = irq_data_get_irq_chip(data); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", domain->name); -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - domain = domain->parent; - data = data->parent_data; -#else - domain = NULL; -#endif - } -} - -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - struct irq_domain *domain; - struct radix_tree_iter iter; - void __rcu **slot; - int i; - - seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", - "name", "mapped", "linear-max", "direct-max", "devtree-node"); - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - struct device_node *of_node; - const char *name; - - int count = 0; - - of_node = irq_domain_get_of_node(domain); - if (of_node) - name = of_node_full_name(of_node); - else if (is_fwnode_irqchip(domain->fwnode)) - name = container_of(domain->fwnode, struct irqchip_fwid, - fwnode)->name; - else - name = ""; - - radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) - count++; - seq_printf(m, "%c%-16s %6u %10u %10u %s\n", - domain == irq_default_domain ? '*' : ' ', domain->name, - domain->revmap_size + count, domain->revmap_size, - domain->revmap_direct_max_irq, - name); - } - mutex_unlock(&irq_domain_mutex); - - seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", - "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "active", "type", "domain"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - virq_debug_show_one(m, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * -- cgit v1.2.3 From 40d4071ce2d20840d224b4a77b5dc6f752c9ab15 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Mon, 22 Jan 2018 14:12:52 +0800 Subject: perf/x86/amd/power: Do not load AMD power module on !AMD platforms The AMD power module can be loaded on non AMD platforms, but unload fails with the following Oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: __list_del_entry_valid+0x29/0x90 Call Trace: perf_pmu_unregister+0x25/0xf0 amd_power_pmu_exit+0x1c/0xd23 [power] SyS_delete_module+0x1a8/0x2b0 ? exit_to_usermode_loop+0x8f/0xb0 entry_SYSCALL_64_fastpath+0x20/0x83 Return -ENODEV instead of 0 from the module init function if the CPU does not match. Fixes: c7ab62bfbe0e ("perf/x86/amd/power: Add AMD accumulated power reporting mechanism") Signed-off-by: Xiao Liang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180122061252.6394-1-xiliang@redhat.com --- arch/x86/events/amd/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index a6eee5ac4f58..2aefacf5c5b2 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -277,7 +277,7 @@ static int __init amd_power_pmu_init(void) int ret; if (!x86_match_cpu(cpu_match)) - return 0; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) return -ENODEV; -- cgit v1.2.3 From 7e702d17ed138cf4ae7c00e8c00681ed464587c7 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: x86/microcode/intel: Extend BDW late-loading further with LLC size check Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d9e460fc7a3b..f7c55b0e753a 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *intel_ucode_patch; +/* last level cache size per core */ +static int llc_size_per_core; + static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) { @@ -912,12 +915,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). */ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -975,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = { .apply_microcode = apply_microcode_intel, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -985,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } -- cgit v1.2.3 From 1d080f096fe33f031d26e19b3ef0146f66b8b0f1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Jan 2018 11:41:33 +0100 Subject: x86/microcode: Fix again accessing initrd after having been freed Commit 24c2503255d3 ("x86/microcode: Do not access the initrd after it has been freed") fixed attempts to access initrd from the microcode loader after it has been freed. However, a similar KASAN warning was reported (stack trace edited): smpboot: Booting Node 0 Processor 1 APIC 0x11 ================================================================== BUG: KASAN: use-after-free in find_cpio_data+0x9b5/0xa50 Read of size 1 at addr ffff880035ffd000 by task swapper/1/0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.14.8-slack #7 Hardware name: System manufacturer System Product Name/A88X-PLUS, BIOS 3003 03/10/2016 Call Trace: dump_stack print_address_description kasan_report ? find_cpio_data __asan_report_load1_noabort find_cpio_data find_microcode_in_initrd __load_ucode_amd load_ucode_amd_ap load_ucode_ap After some investigation, it turned out that a merge was done using the wrong side to resolve, leading to picking up the previous state, before the 24c2503255d3 fix. Therefore the Fixes tag below contains a merge commit. Revert the mismerge by catching the save_microcode_in_initrd_amd() retval and thus letting the function exit with the last return statement so that initrd_gone can be set to true. Fixes: f26483eaedec ("Merge branch 'x86/urgent' into x86/microcode, to resolve conflicts") Reported-by: Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=198295 Link: https://lkml.kernel.org/r/20180123104133.918-2-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c4fa4a85d4cb..e4fc595cd6ea 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(cpuid_eax(1)); + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; -- cgit v1.2.3 From fe6daab1ee9dfe7f89974ee6c486cccb0f18a61d Mon Sep 17 00:00:00 2001 From: davidwang Date: Mon, 22 Jan 2018 18:14:17 +0800 Subject: x86/centaur: Mark TSC invariant Centaur CPU has a constant frequency TSC and that TSC does not stop in C-States. But because the corresponding TSC feature flags are not set for that CPU, the TSC is treated as not constant frequency and assumed to stop in C-States, which makes it an unreliable and unusable clock source. Setting those flags tells the kernel that the TSC is usable, so it will select it over HPET. The effect of this is that reading time stamps (from kernel or user space) will be faster and more efficent. Signed-off-by: davidwang Signed-off-by: Thomas Gleixner Cc: qiyuanwang@zhaoxin.com Cc: linux-pm@vger.kernel.org Cc: brucechang@via-alliance.com Cc: cooperyan@zhaoxin.com Cc: benjaminpan@viatech.com Link: https://lkml.kernel.org/r/1516616057-5158-1-git-send-email-davidwang@zhaoxin.com --- arch/x86/kernel/cpu/centaur.c | 4 ++++ drivers/acpi/processor_idle.c | 1 + 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 68bc6d9b3132..c578cd29c2d2 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -106,6 +106,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d50a7b6ccddd..5f0071c7e2e1 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -207,6 +207,7 @@ static void tsc_check_state(int state) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: case X86_VENDOR_INTEL: + case X86_VENDOR_CENTAUR: /* * AMD Fam10h TSC will tick in all * C/P/S0/S1 states when this bit is set. -- cgit v1.2.3 From 1de1ea7efeb9e8543212210e34518b4049ccd285 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 22 Dec 2017 10:54:20 +0100 Subject: KVM: s390: add proper locking for CMMA migration bitmap Some parts of the cmma migration bitmap is already protected with the kvm->lock (e.g. the migration start). On the other hand the read of the cmma bits is not protected against a concurrent free, neither is the emulation of the ESSA instruction. Let's extend the locking to all related ioctls by using the slots lock for - kvm_s390_vm_start_migration - kvm_s390_vm_stop_migration - kvm_s390_set_cmma_bits - kvm_s390_get_cmma_bits In addition to that, we use synchronize_srcu before freeing the migration structure as all users hold kvm->srcu for read. (e.g. the ESSA handler). Reported-by: David Hildenbrand Signed-off-by: Christian Borntraeger Cc: stable@vger.kernel.org # 4.13+ Fixes: 190df4a212a7 (KVM: s390: CMMA tracking, ESSA emulation, migration mode) Reviewed-by: Claudio Imbrenda Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck --- arch/s390/kvm/kvm-s390.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index abcd24fdde3f..52880e980a33 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -766,7 +766,7 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) /* * Must be called with kvm->srcu held to avoid races on memslots, and with - * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration. */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { @@ -822,7 +822,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) } /* - * Must be called with kvm->lock to avoid races with ourselves and + * Must be called with kvm->slots_lock to avoid races with ourselves and * kvm_s390_vm_start_migration. */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) @@ -837,6 +837,8 @@ static int kvm_s390_vm_stop_migration(struct kvm *kvm) if (kvm->arch.use_cmma) { kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + /* We have to wait for the essa emulation to finish */ + synchronize_srcu(&kvm->srcu); vfree(mgs->pgste_bitmap); } kfree(mgs); @@ -846,14 +848,12 @@ static int kvm_s390_vm_stop_migration(struct kvm *kvm) static int kvm_s390_vm_set_migration(struct kvm *kvm, struct kvm_device_attr *attr) { - int idx, res = -ENXIO; + int res = -ENXIO; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->slots_lock); switch (attr->attr) { case KVM_S390_VM_MIGRATION_START: - idx = srcu_read_lock(&kvm->srcu); res = kvm_s390_vm_start_migration(kvm); - srcu_read_unlock(&kvm->srcu, idx); break; case KVM_S390_VM_MIGRATION_STOP: res = kvm_s390_vm_stop_migration(kvm); @@ -861,7 +861,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm, default: break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->slots_lock); return res; } @@ -1751,7 +1751,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&args, argp, sizeof(args))) break; + mutex_lock(&kvm->slots_lock); r = kvm_s390_get_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); if (!r) { r = copy_to_user(argp, &args, sizeof(args)); if (r) @@ -1765,7 +1767,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&args, argp, sizeof(args))) break; + mutex_lock(&kvm->slots_lock); r = kvm_s390_set_cmma_bits(kvm, &args); + mutex_unlock(&kvm->slots_lock); break; } default: -- cgit v1.2.3 From aebb48f5e465772576359c1705c4a84abea92027 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 23 Jan 2018 14:33:14 +0000 Subject: sparc64: fix typo in CONFIG_CRYPTO_DES_SPARC64 => CONFIG_CRYPTO_CAMELLIA_SPARC64 This patch fixes the typo CONFIG_CRYPTO_DES_SPARC64 => CONFIG_CRYPTO_CAMELLIA_SPARC64 Fixes: 81658ad0d923 ("sparc64: Add CAMELLIA driver making use of the new camellia opcodes.") Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- arch/sparc/crypto/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile index 818d3aa5172e..d257186c27d1 100644 --- a/arch/sparc/crypto/Makefile +++ b/arch/sparc/crypto/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o -obj-$(CONFIG_CRYPTO_DES_SPARC64) += camellia-sparc64.o +obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o obj-$(CONFIG_CRYPTO_CRC32C_SPARC64) += crc32c-sparc64.o -- cgit v1.2.3 From 617ab45c9a8900e64a78b43696c02598b8cad68b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 11:36:29 +0100 Subject: x86/hyperv: Stop suppressing X86_FEATURE_PCID When hypercall-based TLB flush was enabled for Hyper-V guests PCID feature was deliberately suppressed as a precaution: back then PCID was never exposed to Hyper-V guests and it wasn't clear what will happen if some day it becomes available. The day came and PCID/INVPCID features are already exposed on certain Hyper-V hosts. From TLFS (as of 5.0b) it is unclear how TLB flush hypercalls combine with PCID. In particular the usage of PCID is per-cpu based: the same mm gets different CR3 values on different CPUs. If the hypercall does exact matching this will fail. However, this is not the case. David Zhang explains: "In practice, the AddressSpace argument is ignored on any VM that supports PCIDs. Architecturally, the AddressSpace argument must match the CR3 with PCID bits stripped out (i.e., the low 12 bits of AddressSpace should be 0 in long mode). The flush hypercalls flush all PCIDs for the specified AddressSpace." With this, PCID can be enabled. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Cc: David Zhang Cc: Stephen Hemminger Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Aditya Bhandari Link: https://lkml.kernel.org/r/20180124103629.29980-1-vkuznets@redhat.com --- arch/x86/hyperv/mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 9cc9e1c1e2db..56c9ebac946f 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -137,7 +137,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, } if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; flush->flags = 0; } else { flush->address_space = 0; @@ -219,7 +224,12 @@ static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, } if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; flush->flags = 0; } else { flush->address_space = 0; @@ -278,8 +288,6 @@ void hyperv_setup_mmu_ops(void) if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) return; - setup_clear_cpu_cap(X86_FEATURE_PCID); - if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { pr_info("Using hypercall for remote TLB flush\n"); pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; -- cgit v1.2.3 From 782bf20c2a1795f35dcd526aa8005cd1870745da Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 24 Jan 2018 20:29:14 +0000 Subject: x86: Remove unused IOMMU_STRESS Kconfig Last use of IOMMU_STRESS was removed in commit 29b68415e335 ("x86: amd_iommu: move to drivers/iommu/"). 6 years later the Kconfig entry is definitely due for removal. Signed-off-by: Corentin Labbe Signed-off-by: Thomas Gleixner Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/1516825754-28415-1-git-send-email-clabbe@baylibre.com --- arch/x86/Kconfig.debug | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..49fb85a5a2b7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -169,14 +169,6 @@ config IOMMU_DEBUG options. See Documentation/x86/x86_64/boot-options.txt for more details. -config IOMMU_STRESS - bool "Enable IOMMU stress-test mode" - ---help--- - This option disables various optimizations in IOMMU related - code to do real stress testing of the IOMMU code. This option - will cause a performance drop and should only be enabled for - testing. - config IOMMU_LEAK bool "IOMMU leak tracing" depends on IOMMU_DEBUG && DMA_API_DEBUG -- cgit v1.2.3 From 1a29b5b7f347a1a9230c1e0af5b37e3e571588ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jan 2018 10:58:13 +0100 Subject: KVM: x86: Make indirect calls in emulator speculation safe Replace the indirect calls with CALL_NOSPEC. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.595615683@infradead.org --- arch/x86/kvm/emulate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d90cdc77e077..453d8c990108 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "x86.h" #include "tss.h" @@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("push %[flags]; popf; call *%[fastop]" - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + asm("push %[flags]; popf; " CALL_NOSPEC + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); return rc; } @@ -5305,9 +5306,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), ASM_CALL_CONSTRAINT + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); -- cgit v1.2.3 From c940a3fb1e2e9b7d03228ab28f375fb5a47ff699 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jan 2018 10:58:14 +0100 Subject: KVM: VMX: Make indirect call speculation safe Replace indirect call with CALL_NOSPEC. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.645776917@infradead.org --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d1e25dba3112..924589c53422 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9064,14 +9064,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #endif "pushf\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" - "call *%[entry]\n\t" + CALL_NOSPEC : #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif ASM_CALL_CONSTRAINT : - [entry]"r"(entry), + THUNK_TARGET(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); -- cgit v1.2.3 From efe951d3de9141626a494bcb1efb0650eaef6491 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jan 2018 19:23:08 +0100 Subject: perf/x86: Fix perf,x86,cpuhp deadlock More lockdep gifts, a 5-way lockup race: perf_event_create_kernel_counter() perf_event_alloc() perf_try_init_event() x86_pmu_event_init() __x86_pmu_event_init() x86_reserve_hardware() #0 mutex_lock(&pmc_reserve_mutex); reserve_ds_buffer() #1 get_online_cpus() perf_event_release_kernel() _free_event() hw_perf_event_destroy() x86_release_hardware() #0 mutex_lock(&pmc_reserve_mutex) release_ds_buffer() #1 get_online_cpus() #1 do_cpu_up() perf_event_init_cpu() #2 mutex_lock(&pmus_lock) #3 mutex_lock(&ctx->mutex) sys_perf_event_open() mutex_lock_double() #3 mutex_lock(ctx->mutex) #4 mutex_lock_nested(ctx->mutex, 1); perf_try_init_event() #4 mutex_lock_nested(ctx->mutex, 1) x86_pmu_event_init() intel_pmu_hw_config() x86_add_exclusive() #0 mutex_lock(&pmc_reserve_mutex) Fix it by using ordering constructs instead of locking. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8156e47da7ba..18c25ab28557 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu) static void release_pebs_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu) /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - ds->pebs_buffer_base = 0; dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); hwev->ds_pebs_vaddr = NULL; } @@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu) static void release_bts_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.bts) + if (!x86_pmu.bts) return; /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; ds_clear_cea(cea, BTS_BUFFER_SIZE); - ds->bts_buffer_base = 0; dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); hwev->ds_bts_vaddr = NULL; } @@ -454,16 +450,22 @@ void release_ds_buffers(void) if (!x86_pmu.bts && !x86_pmu.pebs) return; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + + for_each_possible_cpu(cpu) { + /* + * Again, ignore errors from offline CPUs, they will no longer + * observe cpu_hw_events.ds and not program the DS_AREA when + * they come up. + */ fini_debug_store_on_cpu(cpu); + } for_each_possible_cpu(cpu) { release_pebs_buffer(cpu); release_bts_buffer(cpu); - release_ds_buffer(cpu); } - put_online_cpus(); } void reserve_ds_buffers(void) @@ -483,8 +485,6 @@ void reserve_ds_buffers(void) if (!x86_pmu.pebs) pebs_err = 1; - get_online_cpus(); - for_each_possible_cpu(cpu) { if (alloc_ds_buffer(cpu)) { bts_err = 1; @@ -521,11 +521,14 @@ void reserve_ds_buffers(void) if (x86_pmu.pebs && !pebs_err) x86_pmu.pebs_active = 1; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { + /* + * Ignores wrmsr_on_cpu() errors for offline CPUs they + * will get this call through intel_pmu_cpu_starting(). + */ init_debug_store_on_cpu(cpu); + } } - - put_online_cpus(); } /* -- cgit v1.2.3 From caf7501a1b4ec964190f31f9c3f163de252273b8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 25 Jan 2018 15:50:28 -0800 Subject: module/retpoline: Warn about missing retpoline in module There's a risk that a kernel which has full retpoline mitigations becomes vulnerable when a module gets loaded that hasn't been compiled with the right compiler or the right option. To enable detection of that mismatch at module load time, add a module info string "retpoline" at build time when the module was compiled with retpoline support. This only covers compiled C source, but assembler source or prebuilt object files are not checked. If a retpoline enabled kernel detects a non retpoline protected module at load time, print a warning and report it in the sysfs vulnerability file. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: jeyu@kernel.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org --- arch/x86/kernel/cpu/bugs.c | 17 ++++++++++++++++- include/linux/module.h | 9 +++++++++ kernel/module.c | 11 +++++++++++ scripts/mod/modpost.c | 9 +++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 390b3dc3d438..4a39d7bb4bd8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -93,6 +94,19 @@ static const char *spectre_v2_strings[] = { #define pr_fmt(fmt) "Spectre V2 mitigation: " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static bool spectre_v2_bad_module; + +#ifdef RETPOLINE +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vunerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} +#endif static void __init spec2_print_if_insecure(const char *reason) { @@ -278,6 +292,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); + return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + spectre_v2_bad_module ? " - vulnerable module loaded" : ""); } #endif diff --git a/include/linux/module.h b/include/linux/module.h index fe5aa3736707..b1cc541f2ddf 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef RETPOLINE +extern bool retpoline_module_ok(bool has_retpoline); +#else +static inline bool retpoline_module_ok(bool has_retpoline) +{ + return true; +} +#endif + #ifdef CONFIG_MODULE_SIG static inline bool module_sig_ok(struct module *module) { diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..690c0651c40f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) } #endif /* CONFIG_LIVEPATCH */ +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } + check_modinfo_retpoline(mod, info); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); pr_warn("%s: module is from the staging directory, the quality " diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 98314b400a95..54deaa1066cf 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree) buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); } +/* Cannot check for assembler */ +static void add_retpoline(struct buffer *b) +{ + buf_printf(b, "\n#ifdef RETPOLINE\n"); + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n"); + buf_printf(b, "#endif\n"); +} + static void add_staging_flag(struct buffer *b, const char *name) { static const char *staging_dir = "drivers/staging"; @@ -2506,6 +2514,7 @@ int main(int argc, char **argv) err |= check_modname_len(mod); add_header(&buf, mod); add_intree_flag(&buf, !external_module); + add_retpoline(&buf); add_staging_flag(&buf, mod->name); err |= add_versions(&buf, mod); add_depends(&buf, mod, modules); -- cgit v1.2.3 From 95ca0ee8636059ea2800dfbac9ecac6212d6b38f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:09 +0000 Subject: x86/cpufeatures: Add CPUID_7_EDX CPUID leaf This is a pure feature bits leaf. There are two AVX512 feature bits in it already which were handled as scattered bits, and three more from this leaf are going to be added for speculation control features. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-2-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeature.h | 7 +++++-- arch/x86/include/asm/cpufeatures.h | 8 +++++--- arch/x86/include/asm/disabled-features.h | 3 ++- arch/x86/include/asm/required-features.h | 3 ++- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/scattered.c | 2 -- 6 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ea9a7dde62e5..70eddb3922ff 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -29,6 +29,7 @@ enum cpuid_leafs CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, + CPUID_7_EDX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 25b9375c1484..7b25cf30d25d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NCAPINTS 19 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -206,8 +206,6 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ @@ -319,6 +317,10 @@ #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index e428e16dd822..c6a3af198294 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -71,6 +71,7 @@ #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) #define DISABLED_MASK17 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define DISABLED_MASK18 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index d91ba04dd007..fb3a6de7440b 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -106,6 +106,7 @@ #define REQUIRED_MASK15 0 #define REQUIRED_MASK16 (NEED_LA57) #define REQUIRED_MASK17 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define REQUIRED_MASK18 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 372ba3fb400f..e5d66e93ed81 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -745,6 +745,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; } /* Extended state features: level 0x0000000d */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d0e69769abfd..df11f5d604be 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,8 +21,6 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, -- cgit v1.2.3 From fc67dd70adb711a45d2ef34e12d1a8be75edde61 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:10 +0000 Subject: x86/cpufeatures: Add Intel feature bits for Speculation Control Add three feature bits exposed by new microcode on Intel CPUs for speculation control. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-3-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7b25cf30d25d..0a5107002716 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -320,6 +320,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ /* * BUG word(s) -- cgit v1.2.3 From 5d10cbc91d9eb5537998b65608441b592eec65e7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:11 +0000 Subject: x86/cpufeatures: Add AMD feature bits for Speculation Control AMD exposes the PRED_CMD/SPEC_CTRL MSRs slightly differently to Intel. See http://lkml.kernel.org/r/2b3e25cc-286d-8bd0-aeaf-9ac4aae39de8@amd.com Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: Tom Lendacky Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-4-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0a5107002716..ae3212f14dec 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -269,6 +269,9 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ +#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -- cgit v1.2.3 From 1e340c60d0dd3ae07b5bedc16a0469c14b9f3410 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:12 +0000 Subject: x86/msr: Add definitions for new speculation control MSRs Add MSR and bit definitions for SPEC_CTRL, PRED_CMD and ARCH_CAPABILITIES. See Intel's 336996-Speculative-Execution-Side-Channel-Mitigations.pdf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-5-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/msr-index.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index fa11fb1fa570..eb83ff1bae8f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -39,6 +39,13 @@ /* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ + +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ + #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -57,6 +64,11 @@ #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) #define MSR_MTRRcap 0x000000fe + +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e -- cgit v1.2.3 From fec9434a12f38d3aeafeb75711b71d8a1fdef621 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:13 +0000 Subject: x86/pti: Do not enable PTI on CPUs which are not vulnerable to Meltdown Also, for CPUs which don't speculate at all, don't report that they're vulnerable to the Spectre variants either. Leave the cpu_no_meltdown[] match table with just X86_VENDOR_AMD in it for now, even though that could be done with a simple comparison, on the assumption that we'll have more to add. Based on suggestions from Dave Hansen and Alan Cox. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Acked-by: Dave Hansen Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-6-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/cpu/common.c | 48 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5d66e93ed81..970ee06dc8aa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -853,6 +855,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } +static const __initdata struct x86_cpu_id cpu_no_speculation[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_CENTAUR, 5 }, + { X86_VENDOR_INTEL, 5 }, + { X86_VENDOR_NSC, 5 }, + { X86_VENDOR_ANY, 4 }, + {} +}; + +static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { + { X86_VENDOR_AMD }, + {} +}; + +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = 0; + + if (x86_match_cpu(cpu_no_meltdown)) + return false; + + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return false; + + return true; +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -900,11 +937,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (c->x86_vendor != X86_VENDOR_AMD) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + if (!x86_match_cpu(cpu_no_speculation)) { + if (cpu_vulnerable_to_meltdown(c)) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + } fpu__init_system(c); -- cgit v1.2.3 From a5b2966364538a0e68c9fa29bc0a3a1651799035 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:14 +0000 Subject: x86/cpufeature: Blacklist SPEC_CTRL/PRED_CMD on early Spectre v2 microcodes This doesn't refuse to load the affected microcodes; it just refuses to use the Spectre v2 mitigation features if they're detected, by clearing the appropriate feature bits. The AMD CPUID bits are handled here too, because hypervisors *may* have been exposing those bits even on Intel chips, for fine-grained control of what's available. It is non-trivial to use x86_match_cpu() for this table because that doesn't handle steppings. And the approach taken in commit bd9240a18 almost made me lose my lunch. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-7-git-send-email-dwmw@amazon.co.uk --- arch/x86/kernel/cpu/intel.c | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b720dacac051..5faa487d0477 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; } +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 }, + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Updated in the 20180108 release; blacklist until we know otherwise */ + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_mask == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_STIBP) || + cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || + cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); + clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_STIBP); + clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); + clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * -- cgit v1.2.3 From 20ffa1caecca4db8f79fe665acdeaa5af815a24d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:15 +0000 Subject: x86/speculation: Add basic IBPB (Indirect Branch Prediction Barrier) support Expose indirect_branch_prediction_barrier() for use in subsequent patches. [ tglx: Add IBPB status to spectre_v2 sysfs file ] Co-developed-by: KarimAllah Ahmed Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-8-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/nospec-branch.h | 13 +++++++++++++ arch/x86/kernel/cpu/bugs.c | 10 +++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ae3212f14dec..07934b2f8df2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -210,6 +210,8 @@ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4ad41087ce0e..34e384c7208f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -218,5 +218,18 @@ static inline void vmexit_fill_RSB(void) #endif } +static inline void indirect_branch_prediction_barrier(void) +{ + asm volatile(ALTERNATIVE("", + "movl %[msr], %%ecx\n\t" + "movl %[val], %%eax\n\t" + "movl $0, %%edx\n\t" + "wrmsr", + X86_FEATURE_IBPB) + : : [msr] "i" (MSR_IA32_PRED_CMD), + [val] "i" (PRED_CMD_IBPB) + : "eax", "ecx", "edx", "memory"); +} + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4a39d7bb4bd8..bac7a3558db2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -263,6 +263,13 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Filling RSB on context switch\n"); } + + /* Initialize Indirect Branch Prediction Barrier if supported */ + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || + boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { + setup_force_cpu_cap(X86_FEATURE_IBPB); + pr_info("Enabling Indirect Branch Prediction Barrier\n"); + } } #undef pr_fmt @@ -292,7 +299,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev, if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", spectre_v2_bad_module ? " - vulnerable module loaded" : ""); } #endif -- cgit v1.2.3 From 0e6c16c652cadaffd25a6bb326ec10da5bcec6b4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:36 +0100 Subject: x86/alternative: Print unadorned pointers After commit ad67b74d2469 ("printk: hash addresses printed with %p") pointers are being hashed when printed. However, this makes the alternative debug output completely useless. Switch to %px in order to see the unadorned kernel pointers. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-2-bp@alien8.de --- arch/x86/kernel/alternative.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e0b97e4d1db5..14a52c7d23d4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -298,7 +298,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) tgt_rip = next_rip + o_dspl; n_dspl = tgt_rip - orig_insn; - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); if (tgt_rip - orig_insn >= 0) { if (n_dspl - 2 <= 127) @@ -355,7 +355,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins add_nops(instr + (a->instrlen - a->padlen), a->padlen); local_irq_restore(flags); - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", instr, a->instrlen - a->padlen, a->padlen); } @@ -376,7 +376,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("alt table %p -> %p", start, end); + DPRINTK("alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -400,14 +400,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, continue; } - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", a->cpuid >> 5, a->cpuid & 0x1f, instr, a->instrlen, replacement, a->replacementlen, a->padlen); - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; @@ -433,7 +433,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, a->instrlen - a->replacementlen); insnbuf_sz += a->instrlen - a->replacementlen; } - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); text_poke_early(instr, insnbuf, insnbuf_sz); } -- cgit v1.2.3 From 7a32fc51ca938e67974cbb9db31e1a43f98345a9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:37 +0100 Subject: x86/nospec: Fix header guards names ... to adhere to the _ASM_X86_ naming scheme. No functional change. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-3-bp@alien8.de --- arch/x86/include/asm/nospec-branch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 34e384c7208f..865192a2cc31 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NOSPEC_BRANCH_H__ -#define __NOSPEC_BRANCH_H__ +#ifndef _ASM_X86_NOSPEC_BRANCH_H_ +#define _ASM_X86_NOSPEC_BRANCH_H_ #include #include @@ -232,4 +232,4 @@ static inline void indirect_branch_prediction_barrier(void) } #endif /* __ASSEMBLY__ */ -#endif /* __NOSPEC_BRANCH_H__ */ +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ -- cgit v1.2.3 From 55fa19d3e51f33d9cd4056d25836d93abf9438db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:39 +0100 Subject: x86/bugs: Drop one "mitigation" from dmesg Make [ 0.031118] Spectre V2 mitigation: Mitigation: Full generic retpoline into [ 0.031118] Spectre V2: Mitigation: Full generic retpoline to reduce the mitigation mitigations strings. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-5-bp@alien8.de --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bac7a3558db2..c988a8acb0d5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -91,7 +91,7 @@ static const char *spectre_v2_strings[] = { }; #undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; static bool spectre_v2_bad_module; -- cgit v1.2.3 From 5beda7d54eafece4c974cfa9fbb9f60fb18fd20a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jan 2018 13:12:14 -0800 Subject: x86/mm/64: Fix vmapped stack syncing on very-large-memory 4-level systems Neil Berrington reported a double-fault on a VM with 768GB of RAM that uses large amounts of vmalloc space with PTI enabled. The cause is that load_new_mm_cr3() was never fixed to take the 5-level pgd folding code into account, so, on a 4-level kernel, the pgd synchronization logic compiles away to exactly nothing. Interestingly, the problem doesn't trigger with nopti. I assume this is because the kernel is mapped with global pages if we boot with nopti. The sequence of operations when we create a new task is that we first load its mm while still running on the old stack (which crashes if the old stack is unmapped in the new mm unless the TLB saves us), then we call prepare_switch_to(), and then we switch to the new stack. prepare_switch_to() pokes the new stack directly, which will populate the mapping through vmalloc_fault(). I assume that we're getting lucky on non-PTI systems -- the old stack's TLB entry stays alive long enough to make it all the way through prepare_switch_to() and switch_to() so that we make it to a valid stack. Fixes: b50858ce3e2a ("x86/mm/vmalloc: Add 5-level paging support") Reported-and-tested-by: Neil Berrington Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Konstantin Khlebnikov Cc: stable@vger.kernel.org Cc: Dave Hansen Cc: Borislav Petkov Link: https://lkml.kernel.org/r/346541c56caed61abbe693d7d2742b4a380c5001.1516914529.git.luto@kernel.org --- arch/x86/mm/tlb.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a1561957dccb..5bfe61a5e8e3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -151,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } +static void sync_current_stack_to_mm(struct mm_struct *mm) +{ + unsigned long sp = current_stack_pointer; + pgd_t *pgd = pgd_offset(mm, sp); + + if (CONFIG_PGTABLE_LEVELS > 4) { + if (unlikely(pgd_none(*pgd))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + + set_pgd(pgd, *pgd_ref); + } + } else { + /* + * "pgd" is faked. The top level entries are "p4d"s, so sync + * the p4d. This compiles to approximately the same code as + * the 5-level case. + */ + p4d_t *p4d = p4d_offset(pgd, sp); + + if (unlikely(p4d_none(*p4d))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + + set_p4d(p4d, *p4d_ref); + } + } +} + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -226,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer); - pgd_t *pgd = next->pgd + index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[index]); + sync_current_stack_to_mm(next); } /* Stop remote flushes for the previous mm */ -- cgit v1.2.3 From 36b3a7726886f24c4209852a58e64435bde3af98 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jan 2018 13:12:15 -0800 Subject: x86/mm/64: Tighten up vmalloc_fault() sanity checks on 5-level kernels On a 5-level kernel, if a non-init mm has a top-level entry, it needs to match init_mm's, but the vmalloc_fault() code skipped over the BUG_ON() that would have checked it. While we're at it, get rid of the rather confusing 4-level folded "pgd" logic. Cleans-up: b50858ce3e2a ("x86/mm/vmalloc: Add 5-level paging support") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Konstantin Khlebnikov Cc: Dave Hansen Cc: Borislav Petkov Cc: Neil Berrington Link: https://lkml.kernel.org/r/2ae598f8c279b0a29baf75df207e6f2fdddc0a1b.1516914529.git.luto@kernel.org --- arch/x86/mm/fault.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b3e40773dce0..800de815519c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -439,18 +439,13 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); - arch_flush_lazy_mmu_mode(); - } else if (CONFIG_PGTABLE_LEVELS > 4) { - /* - * With folded p4d, pgd_none() is always false, so the pgd may - * point to an empty page table entry and pgd_page_vaddr() - * will return garbage. - * - * We will do the correct sanity check on the p4d level. - */ - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgd_none(*pgd)) { + set_pgd(pgd, *pgd_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } } /* With 4-level paging, copying happens on the p4d level. */ @@ -459,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address) if (p4d_none(*p4d_ref)) return -1; - if (p4d_none(*p4d)) { + if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { set_p4d(p4d, *p4d_ref); arch_flush_lazy_mmu_mode(); } else { @@ -470,6 +465,7 @@ static noinline int vmalloc_fault(unsigned long address) * Below here mismatches are bugs because these lower tables * are shared: */ + BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); pud_ref = pud_offset(p4d_ref, address); -- cgit v1.2.3 From e383095c7fe8d218e00ec0f83e4b95ed4e627b02 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2018 15:45:14 +0100 Subject: x86/cpu/bugs: Make retpoline module warning conditional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If sysfs is disabled and RETPOLINE not defined: arch/x86/kernel/cpu/bugs.c:97:13: warning: ‘spectre_v2_bad_module’ defined but not used [-Wunused-variable] static bool spectre_v2_bad_module; Hide it. Fixes: caf7501a1b4e ("module/retpoline: Warn about missing retpoline in module") Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: David Woodhouse --- arch/x86/kernel/cpu/bugs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c988a8acb0d5..b0b7157df89e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -94,9 +94,10 @@ static const char *spectre_v2_strings[] = { #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; -static bool spectre_v2_bad_module; #ifdef RETPOLINE +static bool spectre_v2_bad_module; + bool retpoline_module_ok(bool has_retpoline) { if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) @@ -106,6 +107,13 @@ bool retpoline_module_ok(bool has_retpoline) spectre_v2_bad_module = true; return false; } + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } #endif static void __init spec2_print_if_insecure(const char *reason) @@ -300,7 +308,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", - spectre_v2_bad_module ? " - vulnerable module loaded" : ""); + boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", + spectre_v2_module_string()); } #endif -- cgit v1.2.3 From 2961298efe1ea1b6fc0d7ee8b76018fa6c0bcef2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 27 Jan 2018 16:24:32 +0000 Subject: x86/cpufeatures: Clean up Spectre v2 related CPUID flags We want to expose the hardware features simply in /proc/cpuinfo as "ibrs", "ibpb" and "stibp". Since AMD has separate CPUID bits for those, use them as the user-visible bits. When the Intel SPEC_CTRL bit is set which indicates both IBRS and IBPB capability, set those (AMD) bits accordingly. Likewise if the Intel STIBP bit is set, set the AMD STIBP that's used for the generic hardware capability. Hide the rest from /proc/cpuinfo by putting "" in the comments. Including RETPOLINE and RETPOLINE_AMD which shouldn't be visible there. There are patches to make the sysfs vulnerabilities information non-readable by non-root, and the same should apply to all information about which mitigations are actually in use. Those *shouldn't* appear in /proc/cpuinfo. The feature bit for whether IBPB is actually used, which is needed for ALTERNATIVEs, is renamed to X86_FEATURE_USE_IBPB. Originally-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-2-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/cpufeatures.h | 18 +++++++++--------- arch/x86/include/asm/nospec-branch.h | 2 +- arch/x86/kernel/cpu/bugs.c | 7 +++---- arch/x86/kernel/cpu/intel.c | 31 +++++++++++++++++++++---------- 4 files changed, 34 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 07934b2f8df2..73b5fff159a4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -203,14 +203,14 @@ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -271,9 +271,9 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ -#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -325,8 +325,8 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 865192a2cc31..19ecb5446b30 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -225,7 +225,7 @@ static inline void indirect_branch_prediction_barrier(void) "movl %[val], %%eax\n\t" "movl $0, %%edx\n\t" "wrmsr", - X86_FEATURE_IBPB) + X86_FEATURE_USE_IBPB) : : [msr] "i" (MSR_IA32_PRED_CMD), [val] "i" (PRED_CMD_IBPB) : "eax", "ecx", "edx", "memory"); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b0b7157df89e..32d8e6cdc09e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -273,9 +273,8 @@ retpoline_auto: } /* Initialize Indirect Branch Prediction Barrier if supported */ - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || - boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { - setup_force_cpu_cap(X86_FEATURE_IBPB); + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); pr_info("Enabling Indirect Branch Prediction Barrier\n"); } } @@ -308,7 +307,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", spectre_v2_module_string()); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5faa487d0477..0c8b916abced 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -175,17 +175,28 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); - if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || - cpu_has(c, X86_FEATURE_STIBP) || - cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || - cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || - cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { - pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); - clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + clear_cpu_cap(c, X86_FEATURE_IBRS); + clear_cpu_cap(c, X86_FEATURE_IBPB); clear_cpu_cap(c, X86_FEATURE_STIBP); - clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); - clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); - clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); + clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); } /* -- cgit v1.2.3 From 1dde7415e99933bb7293d6b2843752cbdb43ec11 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Jan 2018 16:24:33 +0000 Subject: x86/retpoline: Simplify vmexit_fill_RSB() Simplify it to call an asm-function instead of pasting 41 insn bytes at every call site. Also, add alignment to the macro as suggested here: https://support.google.com/faqs/answer/7625886 [dwmw2: Clean up comments, let it clobber %ebx and just tell the compiler] Signed-off-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-3-git-send-email-dwmw@amazon.co.uk --- arch/x86/entry/entry_32.S | 3 +- arch/x86/entry/entry_64.S | 3 +- arch/x86/include/asm/asm-prototypes.h | 3 ++ arch/x86/include/asm/nospec-branch.h | 70 ++++------------------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 56 ++++++++++++++++++++++++++++ 6 files changed, 71 insertions(+), 65 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 60c4c342316c..2a35b1e0fb90 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -252,7 +252,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %ebx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 63f4320602a3..b4f00984089e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -495,7 +495,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %rbx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 1908214b9125..4d111616524b 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -38,4 +38,7 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) +asmlinkage void __fill_rsb(void); +asmlinkage void __clear_rsb(void); + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 19ecb5446b30..df4ececa6ebf 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,50 +7,6 @@ #include #include -/* - * Fill the CPU return stack buffer. - * - * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; lfence; jmp' loop to capture speculative execution. - * - * This is required in various cases for retpoline and IBRS-based - * mitigations for the Spectre variant 2 vulnerability. Sometimes to - * eliminate potentially bogus entries from the RSB, and sometimes - * purely to ensure that it doesn't get empty, which on some CPUs would - * allow predictions from other (unwanted!) sources to be used. - * - * We define a CPP macro such that it can be used from both .S files and - * inline assembly. It's possible to do a .macro and then include that - * from C via asm(".include ") but let's not go there. - */ - -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ -#define RSB_FILL_LOOPS 16 /* To avoid underflow */ - -/* - * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation - * trap should their return address end up getting used, in a loop. - */ -#define __FILL_RETURN_BUFFER(reg, nr, sp) \ - mov $(nr/2), reg; \ -771: \ - call 772f; \ -773: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 773b; \ -772: \ - call 774f; \ -775: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 775b; \ -774: \ - dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; - #ifdef __ASSEMBLY__ /* @@ -121,17 +77,10 @@ #endif .endm - /* - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP - * monstrosity above, manually. - */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +/* This clobbers the BX register */ +.macro FILL_RETURN_BUFFER nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr -.Lskip_rsb_\@: + ALTERNATIVE "", "call __clear_rsb", \ftr #endif .endm @@ -206,15 +155,10 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops; - - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) - "910:" - : "=r" (loops), ASM_CALL_CONSTRAINT - : : "memory" ); + alternative_input("", + "call __fill_rsb", + X86_FEATURE_RETPOLINE, + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); #endif } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d435c89875c1..d0a3170e6804 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_RETPOLINE) += retpoline.o +OBJECT_FILES_NON_STANDARD_retpoline.o :=y obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index c909961e678a..480edc3a5e03 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,6 +7,7 @@ #include #include #include +#include .macro THUNK reg .section .text.__x86.indirect_thunk @@ -46,3 +47,58 @@ GENERATE_THUNK(r13) GENERATE_THUNK(r14) GENERATE_THUNK(r15) #endif + +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; lfence; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * Google experimented with loop-unrolling and this turned out to be + * the optimal version - two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +.macro STUFF_RSB nr:req sp:req + mov $(\nr / 2), %_ASM_BX + .align 16 +771: + call 772f +773: /* speculation trap */ + pause + lfence + jmp 773b + .align 16 +772: + call 774f +775: /* speculation trap */ + pause + lfence + jmp 775b + .align 16 +774: + dec %_ASM_BX + jnz 771b + add $((BITS_PER_LONG/8) * \nr), \sp +.endm + +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +ENTRY(__fill_rsb) + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP + ret +END(__fill_rsb) +EXPORT_SYMBOL_GPL(__fill_rsb) + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ + +ENTRY(__clear_rsb) + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP + ret +END(__clear_rsb) +EXPORT_SYMBOL_GPL(__clear_rsb) -- cgit v1.2.3 From 64e16720ea0879f8ab4547e3b9758936d483909b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Jan 2018 16:24:34 +0000 Subject: x86/speculation: Simplify indirect_branch_prediction_barrier() Make it all a function which does the WRMSR instead of having a hairy inline asm. [dwmw2: export it, fix CONFIG_RETPOLINE issues] Signed-off-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-4-git-send-email-dwmw@amazon.co.uk --- arch/x86/include/asm/nospec-branch.h | 13 ++++--------- arch/x86/include/asm/processor.h | 3 +++ arch/x86/kernel/cpu/bugs.c | 6 ++++++ 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index df4ececa6ebf..d15d471348b8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -164,15 +164,10 @@ static inline void vmexit_fill_RSB(void) static inline void indirect_branch_prediction_barrier(void) { - asm volatile(ALTERNATIVE("", - "movl %[msr], %%ecx\n\t" - "movl %[val], %%eax\n\t" - "movl $0, %%edx\n\t" - "wrmsr", - X86_FEATURE_USE_IBPB) - : : [msr] "i" (MSR_IA32_PRED_CMD), - [val] "i" (PRED_CMD_IBPB) - : "eax", "ecx", "edx", "memory"); + alternative_input("", + "call __ibp_barrier", + X86_FEATURE_USE_IBPB, + ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory")); } #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9c18da64daa9..881ca3b1d6d4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -970,4 +970,7 @@ bool xen_set_default_idle(void); void stop_this_cpu(void *dummy); void df_debug(struct pt_regs *regs, long error_code); + +void __ibp_barrier(void); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 32d8e6cdc09e..3bfb2b23d79c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -311,3 +311,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, spectre_v2_module_string()); } #endif + +void __ibp_barrier(void) +{ + __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0); +} +EXPORT_SYMBOL_GPL(__ibp_barrier); -- cgit v1.2.3 From dd085168a74c99c3ebe7f813069e412eb8444243 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 27 Jan 2018 20:21:50 -0600 Subject: x86/ftrace: Add one more ENDPROC annotation When ORC support was added for the ftrace_64.S code, an ENDPROC for function_hook() was missed. This results in the following warning: arch/x86/kernel/ftrace_64.o: warning: objtool: .entry.text+0x0: unreachable instruction Fixes: e2ac83d74a4d ("x86/ftrace: Fix ORC unwinding from ftrace handlers") Reported-by: Steven Rostedt Reported-by: Borislav Petkov Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20180128022150.dqierscqmt3uwwsr@treble --- arch/x86/kernel/ftrace_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7cb8ba08beb9..8774fd2ed390 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -291,7 +291,7 @@ trace: restore_mcount_regs jmp fgraph_trace -END(function_hook) +ENDPROC(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From 89a8f6d4904c8cf3ff8fee9fdaff392a6bbb8bf6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:31 +0100 Subject: x86/hyperv: Check for required priviliges in hyperv_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In hyperv_init() its presumed that it always has access to VP index and hypercall MSRs while according to the specification it should be checked if it's allowed to access the corresponding MSRs before accessing them. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-2-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 189a398290db..21f9d53d9f00 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -110,12 +110,19 @@ static int hv_cpu_init(unsigned int cpu) */ void hyperv_init(void) { - u64 guest_id; + u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; + /* Absolutely required MSRs */ + required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | + HV_X64_MSR_VP_INDEX_AVAILABLE; + + if ((ms_hyperv.features & required_msrs) != required_msrs) + return; + /* Allocate percpu VP index */ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), GFP_KERNEL); -- cgit v1.2.3 From e2768eaa1ca4fbb7b778da5615cce3dd310352e6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:32 +0100 Subject: x86/hyperv: Add a function to read both TSC and TSC page value simulateneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is going to be used from KVM code where both TSC and TSC page value are needed. Nothing is supposed to use the function when Hyper-V code is compiled out, just BUG(). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-3-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 1 + arch/x86/include/asm/mshyperv.h | 23 +++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21f9d53d9f00..1a6c63f721bc 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -37,6 +37,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return tsc_pg; } +EXPORT_SYMBOL_GPL(hv_get_tsc_page); static u64 read_hv_clock_tsc(struct clocksource *arg) { diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 8bf450b13d9f..6b1d4ea78270 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -325,9 +325,10 @@ static inline void hyperv_setup_mmu_ops(void) {} #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) { - u64 scale, offset, cur_tsc; + u64 scale, offset; u32 sequence; /* @@ -358,7 +359,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) scale = READ_ONCE(tsc_pg->tsc_scale); offset = READ_ONCE(tsc_pg->tsc_offset); - cur_tsc = rdtsc_ordered(); + *cur_tsc = rdtsc_ordered(); /* * Make sure we read sequence after we read all other values @@ -368,7 +369,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - return mul_u64_u64_shr(cur_tsc, scale, 64) + offset; + return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; +} + +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +{ + u64 cur_tsc; + + return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); } #else @@ -376,5 +384,12 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return NULL; } + +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) +{ + BUG(); + return U64_MAX; +} #endif #endif -- cgit v1.2.3 From 93286261de1b46339aa27cd4c639b21778f6cade Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:33 +0100 Subject: x86/hyperv: Reenlightenment notifications support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V supports Live Migration notification. This is supposed to be used in conjunction with TSC emulation: when a VM is migrated to a host with different TSC frequency for some short period the host emulates the accesses to TSC and sends an interrupt to notify about the event. When the guest is done updating everything it can disable TSC emulation and everything will start working fast again. These notifications weren't required until now as Hyper-V guests are not supposed to use TSC as a clocksource: in Linux the TSC is even marked as unstable on boot. Guests normally use 'tsc page' clocksource and host updates its values on migrations automatically. Things change when with nested virtualization: even when the PV clocksources (kvm-clock or tsc page) are passed through to the nested guests the TSC frequency and frequency changes need to be know.. Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that the fix in on the way. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-4-vkuznets@redhat.com --- arch/x86/entry/entry_32.S | 3 ++ arch/x86/entry/entry_64.S | 3 ++ arch/x86/hyperv/hv_init.c | 89 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irq_vectors.h | 7 ++- arch/x86/include/asm/mshyperv.h | 9 ++++ arch/x86/include/uapi/asm/hyperv.h | 27 ++++++++++++ arch/x86/kernel/cpu/mshyperv.c | 6 +++ 7 files changed, 143 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2a35b1e0fb90..7a796eeddf99 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -895,6 +895,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, hyperv_vector_handler) +BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_intr) + #endif /* CONFIG_HYPERV */ ENTRY(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a83570495162..553aa49909ce 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1245,6 +1245,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler + +apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ + hyperv_reenlightenment_vector hyperv_reenlightenment_intr #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a6c63f721bc..712ac40081f7 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -18,6 +18,8 @@ */ #include +#include +#include #include #include #include @@ -102,6 +104,93 @@ static int hv_cpu_init(unsigned int cpu) return 0; } +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; +} + +__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +{ + entering_ack_irq(); + + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); + + exiting_irq(); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + .target_vp = hv_vp_index[smp_processor_id()] + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 67421f649cfa..e71c1120426b 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -103,7 +103,12 @@ #endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -#define LOCAL_TIMER_VECTOR 0xee + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#endif + +#define LOCAL_TIMER_VECTOR 0xed #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b1d4ea78270..1790002a2052 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) void hyperv_callback_vector(void); +void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector #endif @@ -316,11 +317,19 @@ void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs, long err); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); + +void hyperv_reenlightenment_intr(struct pt_regs *regs); +void set_hv_tscchange_cb(void (*cb)(void)); +void clear_hv_tscchange_cb(void); +void hyperv_stop_tsc_emulation(void); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline bool hv_is_hypercall_page_setup(void) { return false; } static inline void hyperv_cleanup(void) {} static inline void hyperv_setup_mmu_ops(void) {} +static inline void set_hv_tscchange_cb(void (*cb)(void)) {} +static inline void clear_hv_tscchange_cb(void) {} +static inline void hyperv_stop_tsc_emulation(void) {}; #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 1a5bfead93b4..197c2e6c7376 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -40,6 +40,9 @@ */ #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) + /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available @@ -234,6 +237,30 @@ #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 + +struct hv_reenlightenment_control { + u64 vector:8; + u64 reserved1:8; + u64 enabled:1; + u64 reserved2:15; + u64 target_vp:32; +}; + +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 + +struct hv_tsc_emulation_control { + u64 enabled:1; + u64 reserved:63; +}; + +struct hv_tsc_emulation_status { + u64 inprogress:1; + u64 reserved:63; +}; + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 85eb5fc180c8..9340f41ce8d3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void) hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_vector); + #endif } -- cgit v1.2.3 From e7c4e36c447daca2b7df49024f6bf230871cb155 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:34 +0100 Subject: x86/hyperv: Redirect reenlightment notifications on CPU offlining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is very unlikely for CPUs to get offlined when running on Hyper-V as there is a protection in the vmbus module which prevents it when the guest has any VMBus devices assigned. This, however, may change in future if an option to reassign an already active channel will be added. It is also possible to run without any Hyper-V devices or to have a CPU with no assigned channels. Reassign reenlightenment notifications to some other active CPU when the CPU which is assigned to them goes offline. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-5-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 712ac40081f7..e4377e2f2a10 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -191,6 +191,26 @@ void clear_hv_tscchange_cb(void) } EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); +static int hv_cpu_die(unsigned int cpu) +{ + struct hv_reenlightenment_control re_ctrl; + unsigned int new_cpu; + + if (hv_reenlightenment_cb == NULL) + return 0; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (re_ctrl.target_vp == hv_vp_index[cpu]) { + /* Reassign to some other online CPU */ + new_cpu = cpumask_any_but(cpu_online_mask, cpu); + + re_ctrl.target_vp = hv_vp_index[new_cpu]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + } + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -220,7 +240,7 @@ void hyperv_init(void) return; if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", - hv_cpu_init, NULL) < 0) + hv_cpu_init, hv_cpu_die) < 0) goto free_vp_index; /* -- cgit v1.2.3 From 51d4e5daa32808df4d50db511d167fde19fa114e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:35 +0100 Subject: x86/irq: Count Hyper-V reenlightenment interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V reenlightenment interrupts arrive when the VM is migrated, While they are not interesting in general it's important when L2 nested guests are running. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-6-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 2 ++ arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/kernel/irq.c | 9 +++++++++ 3 files changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e4377e2f2a10..a3adece392f1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -147,6 +147,8 @@ __visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) { entering_ack_irq(); + inc_irq_stat(irq_hv_reenlightenment_count); + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); exiting_irq(); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 51cc979dd364..7c341a74ec8c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -38,6 +38,9 @@ typedef struct { #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif +#if IS_ENABLED(CONFIG_HYPERV) + unsigned int irq_hv_reenlightenment_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 68e1867cca80..45fb4d2565f8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->irq_hv_callback_count); seq_puts(p, " Hypervisor callback interrupts\n"); } +#endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.3 From b0c39dc68e3b3d22bf9d2984f62f6c86788a49e7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:36 +0100 Subject: x86/kvm: Pass stable clocksource to guests when running nested on Hyper-V MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, KVM is able to work in 'masterclock' mode passing PVCLOCK_TSC_STABLE_BIT to guests when the clocksource which is used on the host is TSC. When running nested on Hyper-V the guest normally uses a different one: TSC page which is resistant to TSC frequency changes on events like L1 migration. Add support for it in KVM. The only non-trivial change is in vgettsc(): when updating the gtod copy both the clock readout and tsc value have to be updated now. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-7-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c53298dfbf50..b1ce368a07af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -67,6 +67,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1377,6 +1378,11 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +static inline int gtod_is_based_on_tsc(int mode) +{ + return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; +} + static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -1396,7 +1402,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * perform request to enable masterclock. */ if (ka->use_master_clock || - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) + (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -1459,6 +1465,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vcpu->arch.tsc_offset = offset; } +static inline bool kvm_check_tsc_unstable(void) +{ +#ifdef CONFIG_X86_64 + /* + * TSC is marked unstable when we're running on Hyper-V, + * 'TSC page' clocksource is good. + */ + if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + return false; +#endif + return check_tsc_unstable(); +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1504,7 +1523,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) */ if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { + if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { @@ -1604,18 +1623,43 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *cycle_now) +static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) { long v; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + u64 tsc_pg_val; + + switch (gtod->clock.vclock_mode) { + case VCLOCK_HVCLOCK: + tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp); + if (tsc_pg_val != U64_MAX) { + /* TSC page valid */ + *mode = VCLOCK_HVCLOCK; + v = (tsc_pg_val - gtod->clock.cycle_last) & + gtod->clock.mask; + } else { + /* TSC page invalid */ + *mode = VCLOCK_NONE; + } + break; + case VCLOCK_TSC: + *mode = VCLOCK_TSC; + *tsc_timestamp = read_tsc(); + v = (*tsc_timestamp - gtod->clock.cycle_last) & + gtod->clock.mask; + break; + default: + *mode = VCLOCK_NONE; + } - *cycle_now = read_tsc(); + if (*mode == VCLOCK_NONE) + *tsc_timestamp = v = 0; - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; return v * gtod->clock.mult; } -static int do_monotonic_boot(s64 *t, u64 *cycle_now) +static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1624,9 +1668,8 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; ns += gtod->boot_ns; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1635,7 +1678,7 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } -static int do_realtime(struct timespec *ts, u64 *cycle_now) +static int do_realtime(struct timespec *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1644,10 +1687,9 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1657,25 +1699,26 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) return mode; } -/* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) +/* returns true if host is using TSC based clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using tsc clocksource */ +/* returns true if host is using TSC based clocksource */ static bool kvm_get_walltime_and_clockread(struct timespec *ts, - u64 *cycle_now) + u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_realtime(ts, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); } #endif @@ -2869,13 +2912,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) { + if (kvm_check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); @@ -6110,9 +6153,9 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, update_pvclock_gtod(tk); /* disable master clock if host does not trust, or does not - * use, TSC clocksource + * use, TSC based clocksource. */ - if (gtod->clock.vclock_mode != VCLOCK_TSC && + if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) queue_work(system_long_wq, &pvclock_gtod_work); @@ -7767,7 +7810,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -7924,7 +7967,7 @@ int kvm_arch_hardware_enable(void) return ret; local_tsc = rdtsc(); - stable = !check_tsc_unstable(); + stable = !kvm_check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!stable && vcpu->cpu == smp_processor_id()) -- cgit v1.2.3 From 0092e4346f49558e5fe5a927c6d78d401dc4ed73 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:37 +0100 Subject: x86/kvm: Support Hyper-V reenlightenment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running nested KVM on Hyper-V guests its required to update masterclocks for all guests when L1 migrates to a host with different TSC frequency. Implement the procedure in the following way: - Pause all guests. - Tell the host (Hyper-V) to stop emulating TSC accesses. - Update the gtod copy, recompute clocks. - Unpause all guests. This is somewhat similar to cpufreq but there are two important differences: - TSC emulation can only be disabled globally (on all CPUs) - The new TSC frequency is not known until emulation is turned off so there is no way to 'prepare' for the event upfront. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-8-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b1ce368a07af..879a99987401 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,6 +68,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -5932,6 +5933,43 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +static void kvm_hyperv_tsc_notifier(void) +{ +#ifdef CONFIG_X86_64 + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int cpu; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_make_mclock_inprogress_request(kvm); + + hyperv_stop_tsc_emulation(); + + /* TSC frequency always matches when on Hyper-V */ + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + kvm_max_guest_tsc_khz = tsc_khz; + + list_for_each_entry(kvm, &vm_list, vm_list) { + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + + spin_unlock(&ka->pvclock_gtod_sync_lock); + } + spin_unlock(&kvm_lock); +#endif +} + static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -6217,6 +6255,9 @@ int kvm_arch_init(void *opaque) kvm_lapic_init(); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); + + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif return 0; @@ -6229,6 +6270,10 @@ out: void kvm_arch_exit(void) { +#ifdef CONFIG_X86_64 + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + clear_hv_tscchange_cb(); +#endif kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); -- cgit v1.2.3 From 5fa4ec9cb2e6679e2f828033726f758ea314b9c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jan 2018 09:41:40 +0100 Subject: x86/kvm: Make it compile on 32bit and with HYPYERVISOR_GUEST=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reenlightment support for hyperv slapped a direct reference to x86_hyper_type into the kvm code which results in the following build failure when CONFIG_HYPERVISOR_GUEST=n: arch/x86/kvm/x86.c:6259:6: error: ‘x86_hyper_type’ undeclared (first use in this function) arch/x86/kvm/x86.c:6259:6: note: each undeclared identifier is reported only once for each function it appears in Use the proper helper function to cure that. The 32bit compile fails because of: arch/x86/kvm/x86.c:5936:13: warning: ‘kvm_hyperv_tsc_notifier’ defined but not used [-Wunused-function] which is a real trainwreck engineering artwork. The callsite is wrapped into #ifdef CONFIG_X86_64, but the function itself has the #ifdef inside the function body. Make the function itself wrapped into the ifdef to cure that. Qualiteee.... Fixes: 0092e4346f49 ("x86/kvm: Support Hyper-V reenlightenment") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 879a99987401..cd3b3bc67c5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5933,9 +5933,9 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +#ifdef CONFIG_X86_64 static void kvm_hyperv_tsc_notifier(void) { -#ifdef CONFIG_X86_64 struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; @@ -5967,8 +5967,8 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } spin_unlock(&kvm_lock); -#endif } +#endif static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -6256,7 +6256,7 @@ int kvm_arch_init(void *opaque) #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif @@ -6271,7 +6271,7 @@ out: void kvm_arch_exit(void) { #ifdef CONFIG_X86_64 - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); -- cgit v1.2.3