From f10dc56c64bb662822475304508c1ce99f194e70 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 29 Jul 2018 16:52:30 +0200 Subject: crypto: arm64 - revert NEON yield for fast AEAD implementations As it turns out, checking the TIF_NEED_RESCHED flag after each iteration results in a significant performance regression (~10%) when running fast algorithms (i.e., ones that use special instructions and operate in the < 4 cycles per byte range) on in-order cores with comparatively slow memory accesses such as the Cortex-A53. Given the speed of these ciphers, and the fact that the page based nature of the AEAD scatterwalk API guarantees that the core NEON transform is never invoked with more than a single page's worth of input, we can estimate the worst case duration of any resulting scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages, processing a page's worth of input at 4 cycles per byte results in a delay of ~250 us, which is a reasonable upper bound. So let's remove the yield checks from the fused AES-CCM and AES-GCM routines entirely. This reverts commit 7b67ae4d5ce8e2f912377f5fbccb95811a92097f and partially reverts commit 7c50136a8aba8784f07fb66a950cc61a7f3d2ee3. Fixes: 7c50136a8aba ("crypto: arm64/aes-ghash - yield NEON after every ...") Fixes: 7b67ae4d5ce8 ("crypto: arm64/aes-ccm - yield NEON after every ...") Signed-off-by: Ard Biesheuvel Acked-by: Herbert Xu Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-ce-ccm-core.S | 150 +++++++++++++----------------------- arch/arm64/crypto/ghash-ce-core.S | 76 ++++++------------ 2 files changed, 80 insertions(+), 146 deletions(-) (limited to 'arch') diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S index 88f5aef7934c..e3a375c4cb83 100644 --- a/arch/arm64/crypto/aes-ce-ccm-core.S +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -19,33 +19,24 @@ * u32 *macp, u8 const rk[], u32 rounds); */ ENTRY(ce_aes_ccm_auth_data) - frame_push 7 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - - ldr w25, [x22] /* leftover from prev round? */ + ldr w8, [x3] /* leftover from prev round? */ ld1 {v0.16b}, [x0] /* load mac */ - cbz w25, 1f - sub w25, w25, #16 + cbz w8, 1f + sub w8, w8, #16 eor v1.16b, v1.16b, v1.16b -0: ldrb w7, [x20], #1 /* get 1 byte of input */ - subs w21, w21, #1 - add w25, w25, #1 +0: ldrb w7, [x1], #1 /* get 1 byte of input */ + subs w2, w2, #1 + add w8, w8, #1 ins v1.b[0], w7 ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ beq 8f /* out of input? */ - cbnz w25, 0b + cbnz w8, 0b eor v0.16b, v0.16b, v1.16b -1: ld1 {v3.4s}, [x23] /* load first round key */ - prfm pldl1strm, [x20] - cmp w24, #12 /* which key size? */ - add x6, x23, #16 - sub w7, w24, #2 /* modified # of rounds */ +1: ld1 {v3.4s}, [x4] /* load first round key */ + prfm pldl1strm, [x1] + cmp w5, #12 /* which key size? */ + add x6, x4, #16 + sub w7, w5, #2 /* modified # of rounds */ bmi 2f bne 5f mov v5.16b, v3.16b @@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data) ld1 {v5.4s}, [x6], #16 /* load next round key */ bpl 3b aese v0.16b, v4.16b - subs w21, w21, #16 /* last data? */ + subs w2, w2, #16 /* last data? */ eor v0.16b, v0.16b, v5.16b /* final round */ bmi 6f - ld1 {v1.16b}, [x20], #16 /* load next input block */ + ld1 {v1.16b}, [x1], #16 /* load next input block */ eor v0.16b, v0.16b, v1.16b /* xor with mac */ - beq 6f - - if_will_cond_yield_neon - st1 {v0.16b}, [x19] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x19] /* reload mac */ - endif_yield_neon - - b 1b -6: st1 {v0.16b}, [x19] /* store mac */ + bne 1b +6: st1 {v0.16b}, [x0] /* store mac */ beq 10f - adds w21, w21, #16 + adds w2, w2, #16 beq 10f - mov w25, w21 -7: ldrb w7, [x20], #1 + mov w8, w2 +7: ldrb w7, [x1], #1 umov w6, v0.b[0] eor w6, w6, w7 - strb w6, [x19], #1 - subs w21, w21, #1 + strb w6, [x0], #1 + subs w2, w2, #1 beq 10f ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ b 7b -8: mov w7, w25 - add w25, w25, #16 +8: mov w7, w8 + add w8, w8, #16 9: ext v1.16b, v1.16b, v1.16b, #1 adds w7, w7, #1 bne 9b eor v0.16b, v0.16b, v1.16b - st1 {v0.16b}, [x19] -10: str w25, [x22] - - frame_pop + st1 {v0.16b}, [x0] +10: str w8, [x3] ret ENDPROC(ce_aes_ccm_auth_data) @@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final) ENDPROC(ce_aes_ccm_final) .macro aes_ccm_do_crypt,enc - frame_push 8 - - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - - ldr x26, [x25, #8] /* load lower ctr */ - ld1 {v0.16b}, [x24] /* load mac */ -CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ + ldr x8, [x6, #8] /* load lower ctr */ + ld1 {v0.16b}, [x5] /* load mac */ +CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */ 0: /* outer loop */ - ld1 {v1.8b}, [x25] /* load upper ctr */ - prfm pldl1strm, [x20] - add x26, x26, #1 - rev x9, x26 - cmp w23, #12 /* which key size? */ - sub w7, w23, #2 /* get modified # of rounds */ + ld1 {v1.8b}, [x6] /* load upper ctr */ + prfm pldl1strm, [x1] + add x8, x8, #1 + rev x9, x8 + cmp w4, #12 /* which key size? */ + sub w7, w4, #2 /* get modified # of rounds */ ins v1.d[1], x9 /* no carry in lower ctr */ - ld1 {v3.4s}, [x22] /* load first round key */ - add x10, x22, #16 + ld1 {v3.4s}, [x3] /* load first round key */ + add x10, x3, #16 bmi 1f bne 4f mov v5.16b, v3.16b @@ -194,9 +165,9 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ bpl 2b aese v0.16b, v4.16b aese v1.16b, v4.16b - subs w21, w21, #16 - bmi 7f /* partial block? */ - ld1 {v2.16b}, [x20], #16 /* load next input block */ + subs w2, w2, #16 + bmi 6f /* partial block? */ + ld1 {v2.16b}, [x1], #16 /* load next input block */ .if \enc == 1 eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ @@ -205,29 +176,18 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */ eor v1.16b, v2.16b, v5.16b /* final round enc */ .endif eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ - st1 {v1.16b}, [x19], #16 /* write output block */ - beq 5f - - if_will_cond_yield_neon - st1 {v0.16b}, [x24] /* store mac */ - do_cond_yield_neon - ld1 {v0.16b}, [x24] /* reload mac */ - endif_yield_neon - - b 0b -5: -CPU_LE( rev x26, x26 ) - st1 {v0.16b}, [x24] /* store mac */ - str x26, [x25, #8] /* store lsb end of ctr (BE) */ - -6: frame_pop - ret - -7: eor v0.16b, v0.16b, v5.16b /* final round mac */ + st1 {v1.16b}, [x0], #16 /* write output block */ + bne 0b +CPU_LE( rev x8, x8 ) + st1 {v0.16b}, [x5] /* store mac */ + str x8, [x6, #8] /* store lsb end of ctr (BE) */ +5: ret + +6: eor v0.16b, v0.16b, v5.16b /* final round mac */ eor v1.16b, v1.16b, v5.16b /* final round enc */ - st1 {v0.16b}, [x24] /* store mac */ - add w21, w21, #16 /* process partial tail block */ -8: ldrb w9, [x20], #1 /* get 1 byte of input */ + st1 {v0.16b}, [x5] /* store mac */ + add w2, w2, #16 /* process partial tail block */ +7: ldrb w9, [x1], #1 /* get 1 byte of input */ umov w6, v1.b[0] /* get top crypted ctr byte */ umov w7, v0.b[0] /* get top mac byte */ .if \enc == 1 @@ -237,13 +197,13 @@ CPU_LE( rev x26, x26 ) eor w9, w9, w6 eor w7, w7, w9 .endif - strb w9, [x19], #1 /* store out byte */ - strb w7, [x24], #1 /* store mac byte */ - subs w21, w21, #1 - beq 6b + strb w9, [x0], #1 /* store out byte */ + strb w7, [x5], #1 /* store mac byte */ + subs w2, w2, #1 + beq 5b ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ - b 8b + b 7b .endm /* diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index dcffb9e77589..c723647b37db 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8) .endm .macro pmull_gcm_do_crypt, enc - frame_push 10 + ld1 {SHASH.2d}, [x4] + ld1 {XL.2d}, [x1] + ldr x8, [x5, #8] // load lower counter - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 - mov x23, x4 - mov x24, x5 - mov x25, x6 - mov x26, x7 - .if \enc == 1 - ldr x27, [sp, #96] // first stacked arg - .endif - - ldr x28, [x24, #8] // load lower counter -CPU_LE( rev x28, x28 ) - -0: mov x0, x25 - load_round_keys w26, x0 - ld1 {SHASH.2d}, [x23] - ld1 {XL.2d}, [x20] + load_round_keys w7, x6 movi MASK.16b, #0xe1 ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 +CPU_LE( rev x8, x8 ) shl MASK.2d, MASK.2d, #57 eor SHASH2.16b, SHASH2.16b, SHASH.16b .if \enc == 1 - ld1 {KS.16b}, [x27] + ldr x10, [sp] + ld1 {KS.16b}, [x10] .endif -1: ld1 {CTR.8b}, [x24] // load upper counter - ld1 {INP.16b}, [x22], #16 - rev x9, x28 - add x28, x28, #1 - sub w19, w19, #1 +0: ld1 {CTR.8b}, [x5] // load upper counter + ld1 {INP.16b}, [x3], #16 + rev x9, x8 + add x8, x8, #1 + sub w0, w0, #1 ins CTR.d[1], x9 // set lower counter .if \enc == 1 eor INP.16b, INP.16b, KS.16b // encrypt input - st1 {INP.16b}, [x21], #16 + st1 {INP.16b}, [x2], #16 .endif rev64 T1.16b, INP.16b - cmp w26, #12 - b.ge 4f // AES-192/256? + cmp w7, #12 + b.ge 2f // AES-192/256? -2: enc_round CTR, v21 +1: enc_round CTR, v21 ext T2.16b, XL.16b, XL.16b, #8 ext IN1.16b, T1.16b, T1.16b, #8 @@ -425,39 +411,27 @@ CPU_LE( rev x28, x28 ) .if \enc == 0 eor INP.16b, INP.16b, KS.16b - st1 {INP.16b}, [x21], #16 + st1 {INP.16b}, [x2], #16 .endif - cbz w19, 3f + cbnz w0, 0b - if_will_cond_yield_neon - st1 {XL.2d}, [x20] - .if \enc == 1 - st1 {KS.16b}, [x27] - .endif - do_cond_yield_neon - b 0b - endif_yield_neon +CPU_LE( rev x8, x8 ) + st1 {XL.2d}, [x1] + str x8, [x5, #8] // store lower counter - b 1b - -3: st1 {XL.2d}, [x20] .if \enc == 1 - st1 {KS.16b}, [x27] + st1 {KS.16b}, [x10] .endif -CPU_LE( rev x28, x28 ) - str x28, [x24, #8] // store lower counter - - frame_pop ret -4: b.eq 5f // AES-192? +2: b.eq 3f // AES-192? enc_round CTR, v17 enc_round CTR, v18 -5: enc_round CTR, v19 +3: enc_round CTR, v19 enc_round CTR, v20 - b 2b + b 1b .endm /* -- cgit v1.2.3 From 877ccce7cbe8409256616f5e6bdedb08ce2e82db Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Fri, 3 Aug 2018 13:37:50 +0200 Subject: crypto: x86/aegis,morus - Fix and simplify CPUID checks It turns out I had misunderstood how the x86_match_cpu() function works. It evaluates a logical OR of the matching conditions, not logical AND. This caused the CPU feature checks for AEGIS to pass even if only SSE2 (but not AES-NI) was supported (or vice versa), leading to potential crashes if something tried to use the registered algs. This patch switches the checks to a simpler method that is used e.g. in the Camellia x86 code. The patch also removes the MODULE_DEVICE_TABLE declarations which actually seem to cause the modules to be auto-loaded at boot, which is not desired. The crypto API on-demand module loading is sufficient. Fixes: 1d373d4e8e15 ("crypto: x86 - Add optimized AEGIS implementations") Fixes: 6ecc9d9ff91f ("crypto: x86 - Add optimized MORUS implementations") Signed-off-by: Ondrej Mosnacek Tested-by: Milan Broz Signed-off-by: Herbert Xu --- arch/x86/crypto/aegis128-aesni-glue.c | 12 ++++-------- arch/x86/crypto/aegis128l-aesni-glue.c | 12 ++++-------- arch/x86/crypto/aegis256-aesni-glue.c | 12 ++++-------- arch/x86/crypto/morus1280-avx2-glue.c | 10 +++------- arch/x86/crypto/morus1280-sse2-glue.c | 10 +++------- arch/x86/crypto/morus640-sse2-glue.c | 10 +++------- 6 files changed, 21 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 5de7c0d46edf..acd11b3bf639 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128_aesni_alg, diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 876e4866e633..2071c3d1ae07 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128l_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128l_aesni_alg, diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 2b5dd3af8f4d..b5f2a8fd5a71 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis256_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis256_aesni_alg, diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index f111f36d26dc..6634907d6ccd 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); -static const struct x86_cpu_id avx2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AVX2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); - static int __init crypto_morus1280_avx2_module_init(void) { - if (!x86_match_cpu(avx2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_avx2_algs, diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 839270aa713c..95cf857d2cbb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus1280_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_sse2_algs, diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 26b47e2db8d2..615fb7bc9a32 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus640_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus640_sse2_algs, -- cgit v1.2.3 From 66509a276c8c1d19ee3f661a41b418d101c57d29 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 28 Jul 2018 11:47:17 +0200 Subject: parisc: Enable CONFIG_MLONGCALLS by default Enable the -mlong-calls compiler option by default, because otherwise in most cases linking the vmlinux binary fails due to truncations of R_PARISC_PCREL22F relocations. This fixes building the 64-bit defconfig. Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 17526bebcbd2..46f656b8fc23 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -195,7 +195,7 @@ config PREFETCH config MLONGCALLS bool "Enable the -mlong-calls compiler option for big kernels" - def_bool y if (!MODULES) + default y depends on PA8X00 help If you configure the kernel to include many drivers built-in instead -- cgit v1.2.3 From fedb8da96355f5f64353625bf96dc69423ad1826 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sun, 5 Aug 2018 13:30:31 -0400 Subject: parisc: Define mb() and add memory barriers to assembler unlock sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For years I thought all parisc machines executed loads and stores in order. However, Jeff Law recently indicated on gcc-patches that this is not correct. There are various degrees of out-of-order execution all the way back to the PA7xxx processor series (hit-under-miss). The PA8xxx series has full out-of-order execution for both integer operations, and loads and stores. This is described in the following article: http://web.archive.org/web/20040214092531/http://www.cpus.hp.com/technical_references/advperf.shtml For this reason, we need to define mb() and to insert a memory barrier before the store unlocking spinlocks. This ensures that all memory accesses are complete prior to unlocking. The ldcw instruction performs the same function on entry. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Helge Deller --- arch/parisc/include/asm/barrier.h | 32 ++++++++++++++++++++++++++++++++ arch/parisc/kernel/entry.S | 2 ++ arch/parisc/kernel/pacache.S | 1 + arch/parisc/kernel/syscall.S | 4 ++++ 4 files changed, 39 insertions(+) create mode 100644 arch/parisc/include/asm/barrier.h (limited to 'arch') diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h new file mode 100644 index 000000000000..dbaaca84f27f --- /dev/null +++ b/arch/parisc/include/asm/barrier.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#ifndef __ASSEMBLY__ + +/* The synchronize caches instruction executes as a nop on systems in + which all memory references are performed in order. */ +#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory") + +#if defined(CONFIG_SMP) +#define mb() do { synchronize_caches(); } while (0) +#define rmb() mb() +#define wmb() mb() +#define dma_rmb() mb() +#define dma_wmb() mb() +#else +#define mb() barrier() +#define rmb() barrier() +#define wmb() barrier() +#define dma_rmb() barrier() +#define dma_wmb() barrier() +#endif + +#define __smp_mb() mb() +#define __smp_rmb() mb() +#define __smp_wmb() mb() + +#include + +#endif /* !__ASSEMBLY__ */ +#endif /* __ASM_BARRIER_H */ diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index e95207c0565e..1b4732e20137 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -481,6 +481,8 @@ /* Release pa_tlb_lock lock without reloading lock address. */ .macro tlb_unlock0 spc,tmp #ifdef CONFIG_SMP + or,COND(=) %r0,\spc,%r0 + sync or,COND(=) %r0,\spc,%r0 stw \spc,0(\tmp) #endif diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 22e6374ece44..97451e67d35b 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -353,6 +353,7 @@ ENDPROC_CFI(flush_data_cache_local) .macro tlb_unlock la,flags,tmp #ifdef CONFIG_SMP ldi 1,\tmp + sync stw \tmp,0(\la) mtsm \flags #endif diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index e775f80ae28c..4886a6db42e9 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -633,6 +633,7 @@ cas_action: sub,<> %r28, %r25, %r0 2: stw,ma %r24, 0(%r26) /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG /* Clear thread register indicator */ @@ -647,6 +648,7 @@ cas_action: 3: /* Error occurred on load or store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG stw %r0, 4(%sr2,%r20) @@ -848,6 +850,7 @@ cas2_action: cas2_end: /* Free lock */ + sync stw,ma %r20, 0(%sr2,%r20) /* Enable interrupts */ ssm PSW_SM_I, %r0 @@ -858,6 +861,7 @@ cas2_end: 22: /* Error occurred on load or store */ /* Free lock */ + sync stw %r20, 0(%sr2,%r20) ssm PSW_SM_I, %r0 ldo 1(%r0),%r28 -- cgit v1.2.3