author    | Heiko Carstens <hca@linux.ibm.com> | 2024-02-03 11:45:22 +0100
committer | Heiko Carstens <hca@linux.ibm.com> | 2024-02-16 14:30:17 +0100
commit    | cb2a1dd589a0ce97429bf2beeb560e5b030c2ccc
tree      | cb2ffaa05e7442ba1f4dfb305dcdb7bfef87d463
parent    | s390/checksum: provide and use cksm() inline assembly
s390/checksum: provide vector register variant of csum_partial()
Provide a faster variant of csum_partial() which uses vector registers
instead of the cksm instruction.
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
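
For context on what the patch's vector loop computes: the VCKSM (VECTOR CHECKSUM) instruction accumulates 32-bit words with end-around carry, the primitive behind the internet checksum. Below is a minimal user-space sketch of the equivalent scalar computation; the function names are illustrative, not kernel API, and exact equivalence with the patch assumes a big-endian host, which s390 is.

```c
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* 32-bit ones' complement addition: add, then fold the carry back in. */
static uint32_t add32_with_carry(uint32_t sum, uint32_t val)
{
	uint64_t t = (uint64_t)sum + val;

	return (uint32_t)t + (uint32_t)(t >> 32);
}

/*
 * Scalar model of the accumulation pattern used by the vectorized
 * csum_partial(): sum all 32-bit words, zero-padding a partial tail
 * the way VECTOR LOAD WITH LENGTH does.
 */
static uint32_t csum_scalar(const uint8_t *buf, size_t len, uint32_t sum)
{
	uint32_t w;

	while (len >= 4) {
		memcpy(&w, buf, 4);
		sum = add32_with_carry(sum, w);
		buf += 4;
		len -= 4;
	}
	if (len) {
		w = 0;
		memcpy(&w, buf, len);	/* big endian: tail lands in the high bytes */
		sum = add32_with_carry(sum, w);
	}
	return sum;
}
```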
-rw-r--r-- | arch/s390/include/asm/checksum.h     | 17
-rw-r--r-- | arch/s390/include/asm/fpu-insn-asm.h | 19
-rw-r--r-- | arch/s390/include/asm/fpu-insn.h     | 99
-rw-r--r-- | arch/s390/include/asm/fpu-types.h    |  4
-rw-r--r-- | arch/s390/lib/Makefile               |  1
-rw-r--r-- | arch/s390/lib/csum-partial.c         | 63
6 files changed, 187 insertions(+), 16 deletions(-)
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 414264b3ed6c..00095cc20afa 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -30,22 +30,7 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)
 	return sum;
 }
 
-/*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit).
- *
- * Returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic.
- *
- * This function must be called with even lengths, except
- * for the last fragment, which may be odd.
- *
- * It's best to have buff aligned on a 32-bit boundary.
- */
-static inline __wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return cksm(buff, len, sum);
-}
+__wsum csum_partial(const void *buff, int len, __wsum sum);
 
 /*
  * Fold a partial checksum without adding pseudo headers.
diff --git a/arch/s390/include/asm/fpu-insn-asm.h b/arch/s390/include/asm/fpu-insn-asm.h
index 789d626599ee..aaf42c513a21 100644
--- a/arch/s390/include/asm/fpu-insn-asm.h
+++ b/arch/s390/include/asm/fpu-insn-asm.h
@@ -521,6 +521,15 @@
 	VMRL	\vr1, \vr2, \vr3, 3
 .endm
 
+/* VECTOR LOAD WITH LENGTH */
+.macro	VLL	v, gr, disp, base
+	VX_NUM	v1, \v
+	GR_NUM	b2, \base
+	GR_NUM	r3, \gr
+	.word	0xE700 | ((v1&15) << 4) | r3
+	.word	(b2 << 12) | (\disp)
+	MRXBOPC	0, 0x37, v1
+.endm
 
 /* Vector integer instructions */
 
@@ -534,6 +543,16 @@
 	MRXBOPC	0, 0x68, v1, v2, v3
 .endm
 
+/* VECTOR CHECKSUM */
+.macro	VCKSM	vr1, vr2, vr3
+	VX_NUM	v1, \vr1
+	VX_NUM	v2, \vr2
+	VX_NUM	v3, \vr3
+	.word	0xE700 | ((v1&15) << 4) | (v2&15)
+	.word	((v3&15) << 12)
+	MRXBOPC	0, 0x66, v1, v2, v3
+.endm
+
 /* VECTOR EXCLUSIVE OR */
 .macro	VX	vr1, vr2, vr3
 	VX_NUM	v1, \vr1
diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h
index 803ce4e2aab4..7e9997fa45d3 100644
--- a/arch/s390/include/asm/fpu-insn.h
+++ b/arch/s390/include/asm/fpu-insn.h
@@ -108,6 +108,89 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
 		: "memory");
 }
 
+static __always_inline void fpu_vcksm(u8 v1, u8 v2, u8 v3)
+{
+	asm volatile("VCKSM	%[v1],%[v2],%[v3]"
+		     :
+		     : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+		     : "memory");
+}
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vl(u8 v1, const void *vxr)
+{
+	instrument_read(vxr, sizeof(__vector128));
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VL	%[v1],0,,1\n"
+		:
+		: [vxr] "R" (*(__vector128 *)vxr),
+		  [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vl(u8 v1, const void *vxr)
+{
+	instrument_read(vxr, sizeof(__vector128));
+	asm volatile("VL	%[v1],%O[vxr],,%R[vxr]\n"
+		     :
+		     : [vxr] "Q" (*(__vector128 *)vxr),
+		       [v1] "I" (v1)
+		     : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+static __always_inline u64 fpu_vlgvf(u8 v, u16 index)
+{
+	u64 val;
+
+	asm volatile("VLGVF	%[val],%[v],%[index]"
+		     : [val] "=d" (val)
+		     : [v] "I" (v), [index] "L" (index)
+		     : "memory");
+	return val;
+}
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_read(vxr, size);
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VLL	%[v1],%[index],0,1\n"
+		:
+		: [vxr] "R" (*(u8 *)vxr),
+		  [index] "d" (index),
+		  [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_read(vxr, size);
+	asm volatile("VLL	%[v1],%[index],%O[vxr],%R[vxr]\n"
+		     :
+		     : [vxr] "Q" (*(u8 *)vxr),
+		       [index] "d" (index),
+		       [v1] "I" (v1)
+		     : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
 #ifdef CONFIG_CC_IS_CLANG
 
 #define fpu_vlm(_v1, _v3, _vxrs)					\
@@ -148,6 +231,14 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
 
 #endif /* CONFIG_CC_IS_CLANG */
 
+static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)
+{
+	asm volatile("VLVGF	%[v],%[val],%[index]"
+		     :
+		     : [v] "I" (v), [val] "d" (val), [index] "L" (index)
+		     : "memory");
+}
+
 #ifdef CONFIG_CC_IS_CLANG
 
 #define fpu_vstm(_v1, _v3, _vxrs)					\
@@ -186,5 +277,13 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
 
 #endif /* CONFIG_CC_IS_CLANG */
 
+static __always_inline void fpu_vzero(u8 v)
+{
+	asm volatile("VZERO	%[v]"
+		     :
+		     : [v] "I" (v)
+		     : "memory");
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_S390_FPU_INSN_H */
diff --git a/arch/s390/include/asm/fpu-types.h b/arch/s390/include/asm/fpu-types.h
index 04c32b9fc849..8d58d5a95399 100644
--- a/arch/s390/include/asm/fpu-types.h
+++ b/arch/s390/include/asm/fpu-types.h
@@ -32,12 +32,16 @@ struct kernel_fpu_##vxr_size {						\
 	__vector128 vxrs[vxr_size] __aligned(8);			\
 }
 
+KERNEL_FPU_STRUCT(8);
 KERNEL_FPU_STRUCT(16);
 KERNEL_FPU_STRUCT(32);
 
 #define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name)			\
 	struct kernel_fpu_##vxr_size name __uninitialized
 
+#define DECLARE_KERNEL_FPU_ONSTACK8(name)				\
+	DECLARE_KERNEL_FPU_ONSTACK(8, name)
+
 #define DECLARE_KERNEL_FPU_ONSTACK16(name)				\
 	DECLARE_KERNEL_FPU_ONSTACK(16, name)
 
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 7c50eca85ca4..90eac15ea62a 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -4,6 +4,7 @@
 #
 
 lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
+lib-y += csum-partial.o
 obj-y += mem.o xor.o
 lib-$(CONFIG_KPROBES) += probes.o
 lib-$(CONFIG_UPROBES) += probes.o
diff --git a/arch/s390/lib/csum-partial.c b/arch/s390/lib/csum-partial.c
new file mode 100644
index 000000000000..3ea009cbc3b7
--- /dev/null
+++ b/arch/s390/lib/csum-partial.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <asm/checksum.h>
+#include <asm/fpu.h>
+
+/*
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit).
+ *
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic.
+ *
+ * This function must be called with even lengths, except
+ * for the last fragment, which may be odd.
+ *
+ * It's best to have buff aligned on a 64-bit boundary.
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	DECLARE_KERNEL_FPU_ONSTACK8(vxstate);
+
+	if (!cpu_has_vx())
+		return cksm(buff, len, sum);
+	kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
+	fpu_vlvgf(16, (__force u32)sum, 1);
+	fpu_vzero(17);
+	fpu_vzero(18);
+	fpu_vzero(19);
+	while (len >= 64) {
+		fpu_vlm(20, 23, buff);
+		fpu_vcksm(16, 20, 16);
+		fpu_vcksm(17, 21, 17);
+		fpu_vcksm(18, 22, 18);
+		fpu_vcksm(19, 23, 19);
+		buff += 64;
+		len -= 64;
+	}
+	while (len >= 32) {
+		fpu_vlm(20, 21, buff);
+		fpu_vcksm(16, 20, 16);
+		fpu_vcksm(17, 21, 17);
+		buff += 32;
+		len -= 32;
+	}
+	while (len >= 16) {
+		fpu_vl(20, buff);
+		fpu_vcksm(16, 20, 16);
+		buff += 16;
+		len -= 16;
+	}
+	if (len) {
+		fpu_vll(20, len - 1, buff);
+		fpu_vcksm(16, 20, 16);
+	}
+	fpu_vcksm(18, 19, 18);
+	fpu_vcksm(16, 17, 16);
+	fpu_vcksm(16, 18, 16);
+	sum = (__force __wsum)fpu_vlgvf(16, 1);
+	kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
+	return sum;
+}
+EXPORT_SYMBOL(csum_partial);
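
A note on the structure of the new csum_partial(): the 64-byte loop keeps four independent accumulators (V16 through V19) so the four VCKSM operations per iteration do not form a single serial dependency chain; they are combined pairwise (18+19, 16+17, then 16+18) only after the loops, and the 32-bit result is extracted from word element 1 of V16. Callers typically fold the returned 32-bit partial sum down to the final 16-bit internet checksum. The sketch below shows that generic fold; the kernel's csum_fold() performs the same math, but this is an illustration, not the s390 implementation verbatim.

```c
#include <stdint.h>

/* Fold a 32-bit partial checksum to 16 bits: add the halves with
 * end-around carry, then take the ones' complement. */
static uint16_t fold_csum32(uint32_t sum)
{
	sum = (sum >> 16) + (sum & 0xffff);	/* may produce a new carry */
	sum += sum >> 16;			/* fold that carry back in */
	return (uint16_t)~sum;
}
```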