Diffstat (limited to 'arch/arm64/lib')
-rw-r--r--  arch/arm64/lib/Makefile          |   6
-rw-r--r--  arch/arm64/lib/clear_page.S      |   1
-rw-r--r--  arch/arm64/lib/clear_user.S      |   2
-rw-r--r--  arch/arm64/lib/copy_from_user.S  |   4
-rw-r--r--  arch/arm64/lib/copy_in_user.S    |   4
-rw-r--r--  arch/arm64/lib/copy_page.S       |   1
-rw-r--r--  arch/arm64/lib/copy_to_user.S    |   4
-rw-r--r--  arch/arm64/lib/crc32.S           |  54
-rw-r--r--  arch/arm64/lib/memchr.S          |   1
-rw-r--r--  arch/arm64/lib/memcmp.S          |   1
-rw-r--r--  arch/arm64/lib/memcpy.S          |   2
-rw-r--r--  arch/arm64/lib/memmove.S         |   2
-rw-r--r--  arch/arm64/lib/memset.S          |   2
-rw-r--r--  arch/arm64/lib/strchr.S          |   1
-rw-r--r--  arch/arm64/lib/strcmp.S          |   1
-rw-r--r--  arch/arm64/lib/strlen.S          |   1
-rw-r--r--  arch/arm64/lib/strncmp.S         |   1
-rw-r--r--  arch/arm64/lib/strnlen.S         |   1
-rw-r--r--  arch/arm64/lib/strrchr.S         |   1
-rw-r--r--  arch/arm64/lib/tishift.S         |   5
-rw-r--r--  arch/arm64/lib/xor-neon.c        | 184
21 files changed, 271 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 69ff9887f724..5540a1638baf 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,6 +5,12 @@ lib-y		:= clear_user.o delay.o copy_from_user.o	\
 	   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
 	   strchr.o strrchr.o tishift.o
 
+ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
+obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
+CFLAGS_REMOVE_xor-neon.o	+= -mgeneral-regs-only
+CFLAGS_xor-neon.o		+= -ffreestanding
+endif
+
 # Tell the compiler to treat all general purpose registers (with the
 # exception of the IP registers, which are already handled by the caller
 # in case of a PLT) as callee-saved, which allows for efficient runtime
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index ef08e905e35b..6d13b0d64ad5 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -37,3 +37,4 @@ ENTRY(clear_page)
 	b.ne	1b
 	ret
 ENDPROC(clear_page)
+EXPORT_SYMBOL(clear_page)
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index 21ba0b29621b..feb225bd4b80 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -18,6 +18,7 @@
 #include <linux/linkage.h>
 
 #include <asm/asm-uaccess.h>
+#include <asm/assembler.h>
 
 	.text
 
@@ -53,6 +54,7 @@ uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
 	uaccess_disable_not_uao x2, x3
 	ret
 ENDPROC(__arch_clear_user)
+EXPORT_SYMBOL(__arch_clear_user)
 
 	.section .fixup,"ax"
 	.align	2
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 20305d485046..dea6c762d52f 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -16,8 +16,9 @@
 
 #include <linux/linkage.h>
 
-#include <asm/cache.h>
 #include <asm/asm-uaccess.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -71,6 +72,7 @@ ENTRY(__arch_copy_from_user)
 	mov	x0, #0				// Nothing to copy
 	ret
 ENDPROC(__arch_copy_from_user)
+EXPORT_SYMBOL(__arch_copy_from_user)
 
 	.section .fixup,"ax"
 	.align	2
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 54b75deb1d16..a84227fbf716 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -18,8 +18,9 @@
 
 #include <linux/linkage.h>
 
-#include <asm/cache.h>
 #include <asm/asm-uaccess.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy from user space to user space (alignment handled by the hardware)
@@ -73,6 +74,7 @@ ENTRY(__arch_copy_in_user)
 	mov	x0, #0
 	ret
 ENDPROC(__arch_copy_in_user)
+EXPORT_SYMBOL(__arch_copy_in_user)
 
 	.section .fixup,"ax"
 	.align	2
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 076c43715e64..98313e24a987 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -87,3 +87,4 @@ alternative_else_nop_endif
 
 	ret
 ENDPROC(copy_page)
+EXPORT_SYMBOL(copy_page)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index fda6172d6b88..ef44c7ca3ffb 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -16,8 +16,9 @@
 
 #include <linux/linkage.h>
 
-#include <asm/cache.h>
 #include <asm/asm-uaccess.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -70,6 +71,7 @@ ENTRY(__arch_copy_to_user)
 	mov	x0, #0
 	ret
 ENDPROC(__arch_copy_to_user)
+EXPORT_SYMBOL(__arch_copy_to_user)
 
 	.section .fixup,"ax"
 	.align	2
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 5bc1e85b4e1c..f132f2a7522e 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -15,15 +15,59 @@
 	.cpu		generic+crc
 
 	.macro		__crc32, c
-0:	subs		x2, x2, #16
-	b.mi		8f
-	ldp		x3, x4, [x1], #16
+	cmp		x2, #16
+	b.lt		8f		// less than 16 bytes
+
+	and		x7, x2, #0x1f
+	and		x2, x2, #~0x1f
+	cbz		x7, 32f		// multiple of 32 bytes
+
+	and		x8, x7, #0xf
+	ldp		x3, x4, [x1]
+	add		x8, x8, x1
+	add		x1, x1, x7
+	ldp		x5, x6, [x8]
 CPU_BE(	rev		x3, x3		)
 CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
+
+	tst		x7, #8
+	crc32\c\()x	w8, w0, x3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #4
+	lsr		x4, x3, #32
+	crc32\c\()w	w8, w0, w3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #2
+	lsr		w4, w3, #16
+	crc32\c\()h	w8, w0, w3
+	csel		w3, w3, w4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #1
+	crc32\c\()b	w8, w0, w3
+	csel		w0, w0, w8, eq
+	tst		x7, #16
+	crc32\c\()x	w8, w0, x5
+	crc32\c\()x	w8, w8, x6
+	csel		w0, w0, w8, eq
+	cbz		x2, 0f
+
+32:	ldp		x3, x4, [x1], #32
+	sub		x2, x2, #32
+	ldp		x5, x6, [x1, #-16]
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
 	crc32\c\()x	w0, w0, x3
 	crc32\c\()x	w0, w0, x4
-	b.ne		0b
-	ret
+	crc32\c\()x	w0, w0, x5
+	crc32\c\()x	w0, w0, x6
+	cbnz		x2, 32b
+0:	ret
 
 8:	tbz		x2, #3, 4f
 	ldr		x3, [x1], #8
diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S
index 0f164a4baf52..f146b7ecd28f 100644
--- a/arch/arm64/lib/memchr.S
+++ b/arch/arm64/lib/memchr.S
@@ -42,3 +42,4 @@ WEAK(memchr)
 2:	mov	x0, #0
 	ret
 ENDPIPROC(memchr)
+EXPORT_SYMBOL_NOKASAN(memchr)
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index fb295f52e9f8..e2e629b09049 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -256,3 +256,4 @@ CPU_LE( rev	data2, data2 )
 	mov	result, #0
 	ret
 ENDPIPROC(memcmp)
+EXPORT_SYMBOL_NOKASAN(memcmp)
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 67613937711f..b4f82888ed60 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -74,4 +74,6 @@ ENTRY(memcpy)
 #include "copy_template.S"
 	ret
 ENDPIPROC(memcpy)
+EXPORT_SYMBOL(memcpy)
 ENDPROC(__memcpy)
+EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
index a5a4459013b1..ef12f719d99d 100644
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -197,4 +197,6 @@ ENTRY(memmove)
 	b.ne	.Ltail63
 	ret
 ENDPIPROC(memmove)
+EXPORT_SYMBOL(memmove)
 ENDPROC(__memmove)
+EXPORT_SYMBOL(__memmove)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index f2670a9f218c..a79cf118d6d0 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -216,4 +216,6 @@ ENTRY(memset)
 	b.ne	.Ltail_maybe_long
 	ret
 ENDPIPROC(memset)
+EXPORT_SYMBOL(memset)
 ENDPROC(__memset)
+EXPORT_SYMBOL(__memset)
diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S
index 7c83091d1bcd..b179421f46c7 100644
--- a/arch/arm64/lib/strchr.S
+++ b/arch/arm64/lib/strchr.S
@@ -40,3 +40,4 @@ WEAK(strchr)
 	csel	x0, x0, xzr, eq
 	ret
 ENDPROC(strchr)
+EXPORT_SYMBOL_NOKASAN(strchr)
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 7d5d15398bfb..c306c7b88574 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -232,3 +232,4 @@ CPU_BE(	orr	syndrome, diff, has_nul )
 	sub	result, data1, data2, lsr #56
 	ret
 ENDPIPROC(strcmp)
+EXPORT_SYMBOL_NOKASAN(strcmp)
diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S
index 8e0b14205dcb..2a0240937416 100644
--- a/arch/arm64/lib/strlen.S
+++ b/arch/arm64/lib/strlen.S
@@ -124,3 +124,4 @@ CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63). */
 	csel	data2, data2, data2a, le
 	b	.Lrealigned
 ENDPIPROC(strlen)
+EXPORT_SYMBOL_NOKASAN(strlen)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index 66bd145935d9..c5d567afb039 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -308,3 +308,4 @@ CPU_BE(	orr	syndrome, diff, has_nul )
 	mov	result, #0
 	ret
 ENDPIPROC(strncmp)
+EXPORT_SYMBOL_NOKASAN(strncmp)
diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S
index 355be04441fe..e21e536d420e 100644
--- a/arch/arm64/lib/strnlen.S
+++ b/arch/arm64/lib/strnlen.S
@@ -169,3 +169,4 @@ CPU_LE( lsr	tmp2, tmp2, tmp4 )	/* Shift (tmp1 & 63). */
 	mov	len, limit
 	ret
 ENDPIPROC(strnlen)
+EXPORT_SYMBOL_NOKASAN(strnlen)
diff --git a/arch/arm64/lib/strrchr.S b/arch/arm64/lib/strrchr.S
index ea84924d5990..47e1593016dc 100644
--- a/arch/arm64/lib/strrchr.S
+++ b/arch/arm64/lib/strrchr.S
@@ -41,3 +41,4 @@ WEAK(strrchr)
 2:	mov	x0, x3
 	ret
 ENDPIPROC(strrchr)
+EXPORT_SYMBOL_NOKASAN(strrchr)
diff --git a/arch/arm64/lib/tishift.S b/arch/arm64/lib/tishift.S
index 0fdff97794de..047622536535 100644
--- a/arch/arm64/lib/tishift.S
+++ b/arch/arm64/lib/tishift.S
@@ -5,6 +5,8 @@
 
 #include <linux/linkage.h>
 
+#include <asm/assembler.h>
+
 ENTRY(__ashlti3)
 	cbz	x2, 1f
 	mov	x3, #64
@@ -25,6 +27,7 @@ ENTRY(__ashlti3)
 	mov	x0, x2
 	ret
 ENDPROC(__ashlti3)
+EXPORT_SYMBOL(__ashlti3)
 
 ENTRY(__ashrti3)
 	cbz	x2, 1f
@@ -46,6 +49,7 @@ ENTRY(__ashrti3)
 	mov	x1, x2
 	ret
 ENDPROC(__ashrti3)
+EXPORT_SYMBOL(__ashrti3)
 
 ENTRY(__lshrti3)
 	cbz	x2, 1f
@@ -67,3 +71,4 @@ ENTRY(__lshrti3)
 	mov	x1, x2
 	ret
 ENDPROC(__lshrti3)
+EXPORT_SYMBOL(__lshrti3)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
new file mode 100644
index 000000000000..131c60c27dff
--- /dev/null
+++ b/arch/arm64/lib/xor-neon.c
@@ -0,0 +1,184 @@
+/*
+ * arch/arm64/lib/xor-neon.c
+ *
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/raid/xor.h>
+#include <linux/module.h>
+#include <asm/neon-intrinsics.h>
+
+void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
+	unsigned long *p2, unsigned long *p3,
+	unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 */
+		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
+		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
+		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
+		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
+
+		/* p1 ^= p3 */
+		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* p1 ^= p5 */
+		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+struct xor_block_template const xor_block_inner_neon = {
+	.name	= "__inner_neon__",
+	.do_2	= xor_arm64_neon_2,
+	.do_3	= xor_arm64_neon_3,
+	.do_4	= xor_arm64_neon_4,
+	.do_5	= xor_arm64_neon_5,
+};
+EXPORT_SYMBOL(xor_block_inner_neon);
+
+MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
+MODULE_DESCRIPTION("ARMv8 XOR Extensions");
+MODULE_LICENSE("GPL");
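
For context: the exported xor_block_inner_neon template above is only consumed once arch glue hands it to the generic XOR calibration code in crypto/xor.c, and that glue is not part of this diff. The sketch below is a hypothetical illustration (names such as xor_neon_2 and xor_block_arm64 are invented here) of how an arch <asm/xor.h> header might wrap the NEON routines in kernel_neon_begin()/kernel_neon_end(), which is needed because they clobber FP/SIMD registers, and then advertise them through XOR_TRY_TEMPLATES.

/*
 * Hypothetical glue, NOT part of this diff: one way an arch <asm/xor.h>
 * header could expose xor_block_inner_neon to the template benchmarking
 * done in crypto/xor.c.  Wrapper and template names are illustrative.
 */
#include <linux/raid/xor.h>
#include <asm/neon.h>

extern struct xor_block_template const xor_block_inner_neon;

static void xor_neon_2(unsigned long bytes, unsigned long *p1,
		       unsigned long *p2)
{
	/* The NEON routines touch FP/SIMD state, so bracket them. */
	kernel_neon_begin();
	xor_block_inner_neon.do_2(bytes, p1, p2);
	kernel_neon_end();
}

/* xor_neon_3/4/5 wrappers would follow the same pattern. */

static struct xor_block_template xor_block_arm64 = {
	.name	= "arm64_neon",
	.do_2	= xor_neon_2,
	/* .do_3, .do_4, .do_5 set to the remaining wrappers */
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		if (cpu_has_neon())			\
			xor_speed(&xor_block_arm64);	\
	} while (0)

Here xor_speed() is the benchmarking hook that crypto/xor.c invokes when it expands XOR_TRY_TEMPLATES, and cpu_has_neon() stands for the arch's runtime NEON/FP-SIMD availability check; the fastest registered template is then used by xor_blocks() (e.g. for RAID5 parity).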