diff options
Diffstat (limited to 'arch/arm64/lib')
-rw-r--r-- | arch/arm64/lib/Makefile | 6 | ||||
-rw-r--r-- | arch/arm64/lib/clear_page.S | 13 | ||||
-rw-r--r-- | arch/arm64/lib/copy_page.S | 13 | ||||
-rw-r--r-- | arch/arm64/lib/crc-t10dif-core.S | 469 | ||||
-rw-r--r-- | arch/arm64/lib/crc-t10dif-glue.c | 81 | ||||
-rw-r--r-- | arch/arm64/lib/crc32-glue.c | 99 | ||||
-rw-r--r-- | arch/arm64/lib/crc32.S | 344 | ||||
-rw-r--r-- | arch/arm64/lib/memcpy.S | 19 | ||||
-rw-r--r-- | arch/arm64/lib/memset.S | 20 |
9 files changed, 990 insertions, 74 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 13e6a2829116..4d49dff721a8 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -13,7 +13,11 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o +crc32-arm64-y := crc32.o crc32-glue.o + +obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o +crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index ebde40e7fa2b..bd6f7d5eb6eb 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -15,6 +15,19 @@ * x0 - dest */ SYM_FUNC_START(__pi_clear_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x1, #PAGE_SIZE + setpn [x0]!, x1!, xzr + setmn [x0]!, x1!, xzr + seten [x0]!, x1!, xzr + ret +.Lno_mops: +#endif mrs x1, dczid_el0 tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */ and w1, w1, #0xf diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 6a56d7cf309d..e6374e7e5511 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,19 @@ * x1 - src */ SYM_FUNC_START(__pi_copy_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x2, #PAGE_SIZE + cpypwn [x0]!, [x1]!, x2! + cpymwn [x0]!, [x1]!, x2! + cpyewn [x0]!, [x1]!, x2! + ret +.Lno_mops: +#endif ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S new file mode 100644 index 000000000000..87dd6d46224d --- /dev/null +++ b/arch/arm64/lib/crc-t10dif-core.S @@ -0,0 +1,469 @@ +// +// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions +// +// Copyright (C) 2016 Linaro Ltd +// Copyright (C) 2019-2024 Google LLC +// +// Authors: Ard Biesheuvel <ardb@google.com> +// Eric Biggers <ebiggers@google.com> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License version 2 as +// published by the Free Software Foundation. +// + +// Derived from the x86 version: +// +// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +// +// Copyright (c) 2013, Intel Corporation +// +// Authors: +// Erdinc Ozturk <erdinc.ozturk@intel.com> +// Vinodh Gopal <vinodh.gopal@intel.com> +// James Guilford <james.guilford@intel.com> +// Tim Chen <tim.c.chen@linux.intel.com> +// +// This software is available to you under a choice of one of two +// licenses. You may choose to be licensed under the terms of the GNU +// General Public License (GPL) Version 2, available from the file +// COPYING in the main directory of this source tree, or the +// OpenIB.org BSD license below: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// * Neither the name of the Intel Corporation nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// +// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Reference paper titled "Fast CRC Computation for Generic +// Polynomials Using PCLMULQDQ Instruction" +// URL: http://www.intel.com/content/dam/www/public/us/en/documents +// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +// + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + init_crc .req w0 + buf .req x1 + len .req x2 + fold_consts_ptr .req x5 + + fold_consts .req v10 + + t3 .req v17 + t4 .req v18 + t5 .req v19 + t6 .req v20 + t7 .req v21 + t8 .req v22 + + perm .req v27 + + .macro pmull16x64_p64, a16, b64, c64 + pmull2 \c64\().1q, \a16\().2d, \b64\().2d + pmull \b64\().1q, \a16\().1d, \b64\().1d + .endm + + /* + * Pairwise long polynomial multiplication of two 16-bit values + * + * { w0, w1 }, { y0, y1 } + * + * by two 64-bit values + * + * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 } + * + * where each vector element is a byte, ordered from least to most + * significant. + * + * This can be implemented using 8x8 long polynomial multiplication, by + * reorganizing the input so that each pairwise 8x8 multiplication + * produces one of the terms from the decomposition below, and + * combining the results of each rank and shifting them into place. + * + * Rank + * 0 w0*x0 ^ | y0*z0 ^ + * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^ + * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^ + * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^ + * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^ + * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^ + * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^ + * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^ + * 8 w1*x7 << 64 | y1*z7 << 64 + * + * The inputs can be reorganized into + * + * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } + * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 } + * + * and after performing 8x8->16 bit long polynomial multiplication of + * each of the halves of the first vector with those of the second one, + * we obtain the following four vectors of 16-bit elements: + * + * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 } + * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 } + * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 } + * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 } + * + * Results b and c can be XORed together, as the vector elements have + * matching ranks. Then, the final XOR (*) can be pulled forward, and + * applied between the halves of each of the remaining three vectors, + * which are then shifted into place, and combined to produce two + * 80-bit results. + * + * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent + * to the 64x64 bit one above, but XOR'ing the outputs together will + * produce the expected result, and this is sufficient in the context of + * this algorithm. + */ + .macro pmull16x64_p8, a16, b64, c64 + ext t7.16b, \b64\().16b, \b64\().16b, #1 + tbl t5.16b, {\a16\().16b}, perm.16b + uzp1 t7.16b, \b64\().16b, t7.16b + bl __pmull_p8_16x64 + ext \b64\().16b, t4.16b, t4.16b, #15 + eor \c64\().16b, t8.16b, t5.16b + .endm + +SYM_FUNC_START_LOCAL(__pmull_p8_16x64) + ext t6.16b, t5.16b, t5.16b, #8 + + pmull t3.8h, t7.8b, t5.8b + pmull t4.8h, t7.8b, t6.8b + pmull2 t5.8h, t7.16b, t5.16b + pmull2 t6.8h, t7.16b, t6.16b + + ext t8.16b, t3.16b, t3.16b, #8 + eor t4.16b, t4.16b, t6.16b + ext t7.16b, t5.16b, t5.16b, #8 + ext t6.16b, t4.16b, t4.16b, #8 + eor t8.8b, t8.8b, t3.8b + eor t5.8b, t5.8b, t7.8b + eor t4.8b, t4.8b, t6.8b + ext t5.16b, t5.16b, t5.16b, #14 + ret +SYM_FUNC_END(__pmull_p8_16x64) + + + // Fold reg1, reg2 into the next 32 data bytes, storing the result back + // into reg1, reg2. + .macro fold_32_bytes, p, reg1, reg2 + ldp q11, q12, [buf], #0x20 + + pmull16x64_\p fold_consts, \reg1, v8 + +CPU_LE( rev64 v11.16b, v11.16b ) +CPU_LE( rev64 v12.16b, v12.16b ) + + pmull16x64_\p fold_consts, \reg2, v9 + +CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) +CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) + + eor \reg1\().16b, \reg1\().16b, v8.16b + eor \reg2\().16b, \reg2\().16b, v9.16b + eor \reg1\().16b, \reg1\().16b, v11.16b + eor \reg2\().16b, \reg2\().16b, v12.16b + .endm + + // Fold src_reg into dst_reg, optionally loading the next fold constants + .macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts + pmull16x64_\p fold_consts, \src_reg, v8 + .ifnb \load_next_consts + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + .endif + eor \dst_reg\().16b, \dst_reg\().16b, v8.16b + eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b + .endm + + .macro crc_t10dif_pmull, p + + // For sizes less than 256 bytes, we can't fold 128 bytes at a time. + cmp len, #256 + b.lt .Lless_than_256_bytes_\@ + + adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts + + // Load the first 128 data bytes. Byte swapping is necessary to make + // the bit order match the polynomial coefficient order. + ldp q0, q1, [buf] + ldp q2, q3, [buf, #0x20] + ldp q4, q5, [buf, #0x40] + ldp q6, q7, [buf, #0x60] + add buf, buf, #0x80 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( rev64 v1.16b, v1.16b ) +CPU_LE( rev64 v2.16b, v2.16b ) +CPU_LE( rev64 v3.16b, v3.16b ) +CPU_LE( rev64 v4.16b, v4.16b ) +CPU_LE( rev64 v5.16b, v5.16b ) +CPU_LE( rev64 v6.16b, v6.16b ) +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) +CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) +CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 ) +CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 ) +CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 ) +CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 ) +CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v8.16b, #0 + mov v8.h[7], init_crc + eor v0.16b, v0.16b, v8.16b + + // Load the constants for folding across 128 bytes. + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Subtract 128 for the 128 data bytes just consumed. Subtract another + // 128 to simplify the termination condition of the following loop. + sub len, len, #256 + + // While >= 128 data bytes remain (not counting v0-v7), fold the 128 + // bytes v0-v7 into them, storing the result back into v0-v7. +.Lfold_128_bytes_loop_\@: + fold_32_bytes \p, v0, v1 + fold_32_bytes \p, v2, v3 + fold_32_bytes \p, v4, v5 + fold_32_bytes \p, v6, v7 + + subs len, len, #128 + b.ge .Lfold_128_bytes_loop_\@ + + // Now fold the 112 bytes in v0-v6 into the 16 bytes in v7. + + // Fold across 64 bytes. + add fold_consts_ptr, fold_consts_ptr, #16 + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + fold_16_bytes \p, v0, v4 + fold_16_bytes \p, v1, v5 + fold_16_bytes \p, v2, v6 + fold_16_bytes \p, v3, v7, 1 + // Fold across 32 bytes. + fold_16_bytes \p, v4, v6 + fold_16_bytes \p, v5, v7, 1 + // Fold across 16 bytes. + fold_16_bytes \p, v6, v7 + + // Add 128 to get the correct number of data bytes remaining in 0...127 + // (not counting v7), following the previous extra subtraction by 128. + // Then subtract 16 to simplify the termination condition of the + // following loop. + adds len, len, #(128-16) + + // While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7 + // into them, storing the result back into v7. + b.lt .Lfold_16_bytes_loop_done_\@ +.Lfold_16_bytes_loop_\@: + pmull16x64_\p fold_consts, v7, v8 + eor v7.16b, v7.16b, v8.16b + ldr q0, [buf], #16 +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + eor v7.16b, v7.16b, v0.16b + subs len, len, #16 + b.ge .Lfold_16_bytes_loop_\@ + +.Lfold_16_bytes_loop_done_\@: + // Add 16 to get the correct number of data bytes remaining in 0...15 + // (not counting v7), following the previous extra subtraction by 16. + adds len, len, #16 + b.eq .Lreduce_final_16_bytes_\@ + +.Lhandle_partial_segment_\@: + // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first + // 16 bytes are in v7 and the rest are the remaining data in 'buf'. To + // do this without needing a fold constant for each possible 'len', + // redivide the bytes into a first chunk of 'len' bytes and a second + // chunk of 16 bytes, then fold the first chunk into the second. + + // v0 = last 16 original data bytes + add buf, buf, len + ldr q0, [buf, #-16] +CPU_LE( rev64 v0.16b, v0.16b ) +CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 ) + + // v1 = high order part of second chunk: v7 left-shifted by 'len' bytes. + adr_l x4, .Lbyteshift_table + 16 + sub x4, x4, len + ld1 {v2.16b}, [x4] + tbl v1.16b, {v7.16b}, v2.16b + + // v3 = first chunk: v7 right-shifted by '16-len' bytes. + movi v3.16b, #0x80 + eor v2.16b, v2.16b, v3.16b + tbl v3.16b, {v7.16b}, v2.16b + + // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes. + sshr v2.16b, v2.16b, #7 + + // v2 = second chunk: 'len' bytes from v0 (low-order bytes), + // then '16-len' bytes from v1 (high-order bytes). + bsl v2.16b, v1.16b, v0.16b + + // Fold the first chunk into the second chunk, storing the result in v7. + pmull16x64_\p fold_consts, v3, v0 + eor v7.16b, v3.16b, v0.16b + eor v7.16b, v7.16b, v2.16b + b .Lreduce_final_16_bytes_\@ + +.Lless_than_256_bytes_\@: + // Checksumming a buffer of length 16...255 bytes + + adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts + + // Load the first 16 data bytes. + ldr q7, [buf], #0x10 +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + + // XOR the first 16 data *bits* with the initial CRC value. + movi v0.16b, #0 + mov v0.h[7], init_crc + eor v7.16b, v7.16b, v0.16b + + // Load the fold-across-16-bytes constants. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + cmp len, #16 + b.eq .Lreduce_final_16_bytes_\@ // len == 16 + subs len, len, #32 + b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255 + add len, len, #16 + b .Lhandle_partial_segment_\@ // 17 <= len <= 31 + +.Lreduce_final_16_bytes_\@: + .endm + +// +// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p8) + frame_push 1 + + // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } + movi perm.4h, #8, lsl #8 + orr perm.2s, #1, lsl #16 + orr perm.2s, #1, lsl #24 + zip1 perm.16b, perm.16b, perm.16b + zip1 perm.16b, perm.16b, perm.16b + + crc_t10dif_pmull p8 + +CPU_LE( rev64 v7.16b, v7.16b ) +CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) + str q7, [x3] + + frame_pop + ret +SYM_FUNC_END(crc_t10dif_pmull_p8) + + .align 5 +// +// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); +// +// Assumes len >= 16. +// +SYM_FUNC_START(crc_t10dif_pmull_p64) + crc_t10dif_pmull p64 + + // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC. + + movi v2.16b, #0 // init zero register + + // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. + ld1 {fold_consts.2d}, [fold_consts_ptr], #16 + + // Fold the high 64 bits into the low 64 bits, while also multiplying by + // x^64. This produces a 128-bit value congruent to x^64 * M(x) and + // whose low 48 bits are 0. + ext v0.16b, v2.16b, v7.16b, #8 + pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x)) + eor v0.16b, v0.16b, v7.16b // + low bits * x^64 + + // Fold the high 32 bits into the low 96 bits. This produces a 96-bit + // value congruent to x^64 * M(x) and whose low 48 bits are 0. + ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits + mov v0.s[3], v2.s[0] // zero high 32 bits + pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x)) + eor v0.16b, v0.16b, v1.16b // + low bits + + // Load G(x) and floor(x^48 / G(x)). + ld1 {fold_consts.2d}, [fold_consts_ptr] + + // Use Barrett reduction to compute the final CRC value. + pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x)) + ushr v1.2d, v1.2d, #32 // /= x^32 + pmull v1.1q, v1.1d, fold_consts.1d // *= G(x) + ushr v0.2d, v0.2d, #48 + eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits + // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0. + + umov w0, v0.h[0] + ret +SYM_FUNC_END(crc_t10dif_pmull_p64) + + .section ".rodata", "a" + .align 4 + +// Fold constants precomputed from the polynomial 0x18bb7 +// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +.Lfold_across_128_bytes_consts: + .quad 0x0000000000006123 // x^(8*128) mod G(x) + .quad 0x0000000000002295 // x^(8*128+64) mod G(x) +// .Lfold_across_64_bytes_consts: + .quad 0x0000000000001069 // x^(4*128) mod G(x) + .quad 0x000000000000dd31 // x^(4*128+64) mod G(x) +// .Lfold_across_32_bytes_consts: + .quad 0x000000000000857d // x^(2*128) mod G(x) + .quad 0x0000000000007acc // x^(2*128+64) mod G(x) +.Lfold_across_16_bytes_consts: + .quad 0x000000000000a010 // x^(1*128) mod G(x) + .quad 0x0000000000001faa // x^(1*128+64) mod G(x) +// .Lfinal_fold_consts: + .quad 0x1368000000000000 // x^48 * (x^48 mod G(x)) + .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x)) +// .Lbarrett_reduction_consts: + .quad 0x0000000000018bb7 // G(x) + .quad 0x00000001f65a57f8 // floor(x^48 / G(x)) + +// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - +// len] is the index vector to shift left by 'len' bytes, and is also {0x80, +// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes. +.Lbyteshift_table: + .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 + .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f + .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 + .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c new file mode 100644 index 000000000000..dab7e3796232 --- /dev/null +++ b/arch/arm64/lib/crc-t10dif-glue.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions + * + * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/cpufeature.h> +#include <linux/crc-t10dif.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <crypto/internal/simd.h> + +#include <asm/neon.h> +#include <asm/simd.h> + +static DEFINE_STATIC_KEY_FALSE(have_asimd); +static DEFINE_STATIC_KEY_FALSE(have_pmull); + +#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U + +asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, + u8 out[16]); +asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len); + +u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length) +{ + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) { + if (static_branch_likely(&have_pmull)) { + if (crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc_t10dif_pmull_p64(crc, data, length); + kernel_neon_end(); + return crc; + } + } else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && + static_branch_likely(&have_asimd) && + crypto_simd_usable()) { + u8 buf[16]; + + kernel_neon_begin(); + crc_t10dif_pmull_p8(crc, data, length, buf); + kernel_neon_end(); + + crc = 0; + data = buf; + length = sizeof(buf); + } + } + return crc_t10dif_generic(crc, data, length); +} +EXPORT_SYMBOL(crc_t10dif_arch); + +static int __init crc_t10dif_arm64_init(void) +{ + if (cpu_have_named_feature(ASIMD)) { + static_branch_enable(&have_asimd); + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); + } + return 0; +} +arch_initcall(crc_t10dif_arm64_init); + +static void __exit crc_t10dif_arm64_exit(void) +{ +} +module_exit(crc_t10dif_arm64_exit); + +bool crc_t10dif_is_optimized(void) +{ + return static_key_enabled(&have_asimd); +} +EXPORT_SYMBOL(crc_t10dif_is_optimized); + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c new file mode 100644 index 000000000000..15c4c9db573e --- /dev/null +++ b/arch/arm64/lib/crc32-glue.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/crc32.h> +#include <linux/linkage.h> +#include <linux/module.h> + +#include <asm/alternative.h> +#include <asm/cpufeature.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#include <crypto/internal/simd.h> + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + +u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_le_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32_le_arch); + +u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32c_le_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32c_le_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32c_le_arch); + +u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + + return crc32_be_arm64(crc, p, len); +} +EXPORT_SYMBOL(crc32_be_arch); + +u32 crc32_optimizations(void) +{ + if (alternative_has_cap_likely(ARM64_HAS_CRC32)) + return CRC32_LE_OPTIMIZATION | + CRC32_BE_OPTIMIZATION | + CRC32C_OPTIMIZATION; + return 0; +} +EXPORT_SYMBOL(crc32_optimizations); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("arm64-optimized CRC32 functions"); diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 8340dccff46f..68825317460f 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -1,54 +1,60 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Accelerated CRC32(C) using AArch64 CRC instructions + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions * - * Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2016 - 2018 Linaro Ltd. + * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel <ardb@kernel.org> */ #include <linux/linkage.h> -#include <asm/alternative.h> #include <asm/assembler.h> - .arch armv8-a+crc + .cpu generic+crc+crypto - .macro byteorder, reg, be - .if \be -CPU_LE( rev \reg, \reg ) - .else -CPU_BE( rev \reg, \reg ) - .endif + .macro bitle, reg .endm - .macro byteorder16, reg, be - .if \be -CPU_LE( rev16 \reg, \reg ) - .else -CPU_BE( rev16 \reg, \reg ) - .endif + .macro bitbe, reg + rbit \reg, \reg .endm - .macro bitorder, reg, be - .if \be - rbit \reg, \reg - .endif + .macro bytele, reg .endm - .macro bitorder16, reg, be - .if \be + .macro bytebe, reg rbit \reg, \reg - lsr \reg, \reg, #16 - .endif + lsr \reg, \reg, #24 .endm - .macro bitorder8, reg, be - .if \be + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) rbit \reg, \reg - lsr \reg, \reg, #24 - .endif +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr .endm - .macro __crc32, c, be=0 - bitorder w0, \be + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 cmp x2, #16 b.lt 8f // less than 16 bytes @@ -61,14 +67,7 @@ CPU_BE( rev16 \reg, \reg ) add x8, x8, x1 add x1, x1, x7 ldp x5, x6, [x8] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 tst x7, #8 crc32\c\()x w8, w0, x3 @@ -96,65 +95,268 @@ CPU_BE( rev16 \reg, \reg ) 32: ldp x3, x4, [x1], #32 sub x2, x2, #32 ldp x5, x6, [x1, #-16] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 crc32\c\()x w0, w0, x3 crc32\c\()x w0, w0, x4 crc32\c\()x w0, w0, x5 crc32\c\()x w0, w0, x6 cbnz x2, 32b -0: bitorder w0, \be +0: bit\order w0 ret 8: tbz x2, #3, 4f ldr x3, [x1], #8 - byteorder x3, \be - bitorder x3, \be + \order x3 crc32\c\()x w0, w0, x3 4: tbz x2, #2, 2f ldr w3, [x1], #4 - byteorder w3, \be - bitorder w3, \be + \order w3 crc32\c\()w w0, w0, w3 2: tbz x2, #1, 1f ldrh w3, [x1], #2 - byteorder16 w3, \be - bitorder16 w3, \be + hword\order w3 crc32\c\()h w0, w0, w3 1: tbz x2, #0, 0f ldrb w3, [x1] - bitorder8 w3, \be + byte\order w3 crc32\c\()b w0, w0, w3 -0: bitorder w0, \be +0: bit\order w0 ret .endm .align 5 -SYM_FUNC_START(crc32_le) -alternative_if_not ARM64_HAS_CRC32 - b crc32_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_le_arm64) __crc32 -SYM_FUNC_END(crc32_le) +SYM_FUNC_END(crc32_le_arm64) .align 5 -SYM_FUNC_START(__crc32c_le) -alternative_if_not ARM64_HAS_CRC32 - b __crc32c_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32c_le_arm64) __crc32 c -SYM_FUNC_END(__crc32c_le) +SYM_FUNC_END(crc32c_le_arm64) .align 5 -SYM_FUNC_START(crc32_be) -alternative_if_not ARM64_HAS_CRC32 - b crc32_be_base -alternative_else_nop_endif - __crc32 be=1 -SYM_FUNC_END(crc32_be) +SYM_FUNC_START(crc32_be_arm64) + __crc32 order=be +SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 4ab48d49c451..9b99106fb95f 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -57,7 +57,7 @@ The loop tail is handled by always copying 64 bytes from the end. */ -SYM_FUNC_START(__pi_memcpy) +SYM_FUNC_START_LOCAL(__pi_memcpy_generic) add srcend, src, count add dstend, dstin, count cmp count, 128 @@ -238,7 +238,24 @@ L(copy64_from_start): stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] ret +SYM_FUNC_END(__pi_memcpy_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memcpy) +alternative_if_not ARM64_HAS_MOPS + b __pi_memcpy_generic +alternative_else_nop_endif + + mov dst, dstin + cpyp [dst]!, [src]!, count! + cpym [dst]!, [src]!, count! + cpye [dst]!, [src]!, count! + ret SYM_FUNC_END(__pi_memcpy) +#else +SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic) +#endif SYM_FUNC_ALIAS(__memcpy, __pi_memcpy) EXPORT_SYMBOL(__memcpy) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index a5aebe82ad73..97157da65ec6 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -26,6 +26,7 @@ */ dstin .req x0 +val_x .req x1 val .req w1 count .req x2 tmp1 .req x3 @@ -42,7 +43,7 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 -SYM_FUNC_START(__pi_memset) +SYM_FUNC_START_LOCAL(__pi_memset_generic) mov dst, dstin /* Preserve return value. */ and A_lw, val, #255 orr A_lw, A_lw, A_lw, lsl #8 @@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset) ands count, count, zva_bits_x b.ne .Ltail_maybe_long ret +SYM_FUNC_END(__pi_memset_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memset) +alternative_if_not ARM64_HAS_MOPS + b __pi_memset_generic +alternative_else_nop_endif + + mov dst, dstin + setp [dst]!, count!, val_x + setm [dst]!, count!, val_x + sete [dst]!, count!, val_x + ret SYM_FUNC_END(__pi_memset) +#else +SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic) +#endif SYM_FUNC_ALIAS(__memset, __pi_memset) EXPORT_SYMBOL(__memset) |