9 files changed, 990 insertions, 74 deletions
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 13e6a2829116..4d49dff721a8 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -13,7 +13,11 @@ endif
 
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
-obj-$(CONFIG_CRC32) += crc32.o
+obj-$(CONFIG_CRC32_ARCH) += crc32-arm64.o
+crc32-arm64-y := crc32.o crc32-glue.o
+
+obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-arm64.o
+crc-t10dif-arm64-y := crc-t10dif-glue.o crc-t10dif-core.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index ebde40e7fa2b..bd6f7d5eb6eb 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -15,6 +15,19 @@
  *	x0 - dest
  */
 SYM_FUNC_START(__pi_clear_page)
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+	b	.Lno_mops
+alternative_else_nop_endif
+
+	mov	x1, #PAGE_SIZE
+	setpn	[x0]!, x1!, xzr
+	setmn	[x0]!, x1!, xzr
+	seten	[x0]!, x1!, xzr
+	ret
+.Lno_mops:
+#endif
 	mrs	x1, dczid_el0
 	tbnz	x1, #4, 2f	/* Branch if DC ZVA is prohibited */
 	and	w1, w1, #0xf
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 6a56d7cf309d..e6374e7e5511 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -18,6 +18,19 @@
  *	x1 - src
  */
 SYM_FUNC_START(__pi_copy_page)
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+	b	.Lno_mops
+alternative_else_nop_endif
+
+	mov	x2, #PAGE_SIZE
+	cpypwn	[x0]!, [x1]!, x2!
+	cpymwn	[x0]!, [x1]!, x2!
+	cpyewn	[x0]!, [x1]!, x2!
+	ret
+.Lno_mops:
+#endif
 	ldp	x2, x3, [x1]
 	ldp	x4, x5, [x1, #16]
 	ldp	x6, x7, [x1, #32]
diff --git a/arch/arm64/lib/crc-t10dif-core.S b/arch/arm64/lib/crc-t10dif-core.S
new file mode 100644
index 000000000000..87dd6d46224d
--- /dev/null
+++ b/arch/arm64/lib/crc-t10dif-core.S
@@ -0,0 +1,469 @@
+//
+// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+//
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+//          Eric Biggers <ebiggers@google.com>
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License version 2 as
+// published by the Free Software Foundation.
+//
+
+// Derived from the x86 version:
+//
+// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+//
+// Copyright (c) 2013, Intel Corporation
+//
+// Authors:
+//     Erdinc Ozturk <erdinc.ozturk@intel.com>
+//     Vinodh Gopal <vinodh.gopal@intel.com>
+//     James Guilford <james.guilford@intel.com>
+//     Tim Chen <tim.c.chen@linux.intel.com>
+//
+// This software is available to you under a choice of one of two
+// licenses.  You may choose to be licensed under the terms of the GNU
+// General Public License (GPL) Version 2, available from the file
+// COPYING in the main directory of this source tree, or the
+// OpenIB.org BSD license below:
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the
+//   distribution.
+//
+// * Neither the name of the Intel Corporation nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+//
+// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//       Reference paper titled "Fast CRC Computation for Generic
+//	Polynomials Using PCLMULQDQ Instruction"
+//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
+//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+//
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a+crypto
+
+	init_crc	.req	w0	// initial CRC value (first argument)
+	buf		.req	x1	// data pointer, advanced as input is consumed
+	len		.req	x2	// byte count, adjusted as the loops run
+	fold_consts_ptr	.req	x5	// cursor into the fold constant tables
+
+	fold_consts	.req	v10	// the pair of fold constants currently in use
+
+	t3		.req	v17	// t3-t8: temporaries of the 8-bit (p8) multiply
+	t4		.req	v18
+	t5		.req	v19
+	t6		.req	v20
+	t7		.req	v21
+	t8		.req	v22
+
+	perm		.req	v27	// TBL index vector used by pmull16x64_p8
+
+	.macro		pmull16x64_p64, a16, b64, c64
+	pmull2		\c64\().1q, \a16\().2d, \b64\().2d	// c64 = product of the two high 64-bit lanes
+	pmull		\b64\().1q, \a16\().1d, \b64\().1d	// b64 = product of the two low lanes (b64 is overwritten)
+	.endm
+
+	/*
+	 * Pairwise long polynomial multiplication of two 16-bit values
+	 *
+	 *   { w0, w1 }, { y0, y1 }
+	 *
+	 * by two 64-bit values
+	 *
+	 *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+	 *
+	 * where each vector element is a byte, ordered from least to most
+	 * significant.
+	 *
+	 * This can be implemented using 8x8 long polynomial multiplication, by
+	 * reorganizing the input so that each pairwise 8x8 multiplication
+	 * produces one of the terms from the decomposition below, and
+	 * combining the results of each rank and shifting them into place.
+	 *
+	 * Rank
+	 *  0            w0*x0 ^              |        y0*z0 ^
+	 *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+	 *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+	 *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+	 *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+	 *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+	 *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+	 *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+	 *  8            w1*x7      << 64     |        y1*z7      << 64
+	 *
+	 * The inputs can be reorganized into
+	 *
+	 *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+	 *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+	 *
+	 * and after performing 8x8->16 bit long polynomial multiplication of
+	 * each of the halves of the first vector with those of the second one,
+	 * we obtain the following four vectors of 16-bit elements:
+	 *
+	 *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+	 *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+	 *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+	 *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+	 *
+	 * Results b and c can be XORed together, as the vector elements have
+	 * matching ranks. Then, the final XOR (*) can be pulled forward, and
+	 * applied between the halves of each of the remaining three vectors,
+	 * which are then shifted into place, and combined to produce two
+	 * 80-bit results.
+	 *
+	 * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+	 * to the 64x64 bit one above, but XOR'ing the outputs together will
+	 * produce the expected result, and this is sufficient in the context of
+	 * this algorithm.
+	 */
+	.macro		pmull16x64_p8, a16, b64, c64
+	ext		t7.16b, \b64\().16b, \b64\().16b, #1	// t7 = b64 rotated down by one byte
+	tbl		t5.16b, {\a16\().16b}, perm.16b		// t5 = bytes of a16 selected by perm
+	uzp1		t7.16b, \b64\().16b, t7.16b		// t7 = even bytes of b64, then its odd bytes
+	bl		__pmull_p8_16x64			// helper call; writes the link register x30
+	ext		\b64\().16b, t4.16b, t4.16b, #15	// b64 = t4 rotated up by one byte
+	eor		\c64\().16b, t8.16b, t5.16b		// c64 = t8 ^ t5
+	.endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+	ext		t6.16b, t5.16b, t5.16b, #8	// t6 = t5 with its 64-bit halves swapped
+	// Inputs are t5 and t7 as prepared by pmull16x64_p8; a..d are named as in the comment above.
+	pmull		t3.8h, t7.8b, t5.8b		// a = even bytes x w0/y0
+	pmull		t4.8h, t7.8b, t6.8b		// c = even bytes x w1/y1
+	pmull2		t5.8h, t7.16b, t5.16b		// d = odd bytes x w1/y1
+	pmull2		t6.8h, t7.16b, t6.16b		// b = odd bytes x w0/y0
+
+	ext		t8.16b, t3.16b, t3.16b, #8	// t8 = a with halves swapped
+	eor		t4.16b, t4.16b, t6.16b		// t4 = b ^ c (elements of equal rank)
+	ext		t7.16b, t5.16b, t5.16b, #8	// t7 = d with halves swapped
+	ext		t6.16b, t4.16b, t4.16b, #8	// t6 = (b ^ c) with halves swapped
+	eor		t8.8b, t8.8b, t3.8b		// t8 = XOR of the two halves of a
+	eor		t5.8b, t5.8b, t7.8b		// t5 = XOR of the two halves of d
+	eor		t4.8b, t4.8b, t6.8b		// t4 = XOR of the two halves of b ^ c
+	ext		t5.16b, t5.16b, t5.16b, #14	// rotate t5 up by two bytes
+	ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
+
+	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
+	// into reg1, reg2.  Uses v8, v9, v11 and v12 as scratch and advances buf.
+	.macro		fold_32_bytes, p, reg1, reg2
+	ldp		q11, q12, [buf], #0x20		// load the next 32 data bytes; buf += 32
+
+	pmull16x64_\p	fold_consts, \reg1, v8
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	pmull16x64_\p	fold_consts, \reg2, v9
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+	// rev64 followed by ext #8 reverses all 16 bytes of each new data vector.
+	eor		\reg1\().16b, \reg1\().16b, v8.16b	// combine both partial products ...
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b	// ... then XOR in the new data
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	// Fold src_reg into dst_reg, optionally loading the next fold constants
+	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
+	pmull16x64_\p	fold_consts, \src_reg, v8
+	.ifnb		\load_next_consts
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16	// takes effect for the next fold
+	.endif
+	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b	// dst ^= one partial product
+	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b	// dst ^= the other (left in src_reg)
+	.endm
+
+	.macro		crc_t10dif_pmull, p
+	// Contract: in init_crc, buf, len (len >= 16); out v7 = folded 128-bit value.
+	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+	cmp		len, #256
+	b.lt		.Lless_than_256_bytes_\@
+
+	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+	// Load the first 128 data bytes.  Byte swapping is necessary to make
+	// the bit order match the polynomial coefficient order.
+	ldp		q0, q1, [buf]
+	ldp		q2, q3, [buf, #0x20]
+	ldp		q4, q5, [buf, #0x40]
+	ldp		q6, q7, [buf, #0x60]
+	add		buf, buf, #0x80
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	rev64		v1.16b, v1.16b			)
+CPU_LE(	rev64		v2.16b, v2.16b			)
+CPU_LE(	rev64		v3.16b, v3.16b			)
+CPU_LE(	rev64		v4.16b, v4.16b			)
+CPU_LE(	rev64		v5.16b, v5.16b			)
+CPU_LE(	rev64		v6.16b, v6.16b			)
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
+CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
+CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
+CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
+CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
+CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v8.16b, #0
+	mov		v8.h[7], init_crc		// most significant halfword of the vector
+	eor		v0.16b, v0.16b, v8.16b
+
+	// Load the constants for folding across 128 bytes.
+	ld1		{fold_consts.2d}, [fold_consts_ptr]	// no post-increment here
+
+	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
+	// 128 to simplify the termination condition of the following loop.
+	sub		len, len, #256
+
+	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
+	// bytes v0-v7 into them, storing the result back into v0-v7.
+.Lfold_128_bytes_loop_\@:
+	fold_32_bytes	\p, v0, v1
+	fold_32_bytes	\p, v2, v3
+	fold_32_bytes	\p, v4, v5
+	fold_32_bytes	\p, v6, v7
+
+	subs		len, len, #128
+	b.ge		.Lfold_128_bytes_loop_\@
+
+	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
+
+	// Fold across 64 bytes.
+	add		fold_consts_ptr, fold_consts_ptr, #16	// step over the 128-byte constants
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+	fold_16_bytes	\p, v0, v4
+	fold_16_bytes	\p, v1, v5
+	fold_16_bytes	\p, v2, v6
+	fold_16_bytes	\p, v3, v7, 1		// also loads the fold-across-32-bytes constants
+	// Fold across 32 bytes.
+	fold_16_bytes	\p, v4, v6
+	fold_16_bytes	\p, v5, v7, 1		// also loads the fold-across-16-bytes constants
+	// Fold across 16 bytes.
+	fold_16_bytes	\p, v6, v7
+
+	// Add 128 to get the correct number of data bytes remaining in 0...127
+	// (not counting v7), following the previous extra subtraction by 128.
+	// Then subtract 16 to simplify the termination condition of the
+	// following loop.
+	adds		len, len, #(128-16)
+
+	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
+	// into them, storing the result back into v7.
+	b.lt		.Lfold_16_bytes_loop_done_\@
+.Lfold_16_bytes_loop_\@:
+	pmull16x64_\p	fold_consts, v7, v8
+	eor		v7.16b, v7.16b, v8.16b
+	ldr		q0, [buf], #16
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+	eor		v7.16b, v7.16b, v0.16b
+	subs		len, len, #16
+	b.ge		.Lfold_16_bytes_loop_\@
+
+.Lfold_16_bytes_loop_done_\@:
+	// Add 16 to get the correct number of data bytes remaining in 0...15
+	// (not counting v7), following the previous extra subtraction by 16.
+	adds		len, len, #16
+	b.eq		.Lreduce_final_16_bytes_\@
+
+.Lhandle_partial_segment_\@:
+	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
+	// do this without needing a fold constant for each possible 'len',
+	// redivide the bytes into a first chunk of 'len' bytes and a second
+	// chunk of 16 bytes, then fold the first chunk into the second.
+
+	// v0 = last 16 original data bytes
+	add		buf, buf, len
+	ldr		q0, [buf, #-16]		// overlaps data that was already consumed
+CPU_LE(	rev64		v0.16b, v0.16b			)
+CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
+
+	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
+	adr_l		x4, .Lbyteshift_table + 16
+	sub		x4, x4, len
+	ld1		{v2.16b}, [x4]
+	tbl		v1.16b, {v7.16b}, v2.16b
+
+	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
+	movi		v3.16b, #0x80
+	eor		v2.16b, v2.16b, v3.16b
+	tbl		v3.16b, {v7.16b}, v2.16b
+
+	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+	sshr		v2.16b, v2.16b, #7
+
+	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
+	// then '16-len' bytes from v1 (high-order bytes).
+	bsl		v2.16b, v1.16b, v0.16b
+
+	// Fold the first chunk into the second chunk, storing the result in v7.
+	pmull16x64_\p	fold_consts, v3, v0
+	eor		v7.16b, v3.16b, v0.16b
+	eor		v7.16b, v7.16b, v2.16b
+	b		.Lreduce_final_16_bytes_\@
+
+.Lless_than_256_bytes_\@:
+	// Checksumming a buffer of length 16...255 bytes
+
+	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+	// Load the first 16 data bytes.
+	ldr		q7, [buf], #0x10
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	movi		v0.16b, #0
+	mov		v0.h[7], init_crc
+	eor		v7.16b, v7.16b, v0.16b
+
+	// Load the fold-across-16-bytes constants.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+
+	cmp		len, #16
+	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
+	subs		len, len, #32
+	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
+	add		len, len, #16
+	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:	// result is in v7; execution continues after the macro
+	.endm
+
+//
+// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len, u8 out[16]);
+//
+// Assumes len >= 16.  Writes the folded 16-byte value to out (x3), not a CRC.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p8)
+	frame_push	1		// presumably because the p8 multiply uses bl (clobbers x30)
+
+	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+	movi		perm.4h, #8, lsl #8		// low 8 bytes = { 0,8, 0,8, 0,8, 0,8 }
+	orr		perm.2s, #1, lsl #16		// low 8 bytes = { 0,8,1,8, 0,8,1,8 }
+	orr		perm.2s, #1, lsl #24		// low 8 bytes = { 0,8,1,9, 0,8,1,9 }
+	zip1		perm.16b, perm.16b, perm.16b	// duplicate every byte ...
+	zip1		perm.16b, perm.16b, perm.16b	// ... and again, giving runs of four
+
+	crc_t10dif_pmull p8
+
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	str		q7, [x3]		// x3 = out (4th argument)
+
+	frame_pop
+	ret
+SYM_FUNC_END(crc_t10dif_pmull_p8)
+
+	.align		5
+//
+// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+SYM_FUNC_START(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64
+
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+	movi		v2.16b, #0		// init zero register
+
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64
+
+	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]		// zero high 32 bits
+	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b		// + low bits
+
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+
+	// Use Barrett reduction to compute the final CRC value.
+	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32		// /= x^32
+	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+	umov		w0, v0.h[0]			// w0 = the u16 return value
+	ret
+SYM_FUNC_END(crc_t10dif_pmull_p64)
+
+	.section	".rodata", "a"
+	.align		4
+
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
+	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
+// .Lfold_across_64_bytes_consts:
+	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
+	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
+// .Lfold_across_32_bytes_consts:
+	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
+	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
+.Lfold_across_16_bytes_consts:
+	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
+	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
+// .Lfinal_fold_consts:
+	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
+	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+	.quad		0x0000000000018bb7	// G(x)
+	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0
diff --git a/arch/arm64/lib/crc-t10dif-glue.c b/arch/arm64/lib/crc-t10dif-glue.c
new file mode 100644
index 000000000000..dab7e3796232
--- /dev/null
+++ b/arch/arm64/lib/crc-t10dif-glue.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
+ *
+ * Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/crc-t10dif.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <crypto/internal/simd.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static DEFINE_STATIC_KEY_FALSE(have_asimd);	/* enabled at init if ASIMD is present */
+static DEFINE_STATIC_KEY_FALSE(have_pmull);	/* enabled at init if PMULL is present too */
+
+#define CRC_T10DIF_PMULL_CHUNK_SIZE	16U	/* shortest input handed to the asm code */
+
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+				    u8 out[16]);	/* out = folded 16 bytes, not a CRC */
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
+
+/* CRC-T10DIF of data[0..length), continued from crc; SIMD-accelerated when usable. */
+u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
+{
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {	/* asm needs >= 16 bytes */
+		if (static_branch_likely(&have_pmull)) {
+			if (crypto_simd_usable()) {
+				kernel_neon_begin();
+				crc = crc_t10dif_pmull_p64(crc, data, length);
+				kernel_neon_end();
+				return crc;
+			}
+		} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
+			   static_branch_likely(&have_asimd) &&
+			   crypto_simd_usable()) {
+			u8 buf[16];	/* folded value written by the p8 routine */
+
+			kernel_neon_begin();
+			crc_t10dif_pmull_p8(crc, data, length, buf);
+			kernel_neon_end();
+
+			/* Reduce here: buf must not be referenced outside this scope. */
+			return crc_t10dif_generic(0, buf, sizeof(buf));
+		}
+	}
+	return crc_t10dif_generic(crc, data, length);
+}
+EXPORT_SYMBOL(crc_t10dif_arch);
+
+static int __init crc_t10dif_arm64_init(void)
+{
+	if (!cpu_have_named_feature(ASIMD))
+		return 0;	/* no ASIMD: both static keys stay disabled */
+	static_branch_enable(&have_asimd);
+	if (cpu_have_named_feature(PMULL))	/* only probed when ASIMD exists */
+		static_branch_enable(&have_pmull);
+	return 0;
+}
+arch_initcall(crc_t10dif_arm64_init);
+
+static void __exit crc_t10dif_arm64_exit(void)
+{	/* Empty: crc_t10dif_arm64_init() only enables static keys. */
+}
+module_exit(crc_t10dif_arm64_exit);
+
+bool crc_t10dif_is_optimized(void)
+{
+	return static_key_enabled(&have_asimd);	/* set by crc_t10dif_arm64_init() */
+}
+EXPORT_SYMBOL(crc_t10dif_is_optimized);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c
new file mode 100644
index 000000000000..15c4c9db573e
--- /dev/null
+++ b/arch/arm64/lib/crc32-glue.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crc32.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#include <crypto/internal/simd.h>
+
+// Minimum input length, in bytes, at which the 4-way interleaved code path is considered
+static const size_t min_len = 1024;
+// Routines built on the scalar CRC32 instructions (crc32.S).
+asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
+// 4-way variants (crc32.S): they process only the whole 64-byte blocks of len.
+asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+
+u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	const size_t bulk = round_down(len, 64);	/* whole 64-byte blocks */
+
+	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+		return crc32_le_base(crc, p, len);
+
+	/* Short input, no PMULL, or SIMD not usable: scalar CRC32 only. */
+	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
+		return crc32_le_arm64(crc, p, len);
+
+	kernel_neon_begin();
+	crc = crc32_le_arm64_4way(crc, p, len);
+	kernel_neon_end();
+	/* The 4-way code covers only whole 64-byte blocks; finish any tail. */
+	if (bulk == len)
+		return crc;
+	return crc32_le_arm64(crc, p + bulk, len - bulk);
+}
+EXPORT_SYMBOL(crc32_le_arch);
+
+u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len)
+{
+	const size_t bulk = round_down(len, 64);	/* whole 64-byte blocks */
+
+	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+		return crc32c_le_base(crc, p, len);
+
+	/* Short input, no PMULL, or SIMD not usable: scalar CRC32C only. */
+	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
+		return crc32c_le_arm64(crc, p, len);
+
+	kernel_neon_begin();
+	crc = crc32c_le_arm64_4way(crc, p, len);
+	kernel_neon_end();
+	/* The 4-way code covers only whole 64-byte blocks; finish any tail. */
+	if (bulk == len)
+		return crc;
+	return crc32c_le_arm64(crc, p + bulk, len - bulk);
+}
+EXPORT_SYMBOL(crc32c_le_arch);
+
+u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len)
+{
+	const size_t bulk = round_down(len, 64);	/* whole 64-byte blocks */
+
+	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+		return crc32_be_base(crc, p, len);
+
+	/* Short input, no PMULL, or SIMD not usable: scalar CRC32 only. */
+	if (len < min_len || !cpu_have_named_feature(PMULL) || !crypto_simd_usable())
+		return crc32_be_arm64(crc, p, len);
+
+	kernel_neon_begin();
+	crc = crc32_be_arm64_4way(crc, p, len);
+	kernel_neon_end();
+	/* The 4-way code covers only whole 64-byte blocks; finish any tail. */
+	if (bulk == len)
+		return crc;
+	return crc32_be_arm64(crc, p + bulk, len - bulk);
+}
+EXPORT_SYMBOL(crc32_be_arch);
+
+u32 crc32_optimizations(void)
+{
+	/* Without the CRC32 capability none of the variants is accelerated. */
+	if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+		return 0;
+
+	return CRC32_LE_OPTIMIZATION | CRC32_BE_OPTIMIZATION | CRC32C_OPTIMIZATION;
+}
+EXPORT_SYMBOL(crc32_optimizations);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("arm64-optimized CRC32 functions");
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 8340dccff46f..68825317460f 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -1,54 +1,60 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Accelerated CRC32(C) using AArch64 CRC instructions
+ * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
  *
- * Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2016 - 2018 Linaro Ltd.
+ * Copyright (C) 2024 Google LLC
+ *
+ * Author: Ard Biesheuvel <ardb@kernel.org>
  */
 
 #include <linux/linkage.h>
-#include <asm/alternative.h>
 #include <asm/assembler.h>
 
-	.arch		armv8-a+crc
+	.cpu		generic+crc+crypto
 
-	.macro		byteorder, reg, be
-	.if		\be
-CPU_LE( rev		\reg, \reg	)
-	.else
-CPU_BE( rev		\reg, \reg	)
-	.endif
+	.macro		bitle, reg
 	.endm
 
-	.macro		byteorder16, reg, be
-	.if		\be
-CPU_LE( rev16		\reg, \reg	)
-	.else
-CPU_BE( rev16		\reg, \reg	)
-	.endif
+	.macro		bitbe, reg
+	rbit		\reg, \reg
 	.endm
 
-	.macro		bitorder, reg, be
-	.if		\be
-	rbit		\reg, \reg
-	.endif
+	.macro		bytele, reg
 	.endm
 
-	.macro		bitorder16, reg, be
-	.if		\be
+	.macro		bytebe, reg
 	rbit		\reg, \reg
-	lsr		\reg, \reg, #16
-	.endif
+	lsr		\reg, \reg, #24
 	.endm
 
-	.macro		bitorder8, reg, be
-	.if		\be
+	.macro		hwordle, reg
+CPU_BE(	rev16		\reg, \reg	)
+	.endm
+
+	.macro		hwordbe, reg
+CPU_LE(	rev		\reg, \reg	)
 	rbit		\reg, \reg
-	lsr		\reg, \reg, #24
-	.endif
+CPU_BE(	lsr		\reg, \reg, #16	)
+	.endm
+
+	.macro		le, regs:vararg
+	.irp		r, \regs
+CPU_BE(	rev		\r, \r		)
+	.endr
 	.endm
 
-	.macro		__crc32, c, be=0
-	bitorder	w0, \be
+	.macro		be, regs:vararg
+	.irp		r, \regs
+CPU_LE(	rev		\r, \r		)
+	.endr
+	.irp		r, \regs
+	rbit		\r, \r
+	.endr
+	.endm
+
+	.macro		__crc32, c, order=le
+	bit\order	w0
 	cmp		x2, #16
 	b.lt		8f			// less than 16 bytes
 
@@ -61,14 +67,7 @@ CPU_BE( rev16		\reg, \reg	)
 	add		x8, x8, x1
 	add		x1, x1, x7
 	ldp		x5, x6, [x8]
-	byteorder	x3, \be
-	byteorder	x4, \be
-	byteorder	x5, \be
-	byteorder	x6, \be
-	bitorder	x3, \be
-	bitorder	x4, \be
-	bitorder	x5, \be
-	bitorder	x6, \be
+	\order		x3, x4, x5, x6
 
 	tst		x7, #8
 	crc32\c\()x	w8, w0, x3
@@ -96,65 +95,268 @@ CPU_BE( rev16		\reg, \reg	)
 32:	ldp		x3, x4, [x1], #32
 	sub		x2, x2, #32
 	ldp		x5, x6, [x1, #-16]
-	byteorder	x3, \be
-	byteorder	x4, \be
-	byteorder	x5, \be
-	byteorder	x6, \be
-	bitorder	x3, \be
-	bitorder	x4, \be
-	bitorder	x5, \be
-	bitorder	x6, \be
+	\order		x3, x4, x5, x6
 	crc32\c\()x	w0, w0, x3
 	crc32\c\()x	w0, w0, x4
 	crc32\c\()x	w0, w0, x5
 	crc32\c\()x	w0, w0, x6
 	cbnz		x2, 32b
-0:	bitorder	w0, \be
+0:	bit\order	w0
 	ret
 
 8:	tbz		x2, #3, 4f
 	ldr		x3, [x1], #8
-	byteorder	x3, \be
-	bitorder	x3, \be
+	\order		x3
 	crc32\c\()x	w0, w0, x3
 4:	tbz		x2, #2, 2f
 	ldr		w3, [x1], #4
-	byteorder	w3, \be
-	bitorder	w3, \be
+	\order		w3
 	crc32\c\()w	w0, w0, w3
 2:	tbz		x2, #1, 1f
 	ldrh		w3, [x1], #2
-	byteorder16	w3, \be
-	bitorder16	w3, \be
+	hword\order	w3
 	crc32\c\()h	w0, w0, w3
 1:	tbz		x2, #0, 0f
 	ldrb		w3, [x1]
-	bitorder8	w3, \be
+	byte\order	w3
 	crc32\c\()b	w0, w0, w3
-0:	bitorder	w0, \be
+0:	bit\order	w0
 	ret
 	.endm
 
 	.align		5
-SYM_FUNC_START(crc32_le)
-alternative_if_not ARM64_HAS_CRC32
-	b		crc32_le_base
-alternative_else_nop_endif
+SYM_FUNC_START(crc32_le_arm64)
 	__crc32
-SYM_FUNC_END(crc32_le)
+SYM_FUNC_END(crc32_le_arm64)
 
 	.align		5
-SYM_FUNC_START(__crc32c_le)
-alternative_if_not ARM64_HAS_CRC32
-	b		__crc32c_le_base
-alternative_else_nop_endif
+SYM_FUNC_START(crc32c_le_arm64)
 	__crc32		c
-SYM_FUNC_END(__crc32c_le)
+SYM_FUNC_END(crc32c_le_arm64)
 
 	.align		5
-SYM_FUNC_START(crc32_be)
-alternative_if_not ARM64_HAS_CRC32
-	b		crc32_be_base
-alternative_else_nop_endif
-	__crc32		be=1
-SYM_FUNC_END(crc32_be)
+SYM_FUNC_START(crc32_be_arm64)
+	__crc32		order=be
+SYM_FUNC_END(crc32_be_arm64)
+
+	in		.req	x1
+	len		.req	x2
+
+	/*
+	 * w0: input CRC at entry, output CRC at exit
+	 * x1: pointer to input buffer
+	 * x2: length of input in bytes
+	 */
+	.macro		crc4way, insn, table, order=le
+	bit\order	w0
+	lsr		len, len, #6		// len := # of 64-byte blocks
+
+	/* Process up to 64 blocks of 64 bytes at a time */
+.La\@:	mov		x3, #64
+	cmp		len, #64
+	csel		x3, x3, len, hi		// x3 := min(len, 64)
+	sub		len, len, x3
+
+	/* Divide the input into 4 contiguous blocks */
+	add		x4, x3, x3, lsl #1	// x4 :=  3 * x3
+	add		x7, in, x3, lsl #4	// x7 := in + 16 * x3
+	add		x8, in, x3, lsl #5	// x8 := in + 32 * x3
+	add		x9, in, x4, lsl #4	// x9 := in + 16 * x4
+
+	/* Load the folding coefficients from the lookup table */
+	adr_l		x5, \table - 12		// entry 0 omitted
+	add		x5, x5, x4, lsl #2	// x5 += 12 * x3
+	ldp		s0, s1, [x5]
+	ldr		s2, [x5, #8]
+
+	/* Zero init partial CRCs for this iteration */
+	mov		w4, wzr
+	mov		w5, wzr
+	mov		w6, wzr
+	mov		x17, xzr		// x17 = deferred last word of block 4 (none yet)
+
+.Lb\@:	sub		x3, x3, #1
+	\insn		w6, w6, x17		// consume the word deferred by the previous pass
+	ldp		x10, x11, [in], #16
+	ldp		x12, x13, [x7], #16
+	ldp		x14, x15, [x8], #16
+	ldp		x16, x17, [x9], #16
+
+	\order		x10, x11, x12, x13, x14, x15, x16, x17
+
+	/* Apply the CRC transform to 4 16-byte blocks in parallel */
+	\insn		w0, w0, x10
+	\insn		w4, w4, x12
+	\insn		w5, w5, x14
+	\insn		w6, w6, x16
+	\insn		w0, w0, x11
+	\insn		w4, w4, x13
+	\insn		w5, w5, x15
+	cbnz		x3, .Lb\@
+
+	/* Combine the 4 partial results into w0 */
+	mov		v3.d[0], x0
+	mov		v4.d[0], x4
+	mov		v5.d[0], x5
+	pmull		v0.1q, v0.1d, v3.1d	// partial CRC of block 1 times its coefficient
+	pmull		v1.1q, v1.1d, v4.1d	// same for block 2
+	pmull		v2.1q, v2.1d, v5.1d	// same for block 3
+	eor		v0.8b, v0.8b, v1.8b
+	eor		v0.8b, v0.8b, v2.8b
+	mov		x5, v0.d[0]
+	eor		x5, x5, x17		// merge with the still-pending last word of block 4
+	\insn		w0, w6, x5
+
+	mov		in, x9			// x9 was advanced past block 4 by the loop
+	cbnz		len, .La\@
+
+	bit\order	w0
+	ret
+	.endm
+
+	.align		5
+SYM_FUNC_START(crc32c_le_arm64_4way)
+	crc4way		crc32cx, .L0		// CRC32C steps, coefficient table .L0
+SYM_FUNC_END(crc32c_le_arm64_4way)
+
+	.align		5
+SYM_FUNC_START(crc32_le_arm64_4way)
+	crc4way		crc32x, .L1		// CRC32 steps, coefficient table .L1
+SYM_FUNC_END(crc32_le_arm64_4way)
+
+	.align		5
+SYM_FUNC_START(crc32_be_arm64_4way)
+	crc4way		crc32x, .L1, be		// same steps and table, big-endian bit/byte order
+SYM_FUNC_END(crc32_be_arm64_4way)
+
+	.section	.rodata, "a", %progbits
+	.align		6
+.L0:	.long		0xddc0152b, 0xba4fc28e, 0x493c7d27
+	.long		0x0715ce53, 0x9e4addf8, 0xba4fc28e
+	.long		0xc96cfdc0, 0x0715ce53, 0xddc0152b
+	.long		0xab7aff2a, 0x0d3b6092, 0x9e4addf8
+	.long		0x299847d5, 0x878a92a7, 0x39d3b296
+	.long		0xb6dd949b, 0xab7aff2a, 0x0715ce53
+	.long		0xa60ce07b, 0x83348832, 0x47db8317
+	.long		0xd270f1a2, 0xb9e02b86, 0x0d3b6092
+	.long		0x65863b64, 0xb6dd949b, 0xc96cfdc0
+	.long		0xb3e32c28, 0xbac2fd7b, 0x878a92a7
+	.long		0xf285651c, 0xce7f39f4, 0xdaece73e
+	.long		0x271d9844, 0xd270f1a2, 0xab7aff2a
+	.long		0x6cb08e5c, 0x2b3cac5d, 0x2162d385
+	.long		0xcec3662e, 0x1b03397f, 0x83348832
+	.long		0x8227bb8a, 0xb3e32c28, 0x299847d5
+	.long		0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
+	.long		0xf6076544, 0x10746f3c, 0x18b33a4e
+	.long		0x98d8d9cb, 0x271d9844, 0xb6dd949b
+	.long		0x57a3d037, 0x93a5f730, 0x78d9ccb7
+	.long		0x3771e98f, 0x6b749fb2, 0xbac2fd7b
+	.long		0xe0ac139e, 0xcec3662e, 0xa60ce07b
+	.long		0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
+	.long		0xa2b73df1, 0xb0cd4768, 0x61d82e56
+	.long		0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
+	.long		0xa90fd27a, 0x0167d312, 0xc619809d
+	.long		0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
+	.long		0x4597456a, 0x98d8d9cb, 0x65863b64
+	.long		0xc9c8b782, 0x68bce87a, 0x1b03397f
+	.long		0x62ec6c6d, 0x6956fc3b, 0xebb883bd
+	.long		0x2342001e, 0x3771e98f, 0xb3e32c28
+	.long		0xe8b6368b, 0x2178513a, 0x064f7f26
+	.long		0x9ef68d35, 0x170076fa, 0xdd7e3b0c
+	.long		0x0b0bf8ca, 0x6f345e45, 0xf285651c
+	.long		0x02ee03b2, 0xff0dba97, 0x10746f3c
+	.long		0x135c83fd, 0xf872e54c, 0xc7a68855
+	.long		0x00bcf5f6, 0x86d8e4d2, 0x271d9844
+	.long		0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
+	.long		0xded288f8, 0xb3af077a, 0x93a5f730
+	.long		0x37170390, 0xca6ef3ac, 0x6cb08e5c
+	.long		0xf48642e9, 0xdd66cbbb, 0x6b749fb2
+	.long		0xb25b29f2, 0xe9e28eb4, 0x1393e203
+	.long		0x45cddf4e, 0xc9c8b782, 0xcec3662e
+	.long		0xdfd94fb2, 0x93e106a4, 0x96c515bb
+	.long		0x021ac5ef, 0xd813b325, 0xe6fc4e6a
+	.long		0x8e1450f7, 0x2342001e, 0x8227bb8a
+	.long		0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
+	.long		0x613eee91, 0xd2c3ed1a, 0x39c7ff35
+	.long		0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
+	.long		0x0cd1526a, 0xf2271e60, 0x0ab3844b
+	.long		0xd6c3a807, 0x2664fd8b, 0x0167d312
+	.long		0x1d31175f, 0x02ee03b2, 0xf6076544
+	.long		0x4be7fd90, 0x363bd6b3, 0x26f6a60a
+	.long		0x6eeed1c9, 0x5fabe670, 0xa741c1bf
+	.long		0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
+	.long		0x2e7d11a7, 0x17f27698, 0x49c3cc9c
+	.long		0x889774e1, 0xaa7c7ad5, 0x68bce87a
+	.long		0x8a074012, 0xded288f8, 0x57a3d037
+	.long		0xbd0bb25f, 0x6d390dec, 0x6956fc3b
+	.long		0x3be3c09b, 0x6353c1cc, 0x42d98888
+	.long		0x465a4eee, 0xf48642e9, 0x3771e98f
+	.long		0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
+	.long		0xa52f58ec, 0x9a5ede41, 0x2178513a
+	.long		0x47972100, 0x45cddf4e, 0xe0ac139e
+	.long		0x359674f7, 0xa51b6135, 0x170076fa
+
+.L1:	.long		0xaf449247, 0x81256527, 0xccaa009e
+	.long		0x57c54819, 0x1d9513d7, 0x81256527
+	.long		0x3f41287a, 0x57c54819, 0xaf449247
+	.long		0xf5e48c85, 0x910eeec1, 0x1d9513d7
+	.long		0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
+	.long		0x71d54a59, 0xf5e48c85, 0x57c54819
+	.long		0x1c63267b, 0xfe807bbd, 0x0cbec0ed
+	.long		0xd31343ea, 0xe95c1271, 0x910eeec1
+	.long		0xf9d9c7ee, 0x71d54a59, 0x3f41287a
+	.long		0x9ee62949, 0xcec97417, 0x9026d5b1
+	.long		0xa55d1514, 0xf183c71b, 0xd1df2327
+	.long		0x21aa2b26, 0xd31343ea, 0xf5e48c85
+	.long		0x9d842b80, 0xeea395c4, 0x3c656ced
+	.long		0xd8110ff1, 0xcd669a40, 0xfe807bbd
+	.long		0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
+	.long		0x1d6708a0, 0x0c30f51d, 0xe95c1271
+	.long		0xef82aa68, 0xdb3935ea, 0xb918a347
+	.long		0xd14bcc9b, 0x21aa2b26, 0x71d54a59
+	.long		0x99cce860, 0x356d209f, 0xff6f2fc2
+	.long		0xd8af8e46, 0xc352f6de, 0xcec97417
+	.long		0xf1996890, 0xd8110ff1, 0x1c63267b
+	.long		0x631bc508, 0xe95c7216, 0xf183c71b
+	.long		0x8511c306, 0x8e031a19, 0x9b9bdbd0
+	.long		0xdb3839f3, 0x1d6708a0, 0xd31343ea
+	.long		0x7a92fffb, 0xf7003835, 0x4470ac44
+	.long		0x6ce68f2a, 0x00eba0c8, 0xeea395c4
+	.long		0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
+	.long		0xb46f7cff, 0x9a1b53c8, 0xcd669a40
+	.long		0x60290934, 0x81b6f443, 0x6d40f445
+	.long		0x8e976a7d, 0xd8af8e46, 0x9ee62949
+	.long		0xdcf5088a, 0x9dbdc100, 0x145575d5
+	.long		0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
+	.long		0x255b139e, 0x631bc508, 0xa55d1514
+	.long		0xd784eaa8, 0xce26786c, 0xdb3935ea
+	.long		0x6d2c864a, 0x8068c345, 0x2586d334
+	.long		0x02072e24, 0xdb3839f3, 0x21aa2b26
+	.long		0x06689b0a, 0x5efd72f5, 0xe0575528
+	.long		0x1e52f5ea, 0x4117915b, 0x356d209f
+	.long		0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
+	.long		0x3796455c, 0xb8e0e4a8, 0xc352f6de
+	.long		0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
+	.long		0x28ae0976, 0xb46f7cff, 0xd8110ff1
+	.long		0x9764bc8d, 0xd7e7a22c, 0x712510f0
+	.long		0x13a13e18, 0x3e9a43cd, 0xe95c7216
+	.long		0xb8ee242e, 0x8e976a7d, 0x3f9e9356
+	.long		0x0c540e7b, 0x753c81ff, 0x8e031a19
+	.long		0x9924c781, 0xb9220208, 0x3edcde65
+	.long		0x3954de39, 0x1753ab84, 0x1d6708a0
+	.long		0xf32238b5, 0xbec81497, 0x9e70b943
+	.long		0xbbd2cd2c, 0x0925d861, 0xf7003835
+	.long		0xcc401304, 0xd784eaa8, 0xef82aa68
+	.long		0x4987e684, 0x6044fbb0, 0x00eba0c8
+	.long		0x3aa11427, 0x18fe3b4a, 0x87441142
+	.long		0x297aad60, 0x02072e24, 0xd14bcc9b
+	.long		0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
+	.long		0x632d78c5, 0x3fc33de4, 0x9a1b53c8
+	.long		0x25b8822a, 0x1e52f5ea, 0x99cce860
+	.long		0xd4fc84bc, 0x1af62fb8, 0x81b6f443
+	.long		0x5690aa32, 0xa91fdefb, 0x688a110e
+	.long		0x1357a093, 0x3796455c, 0xd8af8e46
+	.long		0x798fdd33, 0xaaa18a37, 0x357b9517
+	.long		0xc2815395, 0x54d42691, 0x9dbdc100
+	.long		0x21cfc0f7, 0x28ae0976, 0xf1996890
+	.long		0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 4ab48d49c451..9b99106fb95f 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -57,7 +57,7 @@
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstin]
 	ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops	/* let the assembler accept the CPY* instructions */
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+	b	__pi_memcpy_generic	/* replaced by a NOP when ARM64_HAS_MOPS is set */
+alternative_else_nop_endif
+
+	mov	dst, dstin		/* the CPY* forms below write back to dst, src, count */
+	cpyp	[dst]!, [src]!, count!	/* copy prologue */
+	cpym	[dst]!, [src]!, count!	/* copy main */
+	cpye	[dst]!, [src]!, count!	/* copy epilogue */
+	ret
 SYM_FUNC_END(__pi_memcpy)
+#else	/* !CONFIG_AS_HAS_MOPS */
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif	/* CONFIG_AS_HAS_MOPS */
 
 SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
 EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a5aebe82ad73..97157da65ec6 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -26,6 +26,7 @@
  */
 
 dstin		.req	x0
+val_x		.req	x1
 val		.req	w1
 count		.req	x2
 tmp1		.req	x3
@@ -42,7 +43,7 @@ dst		.req	x8
 tmp3w		.req	w9
 tmp3		.req	x9
 
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
 	mov	dst, dstin	/* Preserve return value.  */
 	and	A_lw, val, #255
 	orr	A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
 	ands	count, count, zva_bits_x
 	b.ne	.Ltail_maybe_long
 	ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops	/* let the assembler accept the SET* instructions */
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+	b	__pi_memset_generic	/* replaced by a NOP when ARM64_HAS_MOPS is set */
+alternative_else_nop_endif
+
+	mov	dst, dstin		/* keep dstin (x0) intact; SET* writes back to dst and count */
+	setp	[dst]!, count!, val_x	/* set prologue */
+	setm	[dst]!, count!, val_x	/* set main */
+	sete	[dst]!, count!, val_x	/* set epilogue */
+	ret
 SYM_FUNC_END(__pi_memset)
+#else	/* !CONFIG_AS_HAS_MOPS */
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif	/* CONFIG_AS_HAS_MOPS */
 
 SYM_FUNC_ALIAS(__memset, __pi_memset)
 EXPORT_SYMBOL(__memset)