1 files changed, 276 insertions, 0 deletions
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
new file mode 100644
index 000000000000..a3d2b5e05f5e
--- /dev/null
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,276 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 16
+
+ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define s1 0x00(%rsp)
+#define s2 0x04(%rsp)
+#define s3 0x08(%rsp)
+#define s4 0x0c(%rsp)
+#define m %rsi
+#define h01 %xmm0
+#define h23 %xmm1
+#define h44 %xmm2
+#define t1 %xmm3
+#define t2 %xmm4
+#define t3 %xmm5
+#define t4 %xmm6
+#define mask %xmm7
+#define d0 %r8
+#define d1 %r9
+#define d2 %r10
+#define d3 %r11
+#define d4 %r12
+
+ENTRY(poly1305_block_sse2)
+	# %rdi: Accumulator h[5]
+	# %rsi: 16 byte input block m
+	# %rdx: Poly1305 key r[5]
+	# %rcx: Block count
+
+	# This single block variant tries to improve performance by doing two
+	# multiplications in parallel using SSE instructions. There is quite
+	# some quardword packing involved, hence the speedup is marginal.
+
+	push		%rbx
+	push		%r12
+	sub		$0x10,%rsp
+
+	# s1..s4 = r1..r4 * 5
+	mov		r1,%eax
+	lea		(%eax,%eax,4),%eax
+	mov		%eax,s1
+	mov		r2,%eax
+	lea		(%eax,%eax,4),%eax
+	mov		%eax,s2
+	mov		r3,%eax
+	lea		(%eax,%eax,4),%eax
+	mov		%eax,s3
+	mov		r4,%eax
+	lea		(%eax,%eax,4),%eax
+	mov		%eax,s4
+
+	movdqa		ANMASK(%rip),mask
+
+.Ldoblock:
+	# h01 = [0, h1, 0, h0]
+	# h23 = [0, h3, 0, h2]
+	# h44 = [0, h4, 0, h4]
+	movd		h0,h01
+	movd		h1,t1
+	movd		h2,h23
+	movd		h3,t2
+	movd		h4,h44
+	punpcklqdq	t1,h01
+	punpcklqdq	t2,h23
+	punpcklqdq	h44,h44
+
+	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+	movd		0x00(m),t1
+	movd		0x03(m),t2
+	psrld		$2,t2
+	punpcklqdq	t2,t1
+	pand		mask,t1
+	paddd		t1,h01
+	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+	movd		0x06(m),t1
+	movd		0x09(m),t2
+	psrld		$4,t1
+	psrld		$6,t2
+	punpcklqdq	t2,t1
+	pand		mask,t1
+	paddd		t1,h23
+	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+	mov		0x0c(m),%eax
+	shr		$8,%eax
+	or		$0x01000000,%eax
+	movd		%eax,t1
+	pshufd		$0xc4,t1,t1
+	paddd		t1,h44
+
+	# t1[0] = h0 * r0 + h2 * s3
+	# t1[1] = h1 * s4 + h3 * s2
+	movd		r0,t1
+	movd		s4,t2
+	punpcklqdq	t2,t1
+	pmuludq		h01,t1
+	movd		s3,t2
+	movd		s2,t3
+	punpcklqdq	t3,t2
+	pmuludq		h23,t2
+	paddq		t2,t1
+	# t2[0] = h0 * r1 + h2 * s4
+	# t2[1] = h1 * r0 + h3 * s3
+	movd		r1,t2
+	movd		r0,t3
+	punpcklqdq	t3,t2
+	pmuludq		h01,t2
+	movd		s4,t3
+	movd		s3,t4
+	punpcklqdq	t4,t3
+	pmuludq		h23,t3
+	paddq		t3,t2
+	# t3[0] = h4 * s1
+	# t3[1] = h4 * s2
+	movd		s1,t3
+	movd		s2,t4
+	punpcklqdq	t4,t3
+	pmuludq		h44,t3
+	# d0 = t1[0] + t1[1] + t3[0]
+	# d1 = t2[0] + t2[1] + t3[1]
+	movdqa		t1,t4
+	punpcklqdq	t2,t4
+	punpckhqdq	t2,t1
+	paddq		t4,t1
+	paddq		t3,t1
+	movq		t1,d0
+	psrldq		$8,t1
+	movq		t1,d1
+
+	# t1[0] = h0 * r2 + h2 * r0
+	# t1[1] = h1 * r1 + h3 * s4
+	movd		r2,t1
+	movd		r1,t2
+	punpcklqdq 	t2,t1
+	pmuludq		h01,t1
+	movd		r0,t2
+	movd		s4,t3
+	punpcklqdq	t3,t2
+	pmuludq		h23,t2
+	paddq		t2,t1
+	# t2[0] = h0 * r3 + h2 * r1
+	# t2[1] = h1 * r2 + h3 * r0
+	movd		r3,t2
+	movd		r2,t3
+	punpcklqdq	t3,t2
+	pmuludq		h01,t2
+	movd		r1,t3
+	movd		r0,t4
+	punpcklqdq	t4,t3
+	pmuludq		h23,t3
+	paddq		t3,t2
+	# t3[0] = h4 * s3
+	# t3[1] = h4 * s4
+	movd		s3,t3
+	movd		s4,t4
+	punpcklqdq	t4,t3
+	pmuludq		h44,t3
+	# d2 = t1[0] + t1[1] + t3[0]
+	# d3 = t2[0] + t2[1] + t3[1]
+	movdqa		t1,t4
+	punpcklqdq	t2,t4
+	punpckhqdq	t2,t1
+	paddq		t4,t1
+	paddq		t3,t1
+	movq		t1,d2
+	psrldq		$8,t1
+	movq		t1,d3
+
+	# t1[0] = h0 * r4 + h2 * r2
+	# t1[1] = h1 * r3 + h3 * r1
+	movd		r4,t1
+	movd		r3,t2
+	punpcklqdq	t2,t1
+	pmuludq		h01,t1
+	movd		r2,t2
+	movd		r1,t3
+	punpcklqdq	t3,t2
+	pmuludq		h23,t2
+	paddq		t2,t1
+	# t3[0] = h4 * r0
+	movd		r0,t3
+	pmuludq		h44,t3
+	# d4 = t1[0] + t1[1] + t3[0]
+	movdqa		t1,t4
+	psrldq		$8,t4
+	paddq		t4,t1
+	paddq		t3,t1
+	movq		t1,d4
+
+	# d1 += d0 >> 26
+	mov		d0,%rax
+	shr		$26,%rax
+	add		%rax,d1
+	# h0 = d0 & 0x3ffffff
+	mov		d0,%rbx
+	and		$0x3ffffff,%ebx
+
+	# d2 += d1 >> 26
+	mov		d1,%rax
+	shr		$26,%rax
+	add		%rax,d2
+	# h1 = d1 & 0x3ffffff
+	mov		d1,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h1
+
+	# d3 += d2 >> 26
+	mov		d2,%rax
+	shr		$26,%rax
+	add		%rax,d3
+	# h2 = d2 & 0x3ffffff
+	mov		d2,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h2
+
+	# d4 += d3 >> 26
+	mov		d3,%rax
+	shr		$26,%rax
+	add		%rax,d4
+	# h3 = d3 & 0x3ffffff
+	mov		d3,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h3
+
+	# h0 += (d4 >> 26) * 5
+	mov		d4,%rax
+	shr		$26,%rax
+	lea		(%eax,%eax,4),%eax
+	add		%eax,%ebx
+	# h4 = d4 & 0x3ffffff
+	mov		d4,%rax
+	and		$0x3ffffff,%eax
+	mov		%eax,h4
+
+	# h1 += h0 >> 26
+	mov		%ebx,%eax
+	shr		$26,%eax
+	add		%eax,h1
+	# h0 = h0 & 0x3ffffff
+	andl		$0x3ffffff,%ebx
+	mov		%ebx,h0
+
+	add		$0x10,m
+	dec		%rcx
+	jnz		.Ldoblock
+
+	add		$0x10,%rsp
+	pop		%r12
+	pop		%rbx
+	ret
+ENDPROC(poly1305_block_sse2)