summaryrefslogtreecommitdiffstats
path: root/arch/s390/kernel/vdso64/vgetrandom-chacha.S
blob: 3f7e30886d1b44cbb8624ebe63847255c5ee23e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/stringify.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/fpu-insn.h>

#define STATE0	%v0
#define STATE1	%v1
#define STATE2	%v2
#define STATE3	%v3
#define COPY0	%v4
#define COPY1	%v5
#define COPY2	%v6
#define COPY3	%v7
#define BEPERM	%v19
#define TMP0	%v20
#define TMP1	%v21
#define TMP2	%v22
#define TMP3	%v23

	.section .rodata

	.balign 32
.Lconstants:
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap

	.text
/*
 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-bytes
 * counter. Does not spill to the stack.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
	larl	%r1,.Lconstants

	/* COPY0 = "expand 32-byte k" */
	VL	COPY0,0,,%r1

	/* BEPERM = byte selectors for VPERM */
	ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148)

	/* COPY1,COPY2 = key */
	VLM	COPY1,COPY2,0,%r3

	/* COPY3 = counter || zero nonce  */
	lg	%r3,0(%r4)
	VZERO	COPY3
	VLVGG	COPY3,%r3,0

	lghi	%r1,0
.Lblock:
	VLR	STATE0,COPY0
	VLR	STATE1,COPY1
	VLR	STATE2,COPY2
	VLR	STATE3,COPY3

	lghi	%r0,10
.Ldoubleround:
	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
	VSLDB	STATE1,STATE1,STATE1,4
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VSLDB	STATE2,STATE2,STATE2,8
	/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
	VSLDB	STATE3,STATE3,STATE3,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,16

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,12

	/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
	VAF	STATE0,STATE0,STATE1
	VX	STATE3,STATE3,STATE0
	VERLLF	STATE3,STATE3,8

	/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
	VAF	STATE2,STATE2,STATE3
	VX	STATE1,STATE1,STATE2
	VERLLF	STATE1,STATE1,7

	/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
	VSLDB	STATE1,STATE1,STATE1,12
	/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
	VSLDB	STATE2,STATE2,STATE2,8
	/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
	VSLDB	STATE3,STATE3,STATE3,4
	brctg	%r0,.Ldoubleround

	/* OUTPUT0 = STATE0 + COPY0 */
	VAF	STATE0,STATE0,COPY0
	/* OUTPUT1 = STATE1 + COPY1 */
	VAF	STATE1,STATE1,COPY1
	/* OUTPUT2 = STATE2 + COPY2 */
	VAF	STATE2,STATE2,COPY2
	/* OUTPUT3 = STATE3 + COPY3 */
	VAF	STATE3,STATE3,COPY3

	/*
	 * 32 bit wise little endian store to OUTPUT. If the vector
	 * enhancement facility 2 is not installed use the slow path.
	 */
	ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148)
	VSTBRF	STATE0,0,,%r2
	VSTBRF	STATE1,16,,%r2
	VSTBRF	STATE2,32,,%r2
	VSTBRF	STATE3,48,,%r2
.Lstoredone:

	/* ++COPY3.COUNTER */
	/* alsih %r3,1 */
	.insn	rilu,0xcc0a00000000,%r3,1
	alcr	%r3,%r1
	VLVGG	COPY3,%r3,0

	/* OUTPUT += 64, --NBLOCKS */
	aghi	%r2,64
	brctg	%r5,.Lblock

	/* COUNTER = COPY3.COUNTER */
	stg	%r3,0(%r4)

	/* Zero out potentially sensitive regs */
	VZERO	STATE0
	VZERO	STATE1
	VZERO	STATE2
	VZERO	STATE3
	VZERO	COPY1
	VZERO	COPY2

	/* Early exit if TMP0-TMP3 have not been used */
	ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)

	VZERO	TMP0
	VZERO	TMP1
	VZERO	TMP2
	VZERO	TMP3

	br	%r14

.Lstoreslow:
	/* Convert STATE to little endian format and store to OUTPUT */
	VPERM	TMP0,STATE0,STATE0,BEPERM
	VPERM	TMP1,STATE1,STATE1,BEPERM
	VPERM	TMP2,STATE2,STATE2,BEPERM
	VPERM	TMP3,STATE3,STATE3,BEPERM
	VSTM	TMP0,TMP3,0,%r2
	j	.Lstoredone
SYM_FUNC_END(__arch_chacha20_blocks_nostack)