1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/fpu-insn.h>
/*
 * Vector register aliases.
 * STATE0..STATE3 - working ChaCha state, one 4x32-bit matrix row each.
 * COPY0..COPY3   - pristine input state, re-added after the 20 rounds.
 * PERM4/8/12     - VPERM byte selectors rotating a register left by
 *                  4/8/12 bytes (one/two/three 32-bit words).
 * BEPERM         - VPERM byte selector swapping each 32-bit word to
 *                  little-endian byte order (slow store path only).
 * TMP0..TMP3     - scratch, used only on the slow store path.
 */
#define STATE0 %v0
#define STATE1 %v1
#define STATE2 %v2
#define STATE3 %v3
#define COPY0 %v4
#define COPY1 %v5
#define COPY2 %v6
#define COPY3 %v7
#define PERM4 %v16
#define PERM8 %v17
#define PERM12 %v18
#define BEPERM %v19
#define TMP0 %v20
#define TMP1 %v21
#define TMP2 %v22
#define TMP3 %v23
.section .rodata
/* Constant row at +0; the four VPERM selectors follow at +16..+79 */
.balign 128
.Lconstants:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral "expand 32-byte k"
.long 0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl 4 bytes
.long 0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl 8 bytes
.long 0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes
.long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap within each word
.text
/*
 * s390 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-bytes
 * counter. Does not spill to the stack.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 * const uint8_t *key,
 * uint32_t *counter,
 * size_t nblocks)
 *
 * Per the s390 ELF ABI: %r2 = dst_bytes, %r3 = key, %r4 = counter,
 * %r5 = nblocks; %r14 is the return address.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
larl %r1,.Lconstants
/* COPY0 = "expand 32-byte k" */
VL COPY0,0,,%r1
/* PERM4-PERM12,BEPERM = byte selectors for VPERM */
VLM PERM4,BEPERM,16,%r1
/* COPY1,COPY2 = key */
VLM COPY1,COPY2,0,%r3
/* COPY3 = counter || zero nonce */
lg %r3,0(%r4)
VZERO COPY3
VLVGG COPY3,%r3,0
/* %r1 = 0, so the ALCR below adds only the carry from ALSIH */
lghi %r1,0
.Lblock:
/* Start each block from the saved input state */
VLR STATE0,COPY0
VLR STATE1,COPY1
VLR STATE2,COPY2
VLR STATE3,COPY3
/* 10 double rounds = 20 ChaCha rounds */
lghi %r0,10
.Ldoubleround:
/* Column round on all four columns at once: */
/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
VAF STATE0,STATE0,STATE1
VX STATE3,STATE3,STATE0
VERLLF STATE3,STATE3,16
/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
VAF STATE2,STATE2,STATE3
VX STATE1,STATE1,STATE2
VERLLF STATE1,STATE1,12
/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
VAF STATE0,STATE0,STATE1
VX STATE3,STATE3,STATE0
VERLLF STATE3,STATE3,8
/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
VAF STATE2,STATE2,STATE3
VX STATE1,STATE1,STATE2
VERLLF STATE1,STATE1,7
/* Rotate rows so the same column code performs the diagonal round: */
/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
VPERM STATE1,STATE1,STATE1,PERM4
/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
VPERM STATE2,STATE2,STATE2,PERM8
/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
VPERM STATE3,STATE3,STATE3,PERM12
/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
VAF STATE0,STATE0,STATE1
VX STATE3,STATE3,STATE0
VERLLF STATE3,STATE3,16
/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */
VAF STATE2,STATE2,STATE3
VX STATE1,STATE1,STATE2
VERLLF STATE1,STATE1,12
/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */
VAF STATE0,STATE0,STATE1
VX STATE3,STATE3,STATE0
VERLLF STATE3,STATE3,8
/* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */
VAF STATE2,STATE2,STATE3
VX STATE1,STATE1,STATE2
VERLLF STATE1,STATE1,7
/* Undo the row rotations: */
/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
VPERM STATE1,STATE1,STATE1,PERM12
/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
VPERM STATE2,STATE2,STATE2,PERM8
/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
VPERM STATE3,STATE3,STATE3,PERM4
brctg %r0,.Ldoubleround
/* Feed-forward: add the saved input state to the round output */
/* OUTPUT0 = STATE0 + COPY0 */
VAF STATE0,STATE0,COPY0
/* OUTPUT1 = STATE1 + COPY1 */
VAF STATE1,STATE1,COPY1
/* OUTPUT2 = STATE2 + COPY2 */
VAF STATE2,STATE2,COPY2
/* OUTPUT3 = STATE3 + COPY3 */
VAF STATE3,STATE3,COPY3
/*
 * 32 bit wise little endian store to OUTPUT. If the vector
 * enhancement facility 2 is not installed use the slow path.
 * (The unconditional "brc 0xf" branch is patched to a nop at boot
 * when facility 148 is present, enabling the VSTBRF fast path.)
 */
ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148)
VSTBRF STATE0,0,,%r2
VSTBRF STATE1,16,,%r2
VSTBRF STATE2,32,,%r2
VSTBRF STATE3,48,,%r2
.Lstoredone:
/*
 * ++COPY3.COUNTER
 * The 64-bit counter occupies state words 12/13 in little-endian word
 * order, i.e. the high/low halves of big-endian %r3. ALSIH adds 1 to
 * bits 0-31 (word 12); ALCR then folds the resulting carry plus
 * %r1 (= 0) into word 13.
 */
/* alsih %r3,1 */
.insn rilu,0xcc0a00000000,%r3,1
alcr %r3,%r1
VLVGG COPY3,%r3,0
/* OUTPUT += 64, --NBLOCKS */
aghi %r2,64
brctg %r5,.Lblock
/* COUNTER = COPY3.COUNTER: write the updated counter back to *counter */
stg %r3,0(%r4)
/* Zero out potentially sensitive regs (COPY0/COPY3 hold no key material) */
VZERO STATE0
VZERO STATE1
VZERO STATE2
VZERO STATE3
VZERO COPY1
VZERO COPY2
/* Early exit if TMP0-TMP3 have not been used (fast-path stores above) */
ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148)
VZERO TMP0
VZERO TMP1
VZERO TMP2
VZERO TMP3
br %r14
.Lstoreslow:
/* Convert STATE to little endian format and store to OUTPUT */
VPERM TMP0,STATE0,STATE0,BEPERM
VPERM TMP1,STATE1,STATE1,BEPERM
VPERM TMP2,STATE2,STATE2,BEPERM
VPERM TMP3,STATE3,STATE3,BEPERM
VSTM TMP0,TMP3,0,%r2
j .Lstoredone
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
|