crypto/bn/asm/bn-c64xplus.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374

;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_mul_add_words,_bn_mul_add_words
	.asg	bn_mul_words,_bn_mul_words
	.asg	bn_sqr_words,_bn_sqr_words
	.asg	bn_add_words,_bn_add_words
	.asg	bn_sub_words,_bn_sub_words
	.asg	bn_div_words,_bn_div_words
	.asg	bn_sqr_comba8,_bn_sqr_comba8
	.asg	bn_mul_comba8,_bn_mul_comba8
	.asg	bn_sqr_comba4,_bn_sqr_comba4
	.asg	bn_mul_comba4,_bn_mul_comba4
	.endif

	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

	.global	_bn_mul_add_words
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19		; high part of accumulator
|| [B0]	MV	ARG0,A2
|| [B0]	MV	ARG3,A3
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	3
	LDW	*ARG0++,A7	; rp[i]
	MPY32U	B7,A3,A17:A16
	NOP	3		; [2,0] in epilogue
	ADDU	A16,A7,A21:A20
	ADDU	A19,A21:A20,A19:A18
||	MV.S	A17,A23
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*A2++	; rp[i]
||	ADD	A19,A23,A19
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_mul_words
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19		; high part of accumulator
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7	; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16
	NOP	4		; [2,0] in epiloque
	ADDU	A19,A16,A19:A18
||	MV.S	A17,A21
	SPKERNEL 2,1		; leave slot for "return value"
||	STW	A18,*ARG0++	; rp[i]
||	ADD.L	A19,A21,A19
;;====================================================================
	BNOP	RA,4
	MV	A19,RET		; return value
	.endasmfunc

	.global	_bn_sqr_words
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	MV	ARG0,B2
|| [B0]	ADD	4,ARG0,ARG0
	NOP	3

	SPLOOP	2		; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0
	NOP	3		; [2,0] in epilogue
	STW	B0,*B2++(8)	; rp[2*i]
	MV	B1,A1
	SPKERNEL 2,0		; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)	; rp[2*i+1]
;;====================================================================
	BNOP	RA,5
	.endasmfunc

	.global	_bn_add_words
_bn_add_words:
	.asmfunc
	MV	ARG3,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A1		; carry flag
|| [B0]	MV	ARG0,A3
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8
	ADDU	A1,A9:A8,A1:A0
	SPKERNEL 0,0		; fully overlap BNOP RA,5
||	STW	A0,*A3++	; write result
||	MV	A1,RET		; keep carry flag in RET
;;====================================================================
	BNOP	RA,5
	.endasmfunc

	.global	_bn_sub_words
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0
  [!B0]	BNOP	RA
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A2		; borrow flag
|| [B0]	MV	ARG0,A3
	NOP	3

	SPLOOP	2		; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7	; bp[i]
||	LDW	*ARG1++,B7	; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0
  [A2]	SUB	A1:A0,1,A1:A0
	SPKERNEL 0,1		; leave slot for "return borrow flag"
||	STW	A0,*A3++	; write result
||	AND	1,A1,A2		; pass on borrow flag
;;====================================================================
	BNOP	RA,4
	AND	1,A1,RET	; return borrow flag
	.endasmfunc

	.global	_bn_div_words
_bn_div_words:
	.asmfunc
	LMBD	1,A6,A0		; leading zero bits in dv
	LMBD	1,A4,A1		; leading zero bits in hi
||	MVK	32,B0
	CMPLTU	A1,A0,A2
||	ADD	A0,B0,B0
  [ A2]	BNOP	RA
||[ A2]	MVK	-1,A4		; return overflow
||[!A2]	MV	A4,A3		; reassign hi
  [!A2]	MV	B4,A4		; reassign lo, will be quotient
||[!A2]	MVC	B0,ILC
  [!A2]	SHL	A6,A0,A6	; normalize dv
||	MVK	1,A1

  [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
||[!A2]	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4
  [!A2]	SHRU	A3,31,A1	; upper bit
||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31

	SPLOOP	3
  [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
||[ A1]	ZERO	A1
||	SHL	A4,1,A5:A4	; lo<<1
  [!A1]	SUB	A3,A6,A3	; hi-=dv
||[!A1]	OR	1,A4,A4		; quotient
	SHRU	A3,31,A1	; upper bit
||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
	SPKERNEL

	BNOP	RA,5
	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
_bn_sqr_comba8:
	MV	ARG1,ARG2
_bn_mul_comba8:
	.asmfunc
	MVK	8,B0		; N, RILC
||	MVK	8,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; N-2, initial ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
sploopNxM?:			; for best performance arrange M<=N
   [A0]	SPLOOPD	2		; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5
||	ZERO	B7
||	LDW	*A5++,A9	; pre-fetch ap[1]
||	ZERO	A1
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
	LDW	*ARG2++,A7	; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7	; rp[i]
	MPY32U	A7,B6,B17:B16
	NOP	3
	ADDU	B16,B7,B21:B20
	ADDU	B19,B21:B20,B19:B18
||	MV.S	B17,B23
	SPKERNEL
||	STW	B18,*B4++	; rp[i]
||	ADD.S	B19,B23,B19
;;====================================================================
outer?:				; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
	MVD	A9,B6		; move through .M unit(*)
   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5	; rewind rp to rp[1]
	MVK	1,A1
   [A0]	BNOP.S1	outer?,4
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]	; rewind rp tp rp[1]
||	ZERO.S	B19		; high part of accumulator
;; end of outer?
	BNOP	RA,5		; return
	.endasmfunc
;; (*)	It should be noted that B6 is used as input to MPY32U in
;;	chronologically next cycle in *preceding* SPLOOP iteration.
;;	Normally such arrangement would require DINT, but at this
;;	point SPLOOP is draining and interrupts are disabled
;;	implicitly.

	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
_bn_sqr_comba4:
	MV	ARG1,ARG2
_bn_mul_comba4:
	.asmfunc
	.if	0
	BNOP	sploopNxM?,3
	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
	;; because of read-after-write penalties, it's rather
	;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
	MVK	4,B0		; N, RILC
||	MVK	4,A0		; M, outer loop counter
||	MV	ARG1,A5		; copy ap
||	MV	ARG0,B4		; copy rp
||	ZERO	B19		; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1		; first ILC
||	SUB	B0,1,B2		; const B2=N-1
||	LDW	*A5++,B6	; ap[0]
||	MV	A0,A3		; const A3=M
	.else
	;; This alternative is an exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles...
	LDW	*ARG1[0],B16	; a[0]
||	LDW	*ARG2[0],A16	; b[0]
	LDW	*ARG1[1],B17	; a[1]
||	LDW	*ARG2[1],A17	; b[1]
	LDW	*ARG1[2],B18	; a[2]
||	LDW	*ARG2[2],A18	; b[2]
	LDW	*ARG1[3],B19	; a[3]
||	LDW	*ARG2[3],A19	; b[3]
	NOP
	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
	STW	A0,*ARG0[0]
||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
||	ADDU	A22,A1,A1:A0
	MV	A23,B0
||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B0,B1:B0
||	STW	A0,*ARG0[1]
||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
||	ADDU	A26,A1,A9:A8
	ADDU	A27,B1,B9:B8
||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8
	ADDU	A29,B9:B8,B9:B8
||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8
	ADDU	A31,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[2]
||	ADDU	A20,A9,A1:A0
	ADDU	A21,B9,B1:B0
||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0
	ADDU	A23,B1:B0,B1:B0
||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B1:B0,B1:B0
||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0
	ADDU	A27,B1:B0,B1:B0
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[3]
||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
||	ADDU	A20,A1,A9:A8
	ADDU	A21,B1,B9:B8
||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8
	ADDU	A23,B9:B8,B9:B8
||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8
	ADDU	A25,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8
	STW	A8,*ARG0[4]
||	ADDU	A26,A9,A1:A0
	ADDU	A27,B9,B1:B0
||	ADDU	A28,A1:A0,A1:A0
	ADDU	A29,B1:B0,B1:B0
||	BNOP	RA
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[5]
||	ADDU	A30,A1,A9:A8
	ADD	A31,B1,B8
	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
	ADD	B8,A9,A9
||	STW	A8,*ARG0[6]
	STW	A9,*ARG0[7]
	.endif
	.endasmfunc