bug fixes

author: Werner Koch <wk@gnupg.org> 1998-02-12 00:22:09 +0100
committer: Werner Koch <wk@gnupg.org> 1998-02-12 00:22:09 +0100
commit: bc5789665ae8c9f8fc3d02841cd6c9ade447a12a (patch)
tree: a98aa9fdf58eaffecb38ce481606ee8536280fc8 /mpi
parent: a couple of changes; but some parts are now broken (diff)
download: gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.tar.xz
gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.zip
4 files changed, 403 insertions, 0 deletions
diff --git a/mpi/alpha/README b/mpi/alpha/README
new file mode 100644
index 000000000..55c0a2917
--- /dev/null
+++ b/mpi/alpha/README
@@ -0,0 +1,53 @@
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth.  The on-chip L1 cache is
+write-through, and a cache line is transfered from the store buffer to the
+off-chip L2 in as much 15 cycles on most systems.  This delay hurts
+mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+instructions.
+
+3. mulq and umulh is documented to have a latency of 23 cycles, but 2 of
+these cycles are pipelined.  Thus, multiply instructions can be issued at a
+rate of one each 21nd cycle.
+
+EV5
+
+1. The memory bandwidth of this chip seems excellent, both for loads and
+stores.  Even when the working set is larger than the on-chip L1 and L2
+caches, the perfromance remain almost unaffected.
+
+2. mulq has a measured latency of 13 cycles and an issue rate of 1 each 8th
+cycle.  umulh has a measured latency of 15 cycles and an issue rate of 1
+each 10th cycle.  But the exact timing is somewhat confusing.
+
+3. mpn_add_n.  With 4-fold unrolling, we need 37 instructions, whereof 12
+   are memory operations.  This will take at least
+	ceil(37/2) [dual issue] + 1 [taken branch] = 20 cycles
+   We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+   cache cycles, which should be completely hidden in the 20 issue cycles.
+   The computation is inherently serial, with these dependencies:
+     addq
+     /   \
+   addq  cmpult
+     |     |
+   cmpult  |
+       \  /
+        or
+   I.e., there is a 4 cycle path for each limb, making 16 cycles the absolute
+   minimum.  We could replace the `or' with a cmoveq/cmovne, which would save
+   a cycle on EV5, but that might waste a cycle on EV4.  Also, cmov takes 2
+   cycles.
+     addq
+     /   \
+   addq  cmpult
+     |      \
+   cmpult -> cmovne
+
+STATUS
+
diff --git a/mpi/alpha/distfiles b/mpi/alpha/distfiles
index 4dd0ffe3a..e92d183d2 100644
--- a/mpi/alpha/distfiles
+++ b/mpi/alpha/distfiles
@@ -1,3 +1,6 @@
+README
+mpih-add1.S
+mpih-shift.S
 
 udiv-qrnnd.S
 
diff --git a/mpi/alpha/mpih-add1.S b/mpi/alpha/mpih-add1.S
new file mode 100644
index 000000000..6f88499c9
--- /dev/null
+++ b/mpi/alpha/mpih-add1.S
@@ -0,0 +1,134 @@
+/* alpha  add_n -- Add two limb vectors of the same length > 0 and store
+ *		   sum in a third limb vector.
+ *
+ *	Copyright (C) 1995 Free Software Foundation, Inc.
+ *	Copyright (c) 1997 by Werner Koch (dd9jn)
+ *
+ * This file is part of G10.
+ *
+ * G10 is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * G10 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+
+
+/*******************
+ *  mpi_limb_t
+ *  mpihelp_add_n( mpi_ptr_t res_ptr,	($16)
+ *		   mpi_ptr_t s1_ptr,	($17)
+ *		   mpi_ptr_t s2_ptr,	($18)
+ *		   mpi_size_t size)	($19)
+ */
+
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	mpihelp_add_n
+	.ent	mpihelp_add_n
+mpihelp_add_n:
+	.frame	$30,0,$26,0
+
+	ldq	$3,0($17)
+	ldq	$4,0($18)
+
+	subq	$19,1,$19
+	and	$19,4-1,$2	# number of limbs in first loop
+	bis	$31,$31,$0
+	beq	$2,.L0		# if multiple of 4 limbs, skip first loop
+
+	subq	$19,$2,$19
+
+.Loop0: subq	$2,1,$2
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	addq	$17,8,$17
+	addq	$18,8,$18
+	bis	$5,$5,$3
+	bis	$6,$6,$4
+	addq	$16,8,$16
+	bne	$2,.Loop0
+
+.L0:	beq	$19,.Lend
+
+	.align	3
+.Loop:	subq	$19,4,$19
+
+	ldq	$5,8($17)
+	addq	$4,$0,$4
+	ldq	$6,8($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+
+	ldq	$3,16($17)
+	addq	$6,$0,$6
+	ldq	$4,16($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,8($16)
+	or	$0,$1,$0
+
+	ldq	$5,24($17)
+	addq	$4,$0,$4
+	ldq	$6,24($18)
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,16($16)
+	or	$0,$1,$0
+
+	ldq	$3,32($17)
+	addq	$6,$0,$6
+	ldq	$4,32($18)
+	cmpult	$6,$0,$1
+	addq	$5,$6,$6
+	cmpult	$6,$5,$0
+	stq	$6,24($16)
+	or	$0,$1,$0
+
+	addq	$17,32,$17
+	addq	$18,32,$18
+	addq	$16,32,$16
+	bne	$19,.Loop
+
+.Lend:	addq	$4,$0,$4
+	cmpult	$4,$0,$1
+	addq	$3,$4,$4
+	cmpult	$4,$3,$0
+	stq	$4,0($16)
+	or	$0,$1,$0
+	ret	$31,($26),1
+
+	.end	mpihelp_add_n
+
diff --git a/mpi/alpha/mpih-shift.S b/mpi/alpha/mpih-shift.S
new file mode 100644
index 000000000..dcf0a1f46
--- /dev/null
+++ b/mpi/alpha/mpih-shift.S
@@ -0,0 +1,213 @@
+/* alpha    rshift, lshift
+ *	Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+ *	Copyright (c) 1997 by Werner Koch (dd9jn)
+ *
+ * This file is part of G10.
+ *
+ * G10 is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * G10 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+
+
+/*******************
+ * mpi_limb_t
+ * mpihelp_lshift( mpi_ptr_t wp,	(r16)
+ *		   mpi_ptr_t up,	(r17)
+ *		   mpi_size_t usize,	(r18)
+ *		   unsigned cnt)	(r19)
+ *
+ * This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ * it would take 4 cycles/limb.  It should be possible to get down to 3
+ * cycles/limb since both ldq and stq can be paired with the other used
+ * instructions.  But there are many restrictions in the 21064 pipeline that
+ * makes it hard, if not impossible, to get down to 3 cycles/limb:
+ *
+ * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ * 2. Only aligned instruction pairs can be paired.
+ * 3. The store buffer or silo might not be able to deal with the bandwidth.
+ */
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	mpihelp_lshift
+	.ent	mpihelp_lshift
+mpihelp_lshift:
+	.frame	$30,0,$26,0
+
+	s8addq	$18,$17,$17	# make r17 point at end of s1
+	ldq	$4,-8($17)	# load first limb
+	subq	$17,8,$17
+	subq	$31,$19,$7
+	s8addq	$18,$16,$16	# make r16 point at end of RES
+	subq	$18,1,$18
+	and	$18,4-1,$20	# number of limbs in first loop
+	srl	$4,$7,$0	# compute function result
+
+	beq	$20,.L0
+	subq	$18,$20,$18
+
+	.align	3
+.Loop0:
+	ldq	$3,-8($17)
+	subq	$16,8,$16
+	subq	$17,8,$17
+	subq	$20,1,$20
+	sll	$4,$19,$5
+	srl	$3,$7,$6
+	bis	$3,$3,$4
+	bis	$5,$6,$8
+	stq	$8,0($16)
+	bne	$20,.Loop0
+
+.L0:	beq	$18,.Lend
+
+	.align	3
+.Loop:	ldq	$3,-8($17)
+	subq	$16,32,$16
+	subq	$18,4,$18
+	sll	$4,$19,$5
+	srl	$3,$7,$6
+
+	ldq	$4,-16($17)
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,24($16)
+	srl	$4,$7,$2
+
+	ldq	$3,-24($17)
+	sll	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,16($16)
+	srl	$3,$7,$6
+
+	ldq	$4,-32($17)
+	sll	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,8($16)
+	srl	$4,$7,$2
+
+	subq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,0($16)
+
+	bgt	$18,.Loop
+
+.Lend:	sll	$4,$19,$8
+	stq	$8,-8($16)
+	ret	$31,($26),1
+	.end	mpihelp_lshift
+
+
+
+
+
+/*******************
+ * mpi_limb_t
+ * mpihelp_rshift( mpi_ptr_t wp,	(r16)
+ *		   mpi_ptr_t up,	(r17)
+ *		   mpi_size_t usize,	(r18)
+ *		   unsigned cnt)	(r19)
+ *
+ * This code runs at 4.8 cycles/limb on the 21064.  With infinite unrolling,
+ * it would take 4 cycles/limb.  It should be possible to get down to 3
+ * cycles/limb since both ldq and stq can be paired with the other used
+ * instructions.  But there are many restrictions in the 21064 pipeline that
+ * makes it hard, if not impossible, to get down to 3 cycles/limb:
+ *
+ * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+ * 2. Only aligned instruction pairs can be paired.
+ * 3. The store buffer or silo might not be able to deal with the bandwidth.
+ */
+
+	.set	noreorder
+	.set	noat
+.text
+	.align	3
+	.globl	mpihelp_rshift
+	.ent	mpihelp_rshift
+mpihelp_rshift:
+	.frame	$30,0,$26,0
+
+	ldq	$4,0($17)	# load first limb
+	addq	$17,8,$17
+	subq	$31,$19,$7
+	subq	$18,1,$18
+	and	$18,4-1,$20	# number of limbs in first loop
+	sll	$4,$7,$0	# compute function result
+
+	beq	$20,.R0
+	subq	$18,$20,$18
+
+	.align	3
+.Roop0:
+	ldq	$3,0($17)
+	addq	$16,8,$16
+	addq	$17,8,$17
+	subq	$20,1,$20
+	srl	$4,$19,$5
+	sll	$3,$7,$6
+	bis	$3,$3,$4
+	bis	$5,$6,$8
+	stq	$8,-8($16)
+	bne	$20,.Roop0
+
+.R0:	beq	$18,.Rend
+
+	.align	3
+.Roop:	ldq	$3,0($17)
+	addq	$16,32,$16
+	subq	$18,4,$18
+	srl	$4,$19,$5
+	sll	$3,$7,$6
+
+	ldq	$4,8($17)
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-32($16)
+	sll	$4,$7,$2
+
+	ldq	$3,16($17)
+	srl	$4,$19,$5
+	bis	$1,$2,$8
+	stq	$8,-24($16)
+	sll	$3,$7,$6
+
+	ldq	$4,24($17)
+	srl	$3,$19,$1
+	bis	$5,$6,$8
+	stq	$8,-16($16)
+	sll	$4,$7,$2
+
+	addq	$17,32,$17
+	bis	$1,$2,$8
+	stq	$8,-8($16)
+
+	bgt	$18,.Roop
+
+.Rend:	srl	$4,$19,$8
+	stq	$8,0($16)
+	ret	$31,($26),1
+	.end	mpihelp_rshift
+
author	Werner Koch <wk@gnupg.org>	1998-02-12 00:22:09 +0100
committer	Werner Koch <wk@gnupg.org>	1998-02-12 00:22:09 +0100
commit	bc5789665ae8c9f8fc3d02841cd6c9ade447a12a (patch)
tree	a98aa9fdf58eaffecb38ce481606ee8536280fc8 /mpi
parent	a couple of changes; but some parts are now broken (diff)
download	gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.tar.xz gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.zip