diff options
author | Werner Koch <wk@gnupg.org> | 1998-02-12 00:22:09 +0100 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 1998-02-12 00:22:09 +0100 |
commit | bc5789665ae8c9f8fc3d02841cd6c9ade447a12a (patch) | |
tree | a98aa9fdf58eaffecb38ce481606ee8536280fc8 /mpi | |
parent | a couple of changes; but some parts are now broken (diff) | |
download | gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.tar.xz gnupg2-bc5789665ae8c9f8fc3d02841cd6c9ade447a12a.zip |
bug fixes
Diffstat (limited to 'mpi')
-rw-r--r-- | mpi/alpha/README | 53 | ||||
-rw-r--r-- | mpi/alpha/distfiles | 3 | ||||
-rw-r--r-- | mpi/alpha/mpih-add1.S | 134 | ||||
-rw-r--r-- | mpi/alpha/mpih-shift.S | 213 |
4 files changed, 403 insertions, 0 deletions
diff --git a/mpi/alpha/README b/mpi/alpha/README new file mode 100644 index 000000000..55c0a2917 --- /dev/null +++ b/mpi/alpha/README @@ -0,0 +1,53 @@ +This directory contains mpn functions optimized for DEC Alpha processors. + +RELEVANT OPTIMIZATION ISSUES + +EV4 + +1. This chip has very limited store bandwidth. The on-chip L1 cache is +write-through, and a cache line is transfered from the store buffer to the +off-chip L2 in as much 15 cycles on most systems. This delay hurts +mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. + +2. Pairing is possible between memory instructions and integer arithmetic +instructions. + +3. mulq and umulh is documented to have a latency of 23 cycles, but 2 of +these cycles are pipelined. Thus, multiply instructions can be issued at a +rate of one each 21nd cycle. + +EV5 + +1. The memory bandwidth of this chip seems excellent, both for loads and +stores. Even when the working set is larger than the on-chip L1 and L2 +caches, the perfromance remain almost unaffected. + +2. mulq has a measured latency of 13 cycles and an issue rate of 1 each 8th +cycle. umulh has a measured latency of 15 cycles and an issue rate of 1 +each 10th cycle. But the exact timing is somewhat confusing. + +3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 + are memory operations. This will take at least + ceil(37/2) [dual issue] + 1 [taken branch] = 20 cycles + We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data + cache cycles, which should be completely hidden in the 20 issue cycles. + The computation is inherently serial, with these dependencies: + addq + / \ + addq cmpult + | | + cmpult | + \ / + or + I.e., there is a 4 cycle path for each limb, making 16 cycles the absolute + minimum. We could replace the `or' with a cmoveq/cmovne, which would save + a cycle on EV5, but that might waste a cycle on EV4. Also, cmov takes 2 + cycles. + addq + / \ + addq cmpult + | \ + cmpult -> cmovne + +STATUS + diff --git a/mpi/alpha/distfiles b/mpi/alpha/distfiles index 4dd0ffe3a..e92d183d2 100644 --- a/mpi/alpha/distfiles +++ b/mpi/alpha/distfiles @@ -1,3 +1,6 @@ +README +mpih-add1.S +mpih-shift.S udiv-qrnnd.S diff --git a/mpi/alpha/mpih-add1.S b/mpi/alpha/mpih-add1.S new file mode 100644 index 000000000..6f88499c9 --- /dev/null +++ b/mpi/alpha/mpih-add1.S @@ -0,0 +1,134 @@ +/* alpha add_n -- Add two limb vectors of the same length > 0 and store + * sum in a third limb vector. + * + * Copyright (C) 1995 Free Software Foundation, Inc. + * Copyright (c) 1997 by Werner Koch (dd9jn) + * + * This file is part of G10. + * + * G10 is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * G10 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + + + +/******************* + * mpi_limb_t + * mpihelp_add_n( mpi_ptr_t res_ptr, ($16) + * mpi_ptr_t s1_ptr, ($17) + * mpi_ptr_t s2_ptr, ($18) + * mpi_size_t size) ($19) + */ + + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_add_n + .ent mpihelp_add_n +mpihelp_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end mpihelp_add_n + diff --git a/mpi/alpha/mpih-shift.S b/mpi/alpha/mpih-shift.S new file mode 100644 index 000000000..dcf0a1f46 --- /dev/null +++ b/mpi/alpha/mpih-shift.S @@ -0,0 +1,213 @@ +/* alpha rshift, lshift + * Copyright (C) 1994, 1995 Free Software Foundation, Inc. + * Copyright (c) 1997 by Werner Koch (dd9jn) + * + * This file is part of G10. + * + * G10 is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * G10 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + + + +/******************* + * mpi_limb_t + * mpihelp_lshift( mpi_ptr_t wp, (r16) + * mpi_ptr_t up, (r17) + * mpi_size_t usize, (r18) + * unsigned cnt) (r19) + * + * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + * it would take 4 cycles/limb. It should be possible to get down to 3 + * cycles/limb since both ldq and stq can be paired with the other used + * instructions. But there are many restrictions in the 21064 pipeline that + * makes it hard, if not impossible, to get down to 3 cycles/limb: + * + * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + * 2. Only aligned instruction pairs can be paired. + * 3. The store buffer or silo might not be able to deal with the bandwidth. + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_lshift + .ent mpihelp_lshift +mpihelp_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $17,8,$17 + subq $31,$19,$7 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,.L0 + subq $18,$20,$18 + + .align 3 +.Loop0: + ldq $3,-8($17) + subq $16,8,$16 + subq $17,8,$17 + subq $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,0($16) + bne $20,.Loop0 + +.L0: beq $18,.Lend + + .align 3 +.Loop: ldq $3,-8($17) + subq $16,32,$16 + subq $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldq $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,24($16) + srl $4,$7,$2 + + ldq $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stq $8,16($16) + srl $3,$7,$6 + + ldq $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,8($16) + srl $4,$7,$2 + + subq $17,32,$17 + bis $1,$2,$8 + stq $8,0($16) + + bgt $18,.Loop + +.Lend: sll $4,$19,$8 + stq $8,-8($16) + ret $31,($26),1 + .end mpihelp_lshift + + + + + +/******************* + * mpi_limb_t + * mpihelp_rshift( mpi_ptr_t wp, (r16) + * mpi_ptr_t up, (r17) + * mpi_size_t usize, (r18) + * unsigned cnt) (r19) + * + * This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + * it would take 4 cycles/limb. It should be possible to get down to 3 + * cycles/limb since both ldq and stq can be paired with the other used + * instructions. But there are many restrictions in the 21064 pipeline that + * makes it hard, if not impossible, to get down to 3 cycles/limb: + * + * 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + * 2. Only aligned instruction pairs can be paired. + * 3. The store buffer or silo might not be able to deal with the bandwidth. + */ + + .set noreorder + .set noat +.text + .align 3 + .globl mpihelp_rshift + .ent mpihelp_rshift +mpihelp_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + addq $17,8,$17 + subq $31,$19,$7 + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,.R0 + subq $18,$20,$18 + + .align 3 +.Roop0: + ldq $3,0($17) + addq $16,8,$16 + addq $17,8,$17 + subq $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,-8($16) + bne $20,.Roop0 + +.R0: beq $18,.Rend + + .align 3 +.Roop: ldq $3,0($17) + addq $16,32,$16 + subq $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldq $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-32($16) + sll $4,$7,$2 + + ldq $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stq $8,-24($16) + sll $3,$7,$6 + + ldq $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-16($16) + sll $4,$7,$2 + + addq $17,32,$17 + bis $1,$2,$8 + stq $8,-8($16) + + bgt $18,.Roop + +.Rend: srl $4,$19,$8 + stq $8,0($16) + ret $31,($26),1 + .end mpihelp_rshift + |