From: Roland McGrath Date: Mon, 16 Oct 1995 01:18:40 +0000 (+0000) Subject: Updated from ../=mpn/gmp-1.906.7 X-Git-Tag: glibc-2.16-ports-before-merge~3908 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7def3d92a4edd807d2df2f016e5c1d6b8e748358;p=thirdparty%2Fglibc.git Updated from ../=mpn/gmp-1.906.7 --- diff --git a/sysdeps/alpha/add_n.s b/sysdeps/alpha/add_n.s new file mode 100644 index 00000000000..e1ad4600f50 --- /dev/null +++ b/sysdeps/alpha/add_n.s @@ -0,0 +1,119 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s new file mode 100644 index 00000000000..46d277df6ea --- /dev/null +++ b/sysdeps/alpha/addmul_1.s @@ -0,0 +1,100 @@ + # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the 21064. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 2 +__mpn_addmul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + addq $5,$3,$3 + cmpult $3,$5,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_addmul_1 diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s new file mode 100644 index 00000000000..2aaf0417742 --- /dev/null +++ b/sysdeps/alpha/alphaev5/add_n.s @@ -0,0 +1,118 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 4 +.Loop: subq $19,4,$19 + unop + + ldq $6,8($18) + addq $4,$0,$0 + ldq $5,8($17) + cmpult $0,$4,$1 + ldq $4,16($18) + addq $3,$0,$20 + cmpult $20,$3,$0 + ldq $3,16($17) + or $0,$1,$0 + addq $6,$0,$0 + cmpult $0,$6,$1 + ldq $6,24($18) + addq $5,$0,$21 + cmpult $21,$5,$0 + ldq $5,24($17) + or $0,$1,$0 + addq $4,$0,$0 + cmpult $0,$4,$1 + ldq $4,32($18) + addq $3,$0,$22 + cmpult $22,$3,$0 + ldq $3,32($17) + or $0,$1,$0 + addq $6,$0,$0 + cmpult $0,$6,$1 + addq $5,$0,$23 + cmpult $23,$5,$0 + or $0,$1,$0 + + stq $20,0($16) + stq $21,8($16) + stq $22,16($16) + stq $23,24($16) + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s new file mode 100644 index 00000000000..fdb089550f0 --- /dev/null +++ b/sysdeps/alpha/alphaev5/lshift.s @@ -0,0 +1,175 @@ + # Alpha EV5 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $31,$19,$20 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,L0 + subq $18,$28,$18 + + .align 3 +Loop0: ldq $3,-16($17) + subq $16,8,$16 + sll $4,$19,$5 + subq $17,8,$17 + subq $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,0($16) + bne $28,Loop0 + +L0: sll $4,$19,$24 + beq $18,Lend + # warm up phase 1 + ldq $1,-16($17) + subq $18,4,$18 + ldq $2,-24($17) + ldq $3,-32($17) + ldq $4,-40($17) + beq $18,Lcool1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldq $1,-48($17) + sll $2,$19,$22 + ldq $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldq $3,-64($17) + sll $4,$19,$24 + ldq $4,-72($17) + subq $18,4,$18 + beq $18,Lcool1 + .align 4 + # main loop +Loop: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subq $18,4,$18 + sll $1,$19,$21 + unop # ldq $31,-96($17) + + srl $2,$20,$8 + ldq $1,-80($17) + sll $2,$19,$22 + ldq $2,-88($17) + + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldq $31,-96($17) + sll $3,$19,$23 + subq $16,32,$16 + + srl $4,$20,$6 + ldq $3,-96($17 + sll $4,$19,$24 + ldq $4,-104($17) + + subq $17,32,$17 + bne $18,Loop + unop + unop + # cool down phase 2/1 +Lcool1: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stq $7,-40($16) + or $5,$22,$5 + stq $8,-48($16) + or $6,$23,$6 + stq $5,-56($16) + stq $6,-64($16) + # cool down phase 2/3 + stq $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +Lcool1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + stq $5,-24($16) + stq $6,-32($16) + stq $24,-40($16) + ret $31,($26),1 + +Lend stq $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s new file mode 100644 index 00000000000..1da9960b46a --- /dev/null +++ b/sysdeps/alpha/alphaev5/rshift.s @@ -0,0 +1,173 @@ + # Alpha EV5 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + subq $31,$19,$20 + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,L0 + subq $18,$28,$18 + + .align 3 +Loop0: ldq $3,8($17) + addq $16,8,$16 + srl $4,$19,$5 + addq $17,8,$17 + subq $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,-8($16) + bne $28,Loop0 + +L0: srl $4,$19,$24 + beq $18,Lend + # warm up phase 1 + ldq $1,8($17) + subq $18,4,$18 + ldq $2,16($17) + ldq $3,24($17) + ldq $4,32($17) + beq $18,Lcool1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldq $1,40($17) + srl $2,$19,$22 + ldq $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldq $3,56($17) + srl $4,$19,$24 + ldq $4,64($17) + subq $18,4,$18 + beq $18,Lcool2 + .align 4 + # main loop +Loop: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subq $18,4,$18 + srl $1,$19,$21 + unop # ldq $31,-96($17) + + sll $2,$20,$8 + ldq $1,72($17) + srl $2,$19,$22 + ldq $2,80($17) + + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldq $31,-96($17) + srl $3,$19,$23 + addq $16,32,$16 + + sll $4,$20,$6 + ldq $3,88($17) + srl $4,$19,$24 + ldq $4,96($17) + + addq $17,32,$17 + bne $18,Loop + unop + unop + # cool down phase 2/1 +Lcool2: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stq $7,32($16) + or $5,$22,$5 + stq $8,40($16) + or $6,$23,$6 + stq $5,48($16) + stq $6,56($16) + # cool down phase 2/3 + stq $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +Lcool1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + stq $5,16($16) + stq $6,24($16) + stq $24,32($16) + ret $31,($26),1 + +Lend: stq $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s new file mode 100644 index 00000000000..c28434926b7 --- /dev/null +++ b/sysdeps/alpha/lshift.s @@ -0,0 +1,108 @@ + # Alpha 21064 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $17,8,$17 + subq $31,$19,$7 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,L0 + subq $18,$20,$18 + + .align 3 +Loop0: + ldq $3,-8($17) + subq $16,8,$16 + subq $17,8,$17 + subq $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,0($16) + bne $20,Loop0 + +L0: beq $18,Lend + + .align 3 +Loop: ldq $3,-8($17) + subq $16,32,$16 + subq $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldq $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,24($16) + srl $4,$7,$2 + + ldq $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stq $8,16($16) + srl $3,$7,$6 + + ldq $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,8($16) + srl $4,$7,$2 + + subq $17,32,$17 + bis $1,$2,$8 + stq $8,0($16) + + bgt $18,Loop + +Lend: sll $4,$19,$8 + stq $8,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s new file mode 100644 index 00000000000..3ef194d7e6d --- /dev/null +++ b/sysdeps/alpha/mul_1.s @@ -0,0 +1,84 @@ + # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store + # the result in a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_mul_1 + .ent __mpn_mul_1 2 +__mpn_mul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + bic $31,$31,$4 # clear cy_limb + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,8($17) # $2 = s1_limb + subq $18,1,$18 # size-- + stq $3,0($16) + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,16($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + stq $3,8($16) + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $16,8,$16 # res_ptr++ + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + stq $3,8($16) + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: stq $3,0($16) + ret $31,($26),1 + + .end __mpn_mul_1 diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s new file mode 100644 index 00000000000..74eab0434a4 --- /dev/null +++ b/sysdeps/alpha/rshift.s @@ -0,0 +1,106 @@ + # Alpha 21064 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + addq $17,8,$17 + subq $31,$19,$7 + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,L0 + subq $18,$20,$18 + + .align 3 +Loop0: + ldq $3,0($17) + addq $16,8,$16 + addq $17,8,$17 + subq $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,-8($16) + bne $20,Loop0 + +L0: beq $18,Lend + + .align 3 +Loop: ldq $3,0($17) + addq $16,32,$16 + subq $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldq $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-32($16) + sll $4,$7,$2 + + ldq $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stq $8,-24($16) + sll $3,$7,$6 + + ldq $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-16($16) + sll $4,$7,$2 + + addq $17,32,$17 + bis $1,$2,$8 + stq $8,-8($16) + + bgt $18,Loop + +Lend: srl $4,$19,$8 + stq $8,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/alpha/sub_n.s b/sysdeps/alpha/sub_n.s new file mode 100644 index 00000000000..5200025b415 --- /dev/null +++ b/sysdeps/alpha/sub_n.s @@ -0,0 +1,119 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_sub_n diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s new file mode 100644 index 00000000000..acaa11c5456 --- /dev/null +++ b/sysdeps/alpha/submul_1.s @@ -0,0 +1,100 @@ + # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and + # subtract the result from a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the 21064. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_submul_1 + .ent __mpn_submul_1 2 +__mpn_submul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + subq $5,$3,$3 + cmpult $5,$3,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_submul_1 diff --git a/sysdeps/hppa/add_n.s b/sysdeps/hppa/add_n.s new file mode 100644 index 00000000000..7f3e32342bc --- /dev/null +++ b/sysdeps/hppa/add_n.s @@ -0,0 +1,57 @@ +; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __mpn_add_n +__mpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/addmul_1.s b/sysdeps/hppa/hppa1.1/addmul_1.s new file mode 100644 index 00000000000..a9dfdd1c284 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/addmul_1.s @@ -0,0 +1,101 @@ +; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 11 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 10 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + + .code + .export __mpn_addmul_1 +__mpn_addmul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/mul_1.s b/sysdeps/hppa/hppa1.1/mul_1.s new file mode 100644 index 00000000000..ebf0778b906 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/mul_1.s @@ -0,0 +1,97 @@ +; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +; not become faster due to data cache contention after a store. On the +; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since +; only the xmpyu does not need the integer pipeline, so the only dual-issue +; we will get are addc+xmpyu. Unrolling would not help either CPU. + +; We could use fldds to read two limbs at a time from the S1 array, and that +; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +; PA7100, respectively. We don't do that since it does not seem worth the +; (alignment) troubles... + +; At least the PA7100 is rumored to be able to deal with cache-misses +; without stalling instruction issue. If this is true, and the cache is +; actually also lockup-free, we should use a deeper software pipeline, and +; load from S1 very early! (The loads and stores to -12(sp) will surely be +; in the cache.) + + .code + .export __mpn_mul_1 +__mpn_mul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/submul_1.s b/sysdeps/hppa/hppa1.1/submul_1.s new file mode 100644 index 00000000000..44cabf46900 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/submul_1.s @@ -0,0 +1,110 @@ +; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 12 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 11 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + +; It seems possible to make this run as fast as __mpn_addmul_1, if we use +; sub,>>= %r29,%r19,%r22 +; addi 1,%r28,%r28 +; but that requires reworking the hairy software pipeline... + + .code + .export __mpn_submul_1 +__mpn_submul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s new file mode 100644 index 00000000000..4ffef3a4fbf --- /dev/null +++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s @@ -0,0 +1,74 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 + .word 0x0 + .export __udiv_qrnnd +__udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo + ldil L'L$0000,%r19 + ldo R'L$0000(%r19),%r19 + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/lshift.s b/sysdeps/hppa/lshift.s new file mode 100644 index 00000000000..0479f4a281c --- /dev/null +++ b/sysdeps/hppa/lshift.s @@ -0,0 +1,65 @@ +; HP-PA __mpn_lshift -- + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_lshift +__mpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r22,%r29,%r20 + +L$0002 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$0003 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/rshift.s b/sysdeps/hppa/rshift.s new file mode 100644 index 00000000000..18d33f2f86a --- /dev/null +++ b/sysdeps/hppa/rshift.s @@ -0,0 +1,62 @@ +; HP-PA __mpn_rshift -- + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_rshift +__mpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r29,%r22,%r20 + +L$0002 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$0003 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/sub_n.s b/sysdeps/hppa/sub_n.s new file mode 100644 index 00000000000..daae46e61db --- /dev/null +++ b/sysdeps/hppa/sub_n.s @@ -0,0 +1,58 @@ +; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __mpn_sub_n +__mpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/sysdeps/hppa/udiv_qrnnd.s b/sysdeps/hppa/udiv_qrnnd.s new file mode 100644 index 00000000000..0b069bf7f96 --- /dev/null +++ b/sysdeps/hppa/udiv_qrnnd.s @@ -0,0 +1,285 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on pre-PA7000 CPUs. + +; Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + +; The code size is a bit excessive. We could merge the last two ds;addc +; sequences by simply moving the "bb,< Odd" instruction down. The only +; trouble is the FFFFFFFF code that would need some hacking. + + .code + .export __udiv_qrnnd +__udiv_qrnnd + .proc + .callinfo frame=0,no_calls + .entry + + comb,< %r23,0,L$largedivisor + sub %r0,%r23,%r1 ; clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + +L$largedivisor + extru %r24,31,1,%r19 ; r19 = n0 & 1 + bb,< %r23,31,L$odd + extru %r23,30,31,%r22 ; r22 = d >> 1 + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + +L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +; This is just a special case of the code above. +; We come here when d == 0xFFFFFFFF +L$FF.. add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 + + .exit + .procend diff --git a/sysdeps/i960/add_n.s b/sysdeps/i960/add_n.s new file mode 100644 index 00000000000..6031f6d4c35 --- /dev/null +++ b/sysdeps/i960/add_n.s @@ -0,0 +1,21 @@ +.text + .align 4 + .globl ___mpn_add_n +___mpn_add_n: + mov 0,g6 # clear carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + addc g4,g5,g4 # main add + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, clears C bit + + mov g6,g0 + ret diff --git a/sysdeps/i960/addmul_1.s b/sysdeps/i960/addmul_1.s new file mode 100644 index 00000000000..1a3de95e504 --- /dev/null +++ b/sysdeps/i960/addmul_1.s @@ -0,0 +1,26 @@ +.text + .align 4 + .globl ___mpn_mul_1 +___mpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + ld (g13)[g2*4],g5 + + addc g0,g6,g6 # relies on that C bit is clear + addc 0,g7,g7 + addc g5,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/sysdeps/i960/mul_1.s b/sysdeps/i960/mul_1.s new file mode 100644 index 00000000000..e75ea42d396 --- /dev/null +++ b/sysdeps/i960/mul_1.s @@ -0,0 +1,23 @@ +.text + .align 4 + .globl ___mpn_mul_1 +___mpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + + addc g0,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/sysdeps/i960/sub_n.s b/sysdeps/i960/sub_n.s new file mode 100644 index 00000000000..13ebbfa9f27 --- /dev/null +++ b/sysdeps/i960/sub_n.s @@ -0,0 +1,21 @@ +.text + .align 4 + .globl ___mpn_sub_n +___mpn_sub_n: + mov 1,g6 # set carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + subc g4,g5,g4 # main subtract + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, cy will be 0 + + mov g6,g0 + ret diff --git a/sysdeps/m88k/m88100/add_n.s b/sysdeps/m88k/m88100/add_n.s new file mode 100644 index 00000000000..7e4ccccb907 --- /dev/null +++ b/sysdeps/m88k/m88100/add_n.s @@ -0,0 +1,103 @@ +; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___mpn_add_n +___mpn_add_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + addu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; add 7 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; add 6 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; add 5 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; add 4 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; add 3 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; add 2 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; add 1 + 8r limbs + addu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/sysdeps/m88k/m88100/mul_1.s b/sysdeps/m88k/m88100/mul_1.s new file mode 100644 index 00000000000..35c238d5703 --- /dev/null +++ b/sysdeps/m88k/m88100/mul_1.s @@ -0,0 +1,128 @@ +; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + +; Common overhead is about 11 cycles/invocation. + +; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention.) + +; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.) + +; To enhance speed: +; 1. Unroll main loop 4-8 times. +; 2. Schedule code to avoid WB contention. It might be tempting to move the +; ld instruction in the loops down to save 2 cycles (less WB contention), +; but that looses because the ultimate value will be read from outside +; the allocated space. But if we handle the ultimate multiplication in +; the tail, we can do this. +; 3. Make the multiplication with less instructions. I think the code for +; (S2_LIMB >= 0x10000) is not minimal. +; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or +; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11 +; cycles/limb. (Assuming infinite unrolling.) + + text + align 16 + global ___mpn_mul_1 +___mpn_mul_1: + + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + ld r9,r3[r4] + mask r7,r5,0xffff ; r7 = lo(S2_LIMB) + extu r8,r5,16 ; r8 = hi(S2_LIMB) + bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) + subu r6,r6,4 + +; General code for any value of S2_LIMB. + + ; Make a stack frame and save r25 and r26 + subu r31,r31,16 + st.d r25,r31,8 + + ; Enter the loop in the middle + br.n L1 + addu r4,r4,1 + +Loop: + ld r9,r3[r4] + st r26,r6[r4] +; bcnd ne0,r0,0 ; bubble + addu r4,r4,1 +L1: mul r26,r9,r5 ; low word of product mul_1 WB ld + mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 + mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 + mul r10,r12,r8 ; r10 = prod_1a mul_3 + extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 + mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 + mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 + extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 + addu r10,r10,r11 ; addu_1 WB extu_2 +; bcnd ne0,r0,0 ; bubble WB addu_1 + addu.co r10,r10,r12 ; WB mul_4 + mask.u r10,r10,0xffff ; move the 16 most significant bits... + addu.ci r10,r10,r0 ; ...to the low half of the word... + rot r10,r10,16 ; ...and put carry in pos 16. + addu.co r26,r26,r2 ; add old carry limb + bcnd.n ne0,r4,Loop + addu.ci r2,r25,r10 ; compute new carry limb + + st r26,r6[r4] + ld.d r25,r31,8 + jmp.n r1 + addu r31,r31,16 + +; Fast code for S2_LIMB < 0x10000 +Lsmall: + ; Enter the loop in the middle + br.n SL1 + addu r4,r4,1 + +SLoop: + ld r9,r3[r4] ; + st r8,r6[r4] ; + addu r4,r4,1 ; +SL1: mul r8,r9,r5 ; low word of product + mask r12,r9,0xffff ; r12 = lo(s1_limb) + extu r13,r9,16 ; r13 = hi(s1_limb) + mul r11,r12,r7 ; r11 = prod_0 + mul r12,r13,r7 ; r12 = prod_1b + addu.cio r8,r8,r2 ; add old carry limb + extu r10,r11,16 ; r11 = hi(prod_0) + addu r10,r10,r12 ; + bcnd.n ne0,r4,SLoop + extu r2,r10,16 ; r2 = new carry limb + + jmp.n r1 + st r8,r6[r4] diff --git a/sysdeps/m88k/m88100/sub_n.s b/sysdeps/m88k/m88100/sub_n.s new file mode 100644 index 00000000000..3963cd5479f --- /dev/null +++ b/sysdeps/m88k/m88100/sub_n.s @@ -0,0 +1,104 @@ +; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___mpn_sub_n +___mpn_sub_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + subu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; subtract 7 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; subtract 6 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; subtract 5 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; subtract 4 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; subtract 3 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; subtract 2 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs + subu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s new file mode 100644 index 00000000000..08c3ca07ee6 --- /dev/null +++ b/sysdeps/m88k/m88110/mul_1.s @@ -0,0 +1,84 @@ +; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___mpn_mul_1 +___mpn_mul_1: + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n eq0,r4,Lend + subu r8,r8,8 + +Loop: ld r6,r3[r4] + addu.cio r9,r11,r2 + or r2,r10,r0 ; could be avoided if unrolled + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n ne0,r4,Loop + st r9,r8[r4] + +Lend: addu.cio r9,r11,r2 + st r9,r8,4 + jmp.n r1 + addu.ci r2,r10,r0 + +; This is the Right Way to do this on '110. 4 cycles / 64-bit limb. +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d ,r11,r5 +; ld.d r12, +; mulu.d ,r10,r5 +; addu.cio +; addu.cio +; st.d +; mulu.d +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d diff --git a/sysdeps/mips/add_n.s b/sysdeps/mips/add_n.s new file mode 100644 index 00000000000..c82910816e4 --- /dev/null +++ b/sysdeps/mips/add_n.s @@ -0,0 +1,119 @@ + # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_add_n diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s new file mode 100644 index 00000000000..abc2fb8dcfb --- /dev/null +++ b/sysdeps/mips/addmul_1.s @@ -0,0 +1,96 @@ + # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_addmul_1 diff --git a/sysdeps/mips/lshift.s b/sysdeps/mips/lshift.s new file mode 100644 index 00000000000..ce33e7c84c2 --- /dev/null +++ b/sysdeps/mips/lshift.s @@ -0,0 +1,94 @@ + # MIPS2 __mpn_lshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .set noreorder + .set nomacro + + sll $2,$6,2 + addu $5,$5,$2 # make r5 point at end of src + lw $10,-4($5) # load first limb + subu $13,$0,$7 + addu $4,$4,$2 # make r4 point at end of res + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + srl $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,-8($5) + addiu $4,$4,-4 + addiu $5,$5,-4 + addiu $9,$9,-1 + sll $11,$10,$7 + srl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,-8($5) + addiu $4,$4,-16 + addiu $6,$6,-4 + sll $11,$10,$7 + srl $12,$3,$13 + + lw $10,-12($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,12($4) + srl $9,$10,$13 + + lw $3,-16($5) + sll $11,$10,$7 + or $8,$14,$9 + sw $8,8($4) + srl $12,$3,$13 + + lw $10,-20($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,4($4) + srl $9,$10,$13 + + addiu $5,$5,-16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,0($4) + +.Lend: sll $8,$10,$7 + j $31 + sw $8,-4($4) + .end __mpn_lshift diff --git a/sysdeps/mips/mips3/add_n.s b/sysdeps/mips/mips3/add_n.s new file mode 100644 index 00000000000..b5257804ad6 --- /dev/null +++ b/sysdeps/mips/mips3/add_n.s @@ -0,0 +1,119 @@ + # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_add_n diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s new file mode 100644 index 00000000000..7af01726142 --- /dev/null +++ b/sysdeps/mips/mips3/addmul_1.s @@ -0,0 +1,96 @@ + # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_addmul_1 diff --git a/sysdeps/mips/mips3/lshift.s b/sysdeps/mips/mips3/lshift.s new file mode 100644 index 00000000000..c05dcafffd3 --- /dev/null +++ b/sysdeps/mips/mips3/lshift.s @@ -0,0 +1,94 @@ + # MIPS3 __mpn_lshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .set noreorder + .set nomacro + + dsll $2,$6,3 + daddu $5,$5,$2 # make r5 point at end of src + ld $10,-8($5) # load first limb + dsubu $13,$0,$7 + daddu $4,$4,$2 # make r4 point at end of res + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsrl $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,-16($5) + daddiu $4,$4,-8 + daddiu $5,$5,-8 + daddiu $9,$9,-1 + dsll $11,$10,$7 + dsrl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,-16($5) + daddiu $4,$4,-32 + daddiu $6,$6,-4 + dsll $11,$10,$7 + dsrl $12,$3,$13 + + ld $10,-24($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,24($4) + dsrl $9,$10,$13 + + ld $3,-32($5) + dsll $11,$10,$7 + or $8,$14,$9 + sd $8,16($4) + dsrl $12,$3,$13 + + ld $10,-40($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,8($4) + dsrl $9,$10,$13 + + daddiu $5,$5,-32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,0($4) + +.Lend: dsll $8,$10,$7 + j $31 + sd $8,-8($4) + .end __mpn_lshift diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s new file mode 100644 index 00000000000..87954e5bc3e --- /dev/null +++ b/sysdeps/mips/mips3/mul_1.s @@ -0,0 +1,84 @@ + # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_mul_1 + .ent __mpn_mul_1 +__mpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + daddiu $5,$5,8 + daddu $10,$10,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sd $10,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_mul_1 diff --git a/sysdeps/mips/mips3/rshift.s b/sysdeps/mips/mips3/rshift.s new file mode 100644 index 00000000000..e0e2ca2c5fe --- /dev/null +++ b/sysdeps/mips/mips3/rshift.s @@ -0,0 +1,91 @@ + # MIPS3 __mpn_rshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .set noreorder + .set nomacro + + ld $10,0($5) # load first limb + dsubu $13,$0,$7 + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsll $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,8($5) + daddiu $4,$4,8 + daddiu $5,$5,8 + daddiu $9,$9,-1 + dsrl $11,$10,$7 + dsll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,-8($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,8($5) + daddiu $4,$4,32 + daddiu $6,$6,-4 + dsrl $11,$10,$7 + dsll $12,$3,$13 + + ld $10,16($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-32($4) + dsll $9,$10,$13 + + ld $3,24($5) + dsrl $11,$10,$7 + or $8,$14,$9 + sd $8,-24($4) + dsll $12,$3,$13 + + ld $10,32($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-16($4) + dsll $9,$10,$13 + + daddiu $5,$5,32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,-8($4) + +.Lend: dsrl $8,$10,$7 + j $31 + sd $8,0($4) + .end __mpn_rshift diff --git a/sysdeps/mips/mips3/sub_n.s b/sysdeps/mips/mips3/sub_n.s new file mode 100644 index 00000000000..9a45ffde5a0 --- /dev/null +++ b/sysdeps/mips/mips3/sub_n.s @@ -0,0 +1,119 @@ + # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_sub_n diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s new file mode 100644 index 00000000000..f28c6a5167f --- /dev/null +++ b/sysdeps/mips/mips3/submul_1.s @@ -0,0 +1,96 @@ + # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_submul_1 + .ent __mpn_submul_1 +__mpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_submul_1 diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s new file mode 100644 index 00000000000..01327e22d8d --- /dev/null +++ b/sysdeps/mips/mul_1.s @@ -0,0 +1,84 @@ + # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_mul_1 + .ent __mpn_mul_1 +__mpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + addiu $5,$5,4 + addu $10,$10,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sw $10,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + multu $8,$7 + sw $10,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + sw $10,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_mul_1 diff --git a/sysdeps/mips/rshift.s b/sysdeps/mips/rshift.s new file mode 100644 index 00000000000..69416913739 --- /dev/null +++ b/sysdeps/mips/rshift.s @@ -0,0 +1,91 @@ + # MIPS2 __mpn_rshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .set noreorder + .set nomacro + + lw $10,0($5) # load first limb + subu $13,$0,$7 + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + sll $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,4($5) + addiu $4,$4,4 + addiu $5,$5,4 + addiu $9,$9,-1 + srl $11,$10,$7 + sll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,-4($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,4($5) + addiu $4,$4,16 + addiu $6,$6,-4 + srl $11,$10,$7 + sll $12,$3,$13 + + lw $10,8($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-16($4) + sll $9,$10,$13 + + lw $3,12($5) + srl $11,$10,$7 + or $8,$14,$9 + sw $8,-12($4) + sll $12,$3,$13 + + lw $10,16($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-8($4) + sll $9,$10,$13 + + addiu $5,$5,16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,-4($4) + +.Lend: srl $8,$10,$7 + j $31 + sw $8,0($4) + .end __mpn_rshift diff --git a/sysdeps/mips/sub_n.s b/sysdeps/mips/sub_n.s new file mode 100644 index 00000000000..63f3b553542 --- /dev/null +++ b/sysdeps/mips/sub_n.s @@ -0,0 +1,119 @@ + # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_sub_n diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s new file mode 100644 index 00000000000..616dd1b47ce --- /dev/null +++ b/sysdeps/mips/submul_1.s @@ -0,0 +1,96 @@ + # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_submul_1 + .ent __mpn_submul_1 +__mpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_submul_1 diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s new file mode 100644 index 00000000000..34ad9e1d2db --- /dev/null +++ b/sysdeps/rs6000/add_n.s @@ -0,0 +1,54 @@ +# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .extern __mpn_add_n[DS] + .extern .__mpn_add_n +.csect [PR] + .align 2 + .globl __mpn_add_n + .globl .__mpn_add_n + .csect __mpn_add_n[DS] +__mpn_add_n: + .long .__mpn_add_n, TOC[tc0], 0 + .csect [PR] +.__mpn_add_n: + mtctr 6 # copy size into CTR + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before used + a 7,0,8 # add least significant limbs, set cy + bdz Lend # If done, skip loop +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) # store previous limb in load latecny slot + ae 7,0,8 # add new limbs with cy, set cy + bdn Loop # decrement CTR and loop back +Lend: st 7,4(3) # store ultimate result limb + lil 3,0 # load cy into ... + aze 3,3 # ... return value register + br diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s new file mode 100644 index 00000000000..862b6139fee --- /dev/null +++ b/sysdeps/rs6000/addmul_1.s @@ -0,0 +1,122 @@ +# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_addmul_1[PR] + .align 2 + .globl __mpn_addmul_1 + .globl .__mpn_addmul_1 + .csect __mpn_addmul_1[DS] +__mpn_addmul_1: + .long .__mpn_addmul_1[PR], TOC[tc0], 0 + .csect .__mpn_addmul_1[PR] +.__mpn_addmul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 # add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s new file mode 100644 index 00000000000..69c75020615 --- /dev/null +++ b/sysdeps/rs6000/lshift.s @@ -0,0 +1,58 @@ +# IBM POWER __mpn_lshift -- + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .extern __mpn_lshift[DS] + .extern .__mpn_lshift +.csect [PR] + .align 2 + .globl __mpn_lshift + .globl .__mpn_lshift + .csect __mpn_lshift[DS] +__mpn_lshift: + .long .__mpn_lshift, TOC[tc0], 0 + .csect [PR] +.__mpn_lshift: + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + lu 0,-4(4) # read most significant limb + sre 3,0,8 # compute carry out limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,-4(4) # read 2:nd most significant limb + sreq 7,0,8 # compute most significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,-4(4) # load next lower limb + stu 7,-4(9) # store previous result during read latency + sreq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,-4(9) # store 2:nd least significant limb +Lend2: sle 7,0,6 # compute least significant limb + st 7,-4(9) # store it" \ + br diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s new file mode 100644 index 00000000000..f4fa8943391 --- /dev/null +++ b/sysdeps/rs6000/mul_1.s @@ -0,0 +1,109 @@ +# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_mul_1[PR] + .align 2 + .globl __mpn_mul_1 + .globl .__mpn_mul_1 + .csect __mpn_mul_1[DS] +__mpn_mul_1: + .long .__mpn_mul_1[PR], TOC[tc0], 0 + .csect .__mpn_mul_1[PR] +.__mpn_mul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 # reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s new file mode 100644 index 00000000000..6056acc7530 --- /dev/null +++ b/sysdeps/rs6000/rshift.s @@ -0,0 +1,56 @@ +# IBM POWER __mpn_rshift -- + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .extern __mpn_rshift[DS] + .extern .__mpn_rshift +.csect [PR] + .align 2 + .globl __mpn_rshift + .globl .__mpn_rshift + .csect __mpn_rshift[DS] +__mpn_rshift: + .long .__mpn_rshift, TOC[tc0], 0 + .csect [PR] +.__mpn_rshift: + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + l 0,0(4) # read least significant limb + ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s + sle 3,0,8 # compute carry limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,4(4) # read 2:nd least significant limb + sleq 7,0,8 # compute least significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,4(4) # load next higher limb + stu 7,4(9) # store previous result during read latency + sleq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,4(9) # store 2:nd most significant limb +Lend2: sre 7,0,6 # compute most significant limb + st 7,4(9) # store it" \ + br diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s new file mode 100644 index 00000000000..402fdcefc4e --- /dev/null +++ b/sysdeps/rs6000/sub_n.s @@ -0,0 +1,55 @@ +# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .extern __mpn_sub_n[DS] + .extern .__mpn_sub_n +.csect [PR] + .align 2 + .globl __mpn_sub_n + .globl .__mpn_sub_n + .csect __mpn_sub_n[DS] +__mpn_sub_n: + .long .__mpn_sub_n, TOC[tc0], 0 + .csect [PR] +.__mpn_sub_n: + mtctr 6 # copy size into CTR + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before used + sf 7,0,8 # add least significant limbs, set cy + bdz Lend # If done, skip loop +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) # store previous limb in load latecny slot + sfe 7,0,8 # add new limbs with cy, set cy + bdn Loop # decrement CTR and loop back +Lend: st 7,4(3) # store ultimate result limb + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s new file mode 100644 index 00000000000..252633261d0 --- /dev/null +++ b/sysdeps/rs6000/submul_1.s @@ -0,0 +1,127 @@ +# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_submul_1[PR] + .align 2 + .globl __mpn_submul_1 + .globl .__mpn_submul_1 + .csect __mpn_submul_1[DS] +__mpn_submul_1: + .long .__mpn_submul_1[PR], TOC[tc0], 0 + .csect .__mpn_submul_1[PR] +.__mpn_submul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 # invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/vax/add_n.s b/sysdeps/vax/add_n.s new file mode 100644 index 00000000000..c89b2260511 --- /dev/null +++ b/sysdeps/vax/add_n.s @@ -0,0 +1,47 @@ +# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___mpn_add_n +___mpn_add_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + subl2 r4,r4 + +Loop: + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + jsobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/sysdeps/vax/addmul_1.s b/sysdeps/vax/addmul_1.s new file mode 100644 index 00000000000..8e83204b814 --- /dev/null +++ b/sysdeps/vax/addmul_1.s @@ -0,0 +1,125 @@ +# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_addmul_1 +___mpn_addmul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc r1,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/vax/mul_1.s b/sysdeps/vax/mul_1.s new file mode 100644 index 00000000000..3fe375bacfd --- /dev/null +++ b/sysdeps/vax/mul_1.s @@ -0,0 +1,122 @@ +# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_mul_1 +___mpn_mul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + +# One might want to combine the addl2 and the store below, but that +# is actually just slower according to my timing tests. (VAX 3600) + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + movl r2,(r9)+ +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + movl r2,(r9)+ +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r1,r3 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r1,r11 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/vax/sub_n.s b/sysdeps/vax/sub_n.s new file mode 100644 index 00000000000..300b4dee8fd --- /dev/null +++ b/sysdeps/vax/sub_n.s @@ -0,0 +1,47 @@ +# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +# difference in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___mpn_sub_n +___mpn_sub_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + subl2 r4,r4 + +Loop: + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + jsobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/sysdeps/vax/submul_1.s b/sysdeps/vax/submul_1.s new file mode 100644 index 00000000000..875cbfd651a --- /dev/null +++ b/sysdeps/vax/submul_1.s @@ -0,0 +1,125 @@ +# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_submul_1 +___mpn_submul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc r1,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/z8000/add_n.s b/sysdeps/z8000/add_n.s new file mode 100644 index 00000000000..21efaf57145 --- /dev/null +++ b/sysdeps/z8000/add_n.s @@ -0,0 +1,52 @@ +! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___mpn_add_n +___mpn_add_n: + pop r0,@r6 + pop r1,@r5 + add r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + adc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s new file mode 100644 index 00000000000..2075225d115 --- /dev/null +++ b/sysdeps/z8000/mul_1.s @@ -0,0 +1,67 @@ +! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! size r5 +! s2_limb r4 + + unseg + .text + even + global ___mpn_mul_1 +___mpn_mul_1: + sub r2,r2 ! zero carry limb + and r4,r4 + jr mi,Lneg + +Lpos: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + and r1,r1 ! shift msb of loaded limb into cy + jr mi,Lp ! branch if loaded limb's msb is set + add r8,r4 ! hi_limb += sign_comp2 +Lp: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lpos + ret t + +Lneg: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + add r8,r1 ! hi_limb += sign_comp1 + and r1,r1 + jr mi,Ln + add r8,r4 ! hi_limb += sign_comp2 +Ln: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lneg + ret t diff --git a/sysdeps/z8000/sub_n.s b/sysdeps/z8000/sub_n.s new file mode 100644 index 00000000000..f75ef22d045 --- /dev/null +++ b/sysdeps/z8000/sub_n.s @@ -0,0 +1,53 @@ +! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___mpn_sub_n +___mpn_sub_n: + pop r0,@r6 + pop r1,@r5 + sub r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + sbc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t