-/* This assember is for R4000 and above machines. It takes advantage
- * of the 64 bit registers present on these CPUs.
- * Make sure that the SSLeay bignum library is compiled with
- * SIXTY_FOUR_BIT set and BN_LLONG undefined.
- * This must either be compiled with the system CC, or, if you use GNU gas,
- * cc -E mips3.s|gas -o mips3.o
+.rdata
+.asciiz "mips3.s, Version 1.1"
+.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted according to the OpenSSL license. Warranty of any kind is
+ * disclaimed.
+ * ====================================================================
+ */
+
+/*
+ * This is my modest contributon to the OpenSSL project (see
+ * http://www.openssl.org/ for more information about it) and is
+ * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
+ * module. For updates see http://fy.chalmers.se/~appro/hpe/.
+ *
+ * The module is designed to work with either of the "new" MIPS ABI(5),
+ * namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+ * IRIX 5.x not only because it doesn't support new ABIs but also
+ * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+ * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+ * cause illegal instruction exception:-(
+ *
+ * In addition the code depends on preprocessor flags set up by MIPSpro
+ * compiler driver (either as or cc) and therefore (probably?) can't be
+ * compiled by the GNU assembler. GNU C driver manages fine though...
+ * I mean as long as -mmips-as is specified or is the default option,
+ * because then it simply invokes /usr/bin/as which in turn takes
+ * perfect care of the preprocessor definitions. Another neat feature
+ * offered by the MIPSpro assembler is an optimization pass. This gave
+ * me the opportunity to have the code looking more regular as all those
+ * architecture dependent instruction rescheduling details were left to
+ * the assembler. Cool, huh?
+ *
+ * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+ * goes way over 3 times faster!
+ *
+ * <appro@fy.chalmers.se>
*/
+#include <asm.h>
+#include <regdef.h>
+
+#if _MIPS_ISA>=4
+#define MOVNZ(cond,dst,src) \
+ movn dst,src,cond
+#else
+#define MOVNZ(cond,dst,src) \
+ .set noreorder; \
+ bnezl cond,.+8; \
+ move dst,src; \
.set reorder
- .set noat
-
-#define R1 $1
-#define CC $2
-#define R2 $3
-#define R3 $8
-#define R4 $9
-#define L1 $10
-#define L2 $11
-#define L3 $12
-#define L4 $13
-#define H1 $14
-#define H2 $15
-#define H3 $24
-#define H4 $25
-
-#define P1 $4
-#define P2 $5
-#define P3 $6
-#define P4 $7
-
- .align 2
- .ent bn_mul_add_words
- .globl bn_mul_add_words
-.text
-bn_mul_add_words:
- .frame $sp,0,$31
- .mask 0x00000000,0
- .fmask 0x00000000,0
+#endif
- #blt P3,4,$lab34
-
- subu R1,P3,4
- move CC,$0
- bltz R1,$lab34
-$lab2:
- ld R1,0(P1)
- ld L1,0(P2)
- ld R2,8(P1)
- ld L2,8(P2)
- ld R3,16(P1)
- ld L3,16(P2)
- ld R4,24(P1)
- ld L4,24(P2)
- dmultu L1,P4
- daddu R1,R1,CC
- mflo L1
- sltu CC,R1,CC
- daddu R1,R1,L1
- mfhi H1
- sltu L1,R1,L1
- sd R1,0(P1)
- daddu CC,CC,L1
- dmultu L2,P4
- daddu CC,H1,CC
- mflo L2
- daddu R2,R2,CC
- sltu CC,R2,CC
- mfhi H2
- daddu R2,R2,L2
- daddu P2,P2,32
- sltu L2,R2,L2
- sd R2,8(P1)
- daddu CC,CC,L2
- dmultu L3,P4
- daddu CC,H2,CC
- mflo L3
- daddu R3,R3,CC
- sltu CC,R3,CC
- mfhi H3
- daddu R3,R3,L3
- daddu P1,P1,32
- sltu L3,R3,L3
- sd R3,-16(P1)
- daddu CC,CC,L3
- dmultu L4,P4
- daddu CC,H3,CC
- mflo L4
- daddu R4,R4,CC
- sltu CC,R4,CC
- mfhi H4
- daddu R4,R4,L4
- subu P3,P3,4
- sltu L4,R4,L4
- daddu CC,CC,L4
- daddu CC,H4,CC
-
- subu R1,P3,4
- sd R4,-8(P1) # delay slot
- bgez R1,$lab2
-
- bleu P3,0,$lab3
- .align 2
-$lab33:
- ld L1,0(P2)
- ld R1,0(P1)
- dmultu L1,P4
- daddu R1,R1,CC
- sltu CC,R1,CC
- daddu P1,P1,8
- mflo L1
- mfhi H1
- daddu R1,R1,L1
- daddu P2,P2,8
- sltu L1,R1,L1
- subu P3,P3,1
- daddu CC,CC,L1
- sd R1,-8(P1)
- daddu CC,H1,CC
- bgtz P3,$lab33
- j $31
- .align 2
-$lab3:
- j $31
- .align 2
-$lab34:
- bgt P3,0,$lab33
- j $31
- .end bn_mul_add_words
-
- .align 2
- # Program Unit: bn_mul_words
- .ent bn_mul_words
- .globl bn_mul_words
.text
-bn_mul_words:
- .frame $sp,0,$31
- .mask 0x00000000,0
- .fmask 0x00000000,0
-
- subu P3,P3,4
- move CC,$0
- bltz P3,$lab45
-$lab44:
- ld L1,0(P2)
- ld L2,8(P2)
- ld L3,16(P2)
- ld L4,24(P2)
- dmultu L1,P4
- subu P3,P3,4
- mflo L1
- mfhi H1
- daddu L1,L1,CC
- dmultu L2,P4
- sltu CC,L1,CC
- sd L1,0(P1)
- daddu CC,H1,CC
- mflo L2
- mfhi H2
- daddu L2,L2,CC
- dmultu L3,P4
- sltu CC,L2,CC
- sd L2,8(P1)
- daddu CC,H2,CC
- mflo L3
- mfhi H3
- daddu L3,L3,CC
- dmultu L4,P4
- sltu CC,L3,CC
- sd L3,16(P1)
- daddu CC,H3,CC
- mflo L4
- mfhi H4
- daddu L4,L4,CC
- daddu P1,P1,32
- sltu CC,L4,CC
- daddu P2,P2,32
- daddu CC,H4,CC
- sd L4,-8(P1)
-
- bgez P3,$lab44
- b $lab45
-$lab46:
- ld L1,0(P2)
- daddu P1,P1,8
- dmultu L1,P4
- daddu P2,P2,8
- mflo L1
- mfhi H1
- daddu L1,L1,CC
- subu P3,P3,1
- sltu CC,L1,CC
- sd L1,-8(P1)
- daddu CC,H1,CC
- bgtz P3,$lab46
- j $31
-$lab45:
- addu P3,P3,4
- bgtz P3,$lab46
- j $31
- .align 2
- .end bn_mul_words
-
- # Program Unit: bn_sqr_words
- .ent bn_sqr_words
- .globl bn_sqr_words
-.text
-bn_sqr_words:
- .frame $sp,0,$31
- .mask 0x00000000,0
- .fmask 0x00000000,0
+
+.set noat
+.set reorder
+
+#define MINUS4 v1
+
+.align 5
+LEAF(bn_mul_add_words)
+ .set noreorder
+ bgtzl a2,.L_bn_mul_add_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_mul_add_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+ dmultu t0,a3
+ ld t1,0(a0)
+ ld t2,8(a1)
+ ld t3,8(a0)
+ ld ta0,16(a1)
+ ld ta1,16(a0)
+ daddu t1,v0
+ sltu v0,t1,v0 /* All manuals say it "compares 32-bit
+ * values", but it seems to work fine
+ * even on 64-bit registers. */
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,0(a0)
+ daddu v0,AT
+
+ dmultu t2,a3
+ ld ta2,24(a1)
+ ld ta3,24(a0)
+ daddu t3,v0
+ sltu v0,t3,v0
+ mflo AT
+ mfhi t2
+ daddu t3,AT
+ daddu v0,t2
+ sltu AT,t3,AT
+ sd t3,8(a0)
+ daddu v0,AT
+
+ dmultu ta0,a3
+ subu a2,4
+ PTR_ADD a0,32
+ PTR_ADD a1,32
+ daddu ta1,v0
+ sltu v0,ta1,v0
+ mflo AT
+ mfhi ta0
+ daddu ta1,AT
+ daddu v0,ta0
+ sltu AT,ta1,AT
+ sd ta1,-16(a0)
+ daddu v0,AT
+
+
+ dmultu ta2,a3
+ and ta0,a2,MINUS4
+ daddu ta3,v0
+ sltu v0,ta3,v0
+ mflo AT
+ mfhi ta2
+ daddu ta3,AT
+ daddu v0,ta2
+ sltu AT,ta3,AT
+ sd ta3,-8(a0)
+ daddu v0,AT
+ .set noreorder
+ bgtzl ta0,.L_bn_mul_add_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_mul_add_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_mul_add_words_return:
+ jr ra
+
+.L_bn_mul_add_words_tail:
+ dmultu t0,a3
+ ld t1,0(a0)
+ subu a2,1
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,0(a0)
+ daddu v0,AT
+ beqz a2,.L_bn_mul_add_words_return
+
+ ld t0,8(a1)
+ dmultu t0,a3
+ ld t1,8(a0)
+ subu a2,1
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,8(a0)
+ daddu v0,AT
+ beqz a2,.L_bn_mul_add_words_return
+
+ ld t0,16(a1)
+ dmultu t0,a3
+ ld t1,16(a0)
+ daddu t1,v0
+ sltu v0,t1,v0
+ mflo AT
+ mfhi t0
+ daddu t1,AT
+ daddu v0,t0
+ sltu AT,t1,AT
+ sd t1,16(a0)
+ daddu v0,AT
+ jr ra
+END(bn_mul_add_words)
+
+.align 5
+LEAF(bn_mul_words)
+ .set noreorder
+ bgtzl a2,.L_bn_mul_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_mul_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+ dmultu t0,a3
+ ld t2,8(a1)
+ ld ta0,16(a1)
+ ld ta2,24(a1)
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,0(a0)
+ daddu v0,t1,t0
+
+ dmultu t2,a3
+ subu a2,4
+ PTR_ADD a0,32
+ PTR_ADD a1,32
+ mflo AT
+ mfhi t2
+ daddu v0,AT
+ sltu t3,v0,AT
+ sd v0,-24(a0)
+ daddu v0,t3,t2
+
+ dmultu ta0,a3
+ mflo AT
+ mfhi ta0
+ daddu v0,AT
+ sltu ta1,v0,AT
+ sd v0,-16(a0)
+ daddu v0,ta1,ta0
+
+
+ dmultu ta2,a3
+ and ta0,a2,MINUS4
+ mflo AT
+ mfhi ta2
+ daddu v0,AT
+ sltu ta3,v0,AT
+ sd v0,-8(a0)
+ daddu v0,ta3,ta2
+ .set noreorder
+ bgtzl ta0,.L_bn_mul_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_mul_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_mul_words_return:
+ jr ra
+
+.L_bn_mul_words_tail:
+ dmultu t0,a3
+ subu a2,1
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,0(a0)
+ daddu v0,t1,t0
+ beqz a2,.L_bn_mul_words_return
+
+ ld t0,8(a1)
+ dmultu t0,a3
+ subu a2,1
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,8(a0)
+ daddu v0,t1,t0
+ beqz a2,.L_bn_mul_words_return
+
+ ld t0,16(a1)
+ dmultu t0,a3
+ mflo AT
+ mfhi t0
+ daddu v0,AT
+ sltu t1,v0,AT
+ sd v0,16(a0)
+ daddu v0,t1,t0
+ jr ra
+END(bn_mul_words)
+
+.align 5
+LEAF(bn_sqr_words)
+ .set noreorder
+ bgtzl a2,.L_bn_sqr_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_sqr_words_proceed:
+ li MINUS4,-4
+ and ta0,a2,MINUS4
+ move v0,zero
+ beqz ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+ dmultu t0,t0
+ ld t2,8(a1)
+ ld ta0,16(a1)
+ ld ta2,24(a1)
+ mflo t1
+ mfhi t0
+ sd t1,0(a0)
+ sd t0,8(a0)
+
+ dmultu t2,t2
+ subu a2,4
+ PTR_ADD a0,64
+ PTR_ADD a1,32
+ mflo t3
+ mfhi t2
+ sd t3,-48(a0)
+ sd t2,-40(a0)
+
+ dmultu ta0,ta0
+ mflo ta1
+ mfhi ta0
+ sd ta1,-32(a0)
+ sd ta0,-24(a0)
+
+
+ dmultu ta2,ta2
+ and ta0,a2,MINUS4
+ mflo ta3
+ mfhi ta2
+ sd ta3,-16(a0)
+ sd ta2,-8(a0)
+
+ .set noreorder
+ bgtzl ta0,.L_bn_sqr_words_loop
+ ld t0,0(a1)
+
+ bnezl a2,.L_bn_sqr_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_sqr_words_return:
+ move v0,zero
+ jr ra
+
+.L_bn_sqr_words_tail:
+ dmultu t0,t0
+ subu a2,1
+ mflo t1
+ mfhi t0
+ sd t1,0(a0)
+ sd t0,8(a0)
+ beqz a2,.L_bn_sqr_words_return
+
+ ld t0,8(a1)
+ dmultu t0,t0
+ subu a2,1
+ mflo t1
+ mfhi t0
+ sd t1,16(a0)
+ sd t0,24(a0)
+ beqz a2,.L_bn_sqr_words_return
+
+ ld t0,16(a1)
+ dmultu t0,t0
+ mflo t1
+ mfhi t0
+ sd t1,32(a0)
+ sd t0,40(a0)
+ jr ra
+END(bn_sqr_words)
+
+.align 5
+LEAF(bn_add_words)
+ .set noreorder
+ bgtzl a3,.L_bn_add_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_add_words_proceed:
+ li MINUS4,-4
+ and AT,a3,MINUS4
+ move v0,zero
+ beqz AT,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+ ld ta0,0(a2)
+ subu a3,4
+ ld t1,8(a1)
+ and AT,a3,MINUS4
+ ld t2,16(a1)
+ PTR_ADD a2,32
+ ld t3,24(a1)
+ PTR_ADD a0,32
+ ld ta1,-24(a2)
+ PTR_ADD a1,32
+ ld ta2,-16(a2)
+ ld ta3,-8(a2)
+ daddu ta0,t0
+ sltu t8,ta0,t0
+ daddu t0,ta0,v0
+ sltu v0,t0,ta0
+ sd t0,-32(a0)
+ daddu v0,t8
+
+ daddu ta1,t1
+ sltu t9,ta1,t1
+ daddu t1,ta1,v0
+ sltu v0,t1,ta1
+ sd t1,-24(a0)
+ daddu v0,t9
+
+ daddu ta2,t2
+ sltu t8,ta2,t2
+ daddu t2,ta2,v0
+ sltu v0,t2,ta2
+ sd t2,-16(a0)
+ daddu v0,t8
- subu P3,P3,4
- b $lab55
- bltz P3,$lab55
-$lab54:
- ld L1,0(P2)
- ld L2,8(P2)
- ld L3,16(P2)
- ld L4,24(P2)
-
- dmultu L1,L1
- subu P3,P3,4
- mflo L1
- mfhi H1
- sd L1,0(P1)
- sd H1,8(P1)
-
- dmultu L2,L2
- daddu P1,P1,32
- mflo L2
- mfhi H2
- sd L2,-48(P1)
- sd H2,-40(P1)
-
- dmultu L3,L3
- daddu P2,P2,32
- mflo L3
- mfhi H3
- sd L3,-32(P1)
- sd H3,-24(P1)
-
- dmultu L4,L4
-
- mflo L4
- mfhi H4
- sd L4,-16(P1)
- sd H4,-8(P1)
-
- bgtz P3,$lab54
- b $lab55
-$lab56:
- ld L1,0(P2)
- daddu P1,P1,16
- dmultu L1,L1
- daddu P2,P2,8
- subu P3,P3,1
- mflo L1
- mfhi H1
- sd L1,-16(P1)
- sd H1,-8(P1)
-
- bgtz P3,$lab56
- j $31
-$lab55:
- daddu P3,P3,4
- bgtz P3,$lab56
- j $31
- .align 2
- .end bn_sqr_words
-
- # Program Unit: bn_add_words
- .ent bn_add_words
- .globl bn_add_words
-.text
-bn_add_words: # 0x590
- .frame $sp,0,$31
- .mask 0x00000000,0
- .fmask 0x00000000,0
+ daddu ta3,t3
+ sltu t9,ta3,t3
+ daddu t3,ta3,v0
+ sltu v0,t3,ta3
+ sd t3,-8(a0)
+ daddu v0,t9
- subu P4,P4,4
- move CC,$0
- bltz P4,$lab65
-$lab64:
- ld L1,0(P2)
- ld R1,0(P3)
- ld L2,8(P2)
- ld R2,8(P3)
-
- daddu L1,L1,CC
- ld L3,16(P2)
- sltu CC,L1,CC
- daddu L1,L1,R1
- sltu R1,L1,R1
- ld R3,16(P3)
- daddu CC,CC,R1
- ld L4,24(P2)
-
- daddu L2,L2,CC
- ld R4,24(P3)
- sltu CC,L2,CC
- daddu L2,L2,R2
- sltu R2,L2,R2
- sd L1,0(P1)
- daddu CC,CC,R2
- daddu P1,P1,32
- daddu L3,L3,CC
- sd L2,-24(P1)
-
- sltu CC,L3,CC
- daddu L3,L3,R3
- sltu R3,L3,R3
- daddu P2,P2,32
- daddu CC,CC,R3
-
- daddu L4,L4,CC
- daddu P3,P3,32
- sltu CC,L4,CC
- daddu L4,L4,R4
- sltu R4,L4,R4
- subu P4,P4,4
- sd L3,-16(P1)
- daddu CC,CC,R4
- sd L4,-8(P1)
-
- bgtz P4,$lab64
- b $lab65
-$lab66:
- ld L1,0(P2)
- ld R1,0(P3)
- daddu L1,L1,CC
- daddu P1,P1,8
- sltu CC,L1,CC
- daddu P2,P2,8
- daddu P3,P3,8
- daddu L1,L1,R1
- subu P4,P4,1
- sltu R1,L1,R1
- sd L1,-8(P1)
- daddu CC,CC,R1
-
- bgtz P4,$lab66
- j $31
-$lab65:
- addu P4,P4,4
- bgtz P4,$lab66
- j $31
- .end bn_add_words
-
-#if 1
- # Program Unit: bn_div64
- .set at
+ .set noreorder
+ bgtzl AT,.L_bn_add_words_loop
+ ld t0,0(a1)
+
+ bnezl a3,.L_bn_add_words_tail
+ ld t0,0(a1)
.set reorder
- .text
- .align 2
- .globl bn_div64
- # 321 {
- .ent bn_div64
-bn_div64:
- dsubu $sp, 64
- sd $31, 56($sp)
- sd $16, 48($sp)
- .mask 0x80010000, -56
- .frame $sp, 64, $31
- move $9, $4
- move $12, $5
- move $16, $6
- # 322 BN_ULONG dh,dl,q,ret=0,th,tl,t;
- move $31, $0
- # 323 int i,count=2;
- li $13, 2
- # 324
- # 325 if (d == 0) return(BN_MASK2);
- bne $16, 0, $80
- dli $2, -1
- b $93
-$80:
- # 326
- # 327 i=BN_num_bits_word(d);
- move $4, $16
- sd $31, 16($sp)
- sd $9, 24($sp)
- sd $12, 32($sp)
- sd $13, 40($sp)
- .livereg 0x800ff0e,0xfff
- jal BN_num_bits_word
- dli $4, 64
- ld $31, 16($sp)
- ld $9, 24($sp)
- ld $12, 32($sp)
- ld $13, 40($sp)
- move $3, $2
- # 328 if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
- beq $2, $4, $81
- dli $14, 1
- dsll $15, $14, $2
- bleu $9, $15, $81
- # 329 {
- # 330 #if !defined(NO_STDIO) && !defined(WIN16)
- # 331 fprintf(stderr,"Division would overflow (%d)\n",i);
- # 332 #endif
- # 333 abort();
- sd $3, 8($sp)
- sd $31, 16($sp)
- sd $9, 24($sp)
- sd $12, 32($sp)
- sd $13, 40($sp)
- .livereg 0xff0e,0xfff
- jal abort
- dli $4, 64
- ld $3, 8($sp)
- ld $31, 16($sp)
- ld $9, 24($sp)
- ld $12, 32($sp)
- ld $13, 40($sp)
- # 334 }
-$81:
- # 335 i=BN_BITS2-i;
- dsubu $3, $4, $3
- # 336 if (h >= d) h-=d;
- bltu $9, $16, $82
- dsubu $9, $9, $16
-$82:
- # 337
- # 338 if (i)
- beq $3, 0, $83
- # 339 {
- # 340 d<<=i;
- dsll $16, $16, $3
- # 341 h=(h<<i)|(l>>(BN_BITS2-i));
- dsll $24, $9, $3
- dsubu $25, $4, $3
- dsrl $14, $12, $25
- or $9, $24, $14
- # 342 l<<=i;
- dsll $12, $12, $3
- # 343 }
-$83:
- # 344 dh=(d&BN_MASK2h)>>BN_BITS4;
- # 345 dl=(d&BN_MASK2l);
- and $8, $16,0xFFFFFFFF00000000
- dsrl $8, $8, 32
- # dli $10,0xFFFFFFFF # Is this needed?
- # and $10, $16, $10
- dsll $10, $16, 32
- dsrl $10, $10, 32
- dli $6,0xFFFFFFFF00000000
-$84:
- # 346 for (;;)
- # 347 {
- # 348 if ((h>>BN_BITS4) == dh)
- dsrl $15, $9, 32
- bne $8, $15, $85
- # 349 q=BN_MASK2l;
- dli $5, 0xFFFFFFFF
- b $86
-$85:
- # 350 else
- # 351 q=h/dh;
- ddivu $5, $9, $8
-$86:
- # 352
- # 353 for (;;)
- # 354 {
- # 355 t=(h-q*dh);
- dmul $4, $5, $8
- dsubu $2, $9, $4
- move $3, $2
- # 356 if ((t&BN_MASK2h) ||
- # 357 ((dl*q) <= (
- # 358 (t<<BN_BITS4)+
- # 359 ((l&BN_MASK2h)>>BN_BITS4))))
- and $25, $2, $6
- bne $25, $0, $87
- dmul $24, $10, $5
- dsll $14, $3, 32
- and $15, $12, $6
- dsrl $25, $15, 32
- daddu $15, $14, $25
- bgtu $24, $15, $88
-$87:
- # 360 break;
- dmul $3, $10, $5
- b $89
-$88:
- # 361 q--;
- daddu $5, $5, -1
- # 362 }
- b $86
-$89:
- # 363 th=q*dh;
- # 364 tl=q*dl;
- # 365 t=(tl>>BN_BITS4);
- # 366 tl=(tl<<BN_BITS4)&BN_MASK2h;
- dsll $14, $3, 32
- and $2, $14, $6
- move $11, $2
- # 367 th+=t;
- dsrl $25, $3, 32
- daddu $7, $4, $25
- # 368
- # 369 if (l < tl) th++;
- bgeu $12, $2, $90
- daddu $7, $7, 1
-$90:
- # 370 l-=tl;
- dsubu $12, $12, $11
- # 371 if (h < th)
- bgeu $9, $7, $91
- # 372 {
- # 373 h+=d;
- daddu $9, $9, $16
- # 374 q--;
- daddu $5, $5, -1
- # 375 }
-$91:
- # 376 h-=th;
- dsubu $9, $9, $7
- # 377
- # 378 if (--count == 0) break;
- addu $13, $13, -1
- beq $13, 0, $92
- # 379
- # 380 ret=q<<BN_BITS4;
- dsll $31, $5, 32
- # 381 h=((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2;
- dsll $24, $9, 32
- dsrl $15, $12, 32
- or $9, $24, $15
- # 382 l=(l&BN_MASK2l)<<BN_BITS4;
- and $12, $12, 0xFFFFFFFF
- dsll $12, $12, 32
- # 383 }
- b $84
-$92:
- # 384 ret|=q;
- or $31, $31, $5
- # 385 return(ret);
- move $2, $31
-$93:
- ld $16, 48($sp)
- ld $31, 56($sp)
- daddu $sp, 64
- j $31
- .end bn_div64
-#endif
+
+.L_bn_add_words_return:
+ jr ra
+
+.L_bn_add_words_tail:
+ ld ta0,0(a2)
+ daddu ta0,t0
+ subu a3,1
+ sltu t8,ta0,t0
+ daddu t0,ta0,v0
+ sltu v0,t0,ta0
+ sd t0,0(a0)
+ daddu v0,t8
+ beqz a3,.L_bn_add_words_return
+
+ ld t1,8(a1)
+ ld ta1,8(a2)
+ daddu ta1,t1
+ subu a3,1
+ sltu t9,ta1,t1
+ daddu t1,ta1,v0
+ sltu v0,t1,ta1
+ sd t1,8(a0)
+ daddu v0,t9
+ beqz a3,.L_bn_add_words_return
+
+ ld t2,16(a1)
+ ld ta2,16(a2)
+ daddu ta2,t2
+ sltu t8,ta2,t2
+ daddu t2,ta2,v0
+ sltu v0,t2,ta2
+ sd t2,16(a0)
+ daddu v0,t8
+ jr ra
+END(bn_add_words)
+
+.align 5
+LEAF(bn_sub_words)
+ .set noreorder
+ bgtzl a3,.L_bn_sub_words_proceed
+ ld t0,0(a1)
+ jr ra
+ move v0,zero
+ .set reorder
+
+.L_bn_sub_words_proceed:
+ li MINUS4,-4
+ and AT,a3,MINUS4
+ move v0,zero
+ beqz AT,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+ ld ta0,0(a2)
+ subu a3,4
+ ld t1,8(a1)
+ and AT,a3,MINUS4
+ ld t2,16(a1)
+ PTR_ADD a2,32
+ ld t3,24(a1)
+ PTR_ADD a0,32
+ ld ta1,-24(a2)
+ PTR_ADD a1,32
+ ld ta2,-16(a2)
+ ld ta3,-8(a2)
+ sltu t8,t0,ta0
+ dsubu t0,ta0
+ dsubu ta0,t0,v0
+ sd ta0,-32(a0)
+ MOVNZ (t0,v0,t8)
+
+ sltu t9,t1,ta1
+ dsubu t1,ta1
+ dsubu ta1,t1,v0
+ sd ta1,-24(a0)
+ MOVNZ (t1,v0,t9)
+
+
+ sltu t8,t2,ta2
+ dsubu t2,ta2
+ dsubu ta2,t2,v0
+ sd ta2,-16(a0)
+ MOVNZ (t2,v0,t8)
+
+ sltu t9,t3,ta3
+ dsubu t3,ta3
+ dsubu ta3,t3,v0
+ sd ta3,-8(a0)
+ MOVNZ (t3,v0,t9)
+
+ .set noreorder
+ bgtzl AT,.L_bn_sub_words_loop
+ ld t0,0(a1)
+
+ bnezl a3,.L_bn_sub_words_tail
+ ld t0,0(a1)
+ .set reorder
+
+.L_bn_sub_words_return:
+ jr ra
+
+.L_bn_sub_words_tail:
+ ld ta0,0(a2)
+ subu a3,1
+ sltu t8,t0,ta0
+ dsubu t0,ta0
+ dsubu ta0,t0,v0
+ MOVNZ (t0,v0,t8)
+ sd ta0,0(a0)
+ beqz a3,.L_bn_sub_words_return
+
+ ld t1,8(a1)
+ subu a3,1
+ ld ta1,8(a2)
+ sltu t9,t1,ta1
+ dsubu t1,ta1
+ dsubu ta1,t1,v0
+ MOVNZ (t1,v0,t9)
+ sd ta1,8(a0)
+ beqz a3,.L_bn_sub_words_return
+
+ ld t2,16(a1)
+ ld ta2,16(a2)
+ sltu t8,t2,ta2
+ dsubu t2,ta2
+ dsubu ta2,t2,v0
+ MOVNZ (t2,v0,t8)
+ sd ta2,16(a0)
+ jr ra
+END(bn_sub_words)
+
+#undef MINUS4
+
+.align 5
+LEAF(bn_div_3_words)
+ .set reorder
+ move a3,a0 /* we know that bn_div_words doesn't
+ * touch a3, ta2, ta3 and preserves a2
+ * so that we can save two arguments
+ * and return address in registers
+ * instead of stack:-)
+ */
+ ld a0,(a3)
+ move ta2,a1
+ ld a1,-8(a3)
+ bne a0,a2,.L_bn_div_3_words_proceed
+ li v0,-1
+ jr ra
+.L_bn_div_3_words_proceed:
+ move ta3,ra
+ bal bn_div_words
+ move ra,ta3
+ dmultu ta2,v0
+ ld t2,-16(a3)
+ move ta0,zero
+ mfhi t1
+ mflo t0
+ sltu t8,t1,v1
+.L_bn_div_3_words_inner_loop:
+ bnez t8,.L_bn_div_3_words_inner_loop_done
+ sgeu AT,t2,t0
+ seq t9,t1,v1
+ and AT,t9
+ sltu t3,t0,ta2
+ daddu v1,a2
+ dsubu t1,t3
+ dsubu t0,ta2
+ sltu t8,t1,v1
+ sltu ta0,v1,a2
+ or t8,ta0
+ .set noreorder
+ beqzl AT,.L_bn_div_3_words_inner_loop
+ dsubu v0,1
+ .set reorder
+.L_bn_div_3_words_inner_loop_done:
+ jr ra
+END(bn_div_3_words)
+
+.align 5
+LEAF(bn_div_words)
+ .set noreorder
+ bnezl a2,.L_bn_div_words_proceed
+ move v1,zero
+ jr ra
+ li v0,-1 /* I'd rather signal div-by-zero
+ * which can be done with 'break 7' */
+
+.L_bn_div_words_proceed:
+ bltz a2,.L_bn_div_words_body
+ move t9,v1
+ dsll a2,1
+ bgtz a2,.-4
+ addu t9,1
+
+ .set reorder
+ negu t1,t9
+ li t2,-1
+ dsll t2,t1
+ and t2,a0
+ dsrl AT,a1,t1
+ .set noreorder
+ bnezl t2,.+8
+ break 6 /* signal overflow */
+ .set reorder
+ dsll a0,t9
+ dsll a1,t9
+ or a0,AT
+
+#define QT ta0
+#define HH ta1
+#define DH v1
+.L_bn_div_words_body:
+ dsrl DH,a2,32
+ sgeu AT,a0,a2
+ .set noreorder
+ bnezl AT,.+8
+ dsubu a0,a2
+ .set reorder
+
+ li QT,-1
+ dsrl HH,a0,32
+ dsrl QT,32 /* q=0xffffffff */
+ beq DH,HH,.L_bn_div_words_skip_div1
+ ddivu zero,a0,DH
+ mflo QT
+.L_bn_div_words_skip_div1:
+ dmultu a2,QT
+ dsll t3,a0,32
+ dsrl AT,a1,32
+ or t3,AT
+ mflo t0
+ mfhi t1
+.L_bn_div_words_inner_loop1:
+ sltu t2,t3,t0
+ seq t8,HH,t1
+ sltu AT,HH,t1
+ and t2,t8
+ sltu v0,t0,a2
+ or AT,t2
+ .set noreorder
+ beqz AT,.L_bn_div_words_inner_loop1_done
+ dsubu t1,v0
+ dsubu t0,a2
+ b .L_bn_div_words_inner_loop1
+ dsubu QT,1
+ .set reorder
+.L_bn_div_words_inner_loop1_done:
+
+ dsll a1,32
+ dsubu a0,t3,t0
+ dsll v0,QT,32
+
+ li QT,-1
+ dsrl HH,a0,32
+ dsrl QT,32 /* q=0xffffffff */
+ beq DH,HH,.L_bn_div_words_skip_div2
+ ddivu zero,a0,DH
+ mflo QT
+.L_bn_div_words_skip_div2:
+#undef DH
+ dmultu a2,QT
+ dsll t3,a0,32
+ dsrl AT,a1,32
+ or t3,AT
+ mflo t0
+ mfhi t1
+.L_bn_div_words_inner_loop2:
+ sltu t2,t3,t0
+ seq t8,HH,t1
+ sltu AT,HH,t1
+ and t2,t8
+ sltu v1,t0,a2
+ or AT,t2
+ .set noreorder
+ beqz AT,.L_bn_div_words_inner_loop2_done
+ dsubu t1,v1
+ dsubu t0,a2
+ b .L_bn_div_words_inner_loop2
+ dsubu QT,1
+ .set reorder
+.L_bn_div_words_inner_loop2_done:
+#undef HH
+
+ dsubu a0,t3,t0
+ or v0,QT
+ dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
+ dsrl a2,t9 /* restore a2 */
+ jr ra
+#undef QT
+END(bn_div_words)
+
+#define a_0 t0
+#define a_1 t1
+#define a_2 t2
+#define a_3 t3
+#define b_0 ta0
+#define b_1 ta1
+#define b_2 ta2
+#define b_3 ta3
+
+#define a_4 s0
+#define a_5 s2
+#define a_6 s4
+#define a_7 a1 /* once we load a[7] we don't need a anymore */
+#define b_4 s1
+#define b_5 s3
+#define b_6 s5
+#define b_7 a2 /* once we load b[7] we don't need b anymore */
+
+#define t_1 t8
+#define t_2 t9
+
+#define c_1 v0
+#define c_2 v1
+#define c_3 a3
+
+#define FRAME_SIZE 48
+
+.align 5
+LEAF(bn_mul_comba8)
+ .set noreorder
+ PTR_SUB sp,FRAME_SIZE
+ .frame sp,64,ra
+ .set reorder
+ ld a_0,0(a1) /* If compiled with -mips3 option on
+ * R5000 box assembler barks on this
+ * line with "shouldn't have mult/div
+ * as last instruction in bb (R10K
+ * bug)" warning. If anybody out there
+ * has a clue about how to circumvent
+ * this do send me a note.
+ * <appro@fy.chalmers.se>
+ */
+ ld b_0,0(a2)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+ ld b_1,8(a2)
+ ld b_2,16(a2)
+ ld b_3,24(a2)
+ dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ sd s0,0(sp)
+ sd s1,8(sp)
+ sd s2,16(sp)
+ sd s3,24(sp)
+ sd s4,32(sp)
+ sd s5,40(sp)
+ mflo c_1
+ mfhi c_2
+
+ dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
+ ld a_4,32(a1)
+ ld a_5,40(a1)
+ ld a_6,48(a1)
+ ld a_7,56(a1)
+ ld b_4,32(a2)
+ ld b_5,40(a2)
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
+ ld b_6,48(a2)
+ ld b_7,56(a2)
+ sd c_1,0(a0) /* r[0]=c1; */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ sd c_2,8(a0) /* r[1]=c2; */
+
+ dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0) /* r[2]=c3; */
+
+ dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0) /* r[3]=c1; */
+
+ dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0) /* r[4]=c2; */
+
+ dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0) /* r[5]=c3; */
+
+ dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,48(a0) /* r[6]=c1; */
+
+ dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,56(a0) /* r[7]=c2; */
+
+ dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,64(a0) /* r[8]=c3; */
+
+ dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,72(a0) /* r[9]=c1; */
+
+ dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,80(a0) /* r[10]=c2; */
+
+ dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,88(a0) /* r[11]=c3; */
+
+ dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,96(a0) /* r[12]=c1; */
+
+ dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,104(a0) /* r[13]=c2; */
+
+ dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
+ ld s0,0(sp)
+ ld s1,8(sp)
+ ld s2,16(sp)
+ ld s3,24(sp)
+ ld s4,32(sp)
+ ld s5,40(sp)
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sd c_3,112(a0) /* r[14]=c3; */
+ sd c_1,120(a0) /* r[15]=c1; */
+
+ PTR_ADD sp,FRAME_SIZE
+
+ jr ra
+END(bn_mul_comba8)
+
+.align 5
+LEAF(bn_mul_comba4)
+ .set reorder
+ ld a_0,0(a1)
+ ld b_0,0(a2)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ ld a_3,24(a1)
+ ld b_1,8(a2)
+ ld b_2,16(a2)
+ ld b_3,24(a2)
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ sd c_2,8(a0)
+
+ dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu c_3,c_2,t_2
+ dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,24(a0)
+
+ dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu c_1,c_3,t_2
+ dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu c_2,c_1,t_2
+ dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,40(a0)
+
+ dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sd c_1,48(a0)
+ sd c_2,56(a0)
+
+ jr ra
+END(bn_mul_comba4)
+
+#undef a_4
+#undef a_5
+#undef a_6
+#undef a_7
+#define a_4 b_0
+#define a_5 b_1
+#define a_6 b_2
+#define a_7 b_3
+
+.align 5
+LEAF(bn_sqr_comba8)
+ .set reorder
+ ld a_0,0(a1)
+ ld a_1,8(a1)
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+
+ dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ ld a_4,32(a1)
+ ld a_5,40(a1)
+ ld a_6,48(a1)
+ ld a_7,56(a1)
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ sd c_2,8(a0)
+
+ dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu c_3,c_2,AT
+ daddu c_2,t_2
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ sd c_1,24(a0)
+
+ dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu c_1,c_3,AT
+ daddu c_3,t_2
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu AT,c_3,AT
+ daddu c_3,t_2
+ daddu c_1,AT
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu AT,c_1,AT
+ daddu c_1,t_2
+ daddu c_2,AT
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu AT,c_1,AT
+ daddu c_1,t_2
+ daddu c_2,AT
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ sd c_3,40(a0)
+
+ dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu c_3,c_2,AT
+ daddu c_2,t_2
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,48(a0)
+
+ dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu c_1,c_3,AT
+ daddu c_3,t_2
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu AT,c_3,AT
+ daddu c_3,t_2
+ daddu c_1,AT
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu AT,c_3,AT
+ daddu c_3,t_2
+ daddu c_1,AT
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu AT,c_3,AT
+ daddu c_3,t_2
+ daddu c_1,AT
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ sd c_2,56(a0)
+
+ dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu AT,c_1,AT
+ daddu c_1,t_2
+ daddu c_2,AT
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu AT,c_1,AT
+ daddu c_1,t_2
+ daddu c_2,AT
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,64(a0)
+
+ dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu c_3,c_2,AT
+ daddu c_2,t_2
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ sd c_1,72(a0)
+
+ dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu c_1,c_3,AT
+ daddu c_3,t_2
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu AT,c_3,AT
+ daddu c_3,t_2
+ daddu c_1,AT
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,80(a0)
+
+ dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu AT,c_1,AT
+ daddu c_1,t_2
+ daddu c_2,AT
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ sd c_3,88(a0)
+
+ dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu c_3,c_2,AT
+ daddu c_2,t_2
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sltu AT,c_2,t_2
+ daddu c_3,AT
+ sd c_1,96(a0)
+
+ dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu c_1,c_3,AT
+ daddu c_3,t_2
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ sd c_2,104(a0)
+
+ dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sd c_3,112(a0)
+ sd c_1,120(a0)
+
+ jr ra
+END(bn_sqr_comba8)
+
+.align 5
+LEAF(bn_sqr_comba4)
+ .set reorder
+ ld a_0,0(a1)
+ ld a_1,8(a1)
+ dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
+ ld a_2,16(a1)
+ ld a_3,24(a1)
+ mflo c_1
+ mfhi c_2
+ sd c_1,0(a0)
+
+ dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ slt c_1,t_2,zero
+ dsll t_2,1
+ slt a2,t_1,zero
+ daddu t_2,a2
+ dsll t_1,1
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_3,t_2,AT
+ sd c_2,8(a0)
+
+ dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu t_2,AT
+ daddu c_1,t_2
+ sltu AT,c_1,t_2
+ daddu c_2,AT
+ sd c_3,16(a0)
+
+ dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu c_3,c_2,AT
+ daddu c_2,t_2
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu c_1,t_1
+ daddu AT,t_2
+ sltu t_1,c_1,t_1
+ daddu c_2,AT
+ daddu t_2,t_1
+ sltu AT,c_2,AT
+ daddu c_2,t_2
+ daddu c_3,AT
+ sltu t_2,c_2,t_2
+ daddu c_3,t_2
+ sd c_1,24(a0)
+
+ dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu c_2,t_1
+ daddu AT,t_2
+ sltu t_1,c_2,t_1
+ daddu c_3,AT
+ daddu t_2,t_1
+ sltu c_1,c_3,AT
+ daddu c_3,t_2
+ sltu t_2,c_3,t_2
+ daddu c_1,t_2
+ dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
+ mflo t_1
+ mfhi t_2
+ daddu c_2,t_1
+ sltu AT,c_2,t_1
+ daddu t_2,AT
+ daddu c_3,t_2
+ sltu AT,c_3,t_2
+ daddu c_1,AT
+ sd c_2,32(a0)
+
+ dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */
+ mflo t_1
+ mfhi t_2
+ daddu c_3,t_1
+ sltu AT,c_3,t_1
+ daddu c_3,t_1
+ daddu AT,t_2
+ sltu t_1,c_3,t_1
+ daddu c_1,AT
+ daddu t_2,t_1
+ sltu c_2,c_1,AT
+ daddu c_1,t_2
+ sltu t_2,c_1,t_2
+ daddu c_2,t_2
+ sd c_3,40(a0)
+
+ dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
+ mflo t_1
+ mfhi t_2
+ daddu c_1,t_1
+ sltu AT,c_1,t_1
+ daddu t_2,AT
+ daddu c_2,t_2
+ sd c_1,48(a0)
+ sd c_2,56(a0)
+
+ jr ra
+END(bn_sqr_comba4)