]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
da4d239d AP |
9 | # |
10 | # ==================================================================== | |
e3713c36 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
da4d239d AP |
12 | # project. |
13 | # | |
14 | # Rights for redistribution and usage in source and binary forms are | |
15 | # granted according to the OpenSSL license. Warranty of any kind is | |
16 | # disclaimed. | |
17 | # ==================================================================== | |
18 | ||
19 | ||
20 | # July 1999 | |
21 | # | |
22 | # This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. | |
23 | # | |
24 | # The module is designed to work with either of the "new" MIPS ABI(5), | |
60250017 | 25 | # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under |
da4d239d AP |
26 | # IRIX 5.x not only because it doesn't support new ABIs but also |
27 | # because 5.x kernels put R4x00 CPU into 32-bit mode and all those | |
28 | # 64-bit instructions (daddu, dmultu, etc.) found below would only | |
29 | # cause an illegal instruction exception:-( | |
30 | # | |
31 | # In addition the code depends on preprocessor flags set up by MIPSpro | |
32 | # compiler driver (either as or cc) and therefore (probably?) can't be | |
33 | # compiled by the GNU assembler. GNU C driver manages fine though... | |
34 | # I mean as long as -mmips-as is specified or is the default option, | |
35 | # because then it simply invokes /usr/bin/as which in turn takes | |
36 | # perfect care of the preprocessor definitions. Another neat feature | |
37 | # offered by the MIPSpro assembler is an optimization pass. This gave | |
38 | # me the opportunity to have the code looking more regular as all those | |
39 | # architecture dependent instruction rescheduling details were left to | |
40 | # the assembler. Cool, huh? | |
41 | # | |
42 | # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' | |
43 | # goes way over 3 times faster! | |
44 | # | |
e3713c36 | 45 | # <appro@openssl.org> |
da4d239d AP |
46 | |
47 | # October 2010 | |
48 | # | |
49 | # Adapt the module even for 32-bit ABIs and other OSes. The former was | |
50 | # achieved by mechanical replacement of 64-bit arithmetic instructions | |
51 | # such as dmultu, daddu, etc. with their 32-bit counterparts and | |
52 | # adjusting offsets denoting multiples of BN_ULONG. Above mentioned | |
53 | # >3x performance improvement naturally does not apply to 32-bit code | |
54 | # [because there is no instruction a 32-bit compiler can't use], one | |
55 | # has to be content with 40-85% improvement depending on benchmark and | |
56 | # key length, more for longer keys. | |
57 | ||
1a002d88 | 58 | $flavour = shift || "o32"; |
a5aa63a4 | 59 | while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} |
d4665887 | 60 | open STDOUT,">$output"; |
da4d239d AP |
61 | |
62 | if ($flavour =~ /64|n32/i) { | |
63 | $LD="ld"; | |
64 | $ST="sd"; | |
65 | $MULTU="dmultu"; | |
66 | $DIVU="ddivu"; | |
67 | $ADDU="daddu"; | |
68 | $SUBU="dsubu"; | |
69 | $SRL="dsrl"; | |
70 | $SLL="dsll"; | |
71 | $BNSZ=8; | |
72 | $PTR_ADD="daddu"; | |
73 | $PTR_SUB="dsubu"; | |
74 | $SZREG=8; | |
75 | $REG_S="sd"; | |
76 | $REG_L="ld"; | |
77 | } else { | |
78 | $LD="lw"; | |
79 | $ST="sw"; | |
80 | $MULTU="multu"; | |
81 | $DIVU="divu"; | |
82 | $ADDU="addu"; | |
83 | $SUBU="subu"; | |
84 | $SRL="srl"; | |
85 | $SLL="sll"; | |
86 | $BNSZ=4; | |
87 | $PTR_ADD="addu"; | |
88 | $PTR_SUB="subu"; | |
89 | $SZREG=4; | |
90 | $REG_S="sw"; | |
91 | $REG_L="lw"; | |
92 | $code=".set mips2\n"; | |
93 | } | |
94 | ||
95 | # Below is N32/64 register layout used in the original module. | |
96 | # | |
97 | ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); | |
98 | ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); | |
99 | ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); | |
100 | ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); | |
101 | ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); | |
102 | ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); | |
103 | # | |
104 | # No special adaptation is required for O32. NUBI on the other hand | |
105 | # is treated by saving/restoring ($v1,$t0..$t3). | |
106 | ||
107 | $gp=$v1 if ($flavour =~ /nubi/i); | |
108 | ||
109 | $minus4=$v1; | |
110 | ||
111 | $code.=<<___; | |
947716c1 AP |
112 | #include "mips_arch.h" |
113 | ||
114 | #if defined(_MIPS_ARCH_MIPS64R6) | |
115 | # define ddivu(rs,rt) | |
116 | # define mfqt(rd,rs,rt) ddivu rd,rs,rt | |
117 | # define mfrm(rd,rs,rt) dmodu rd,rs,rt | |
118 | #elif defined(_MIPS_ARCH_MIPS32R6) | |
119 | # define divu(rs,rt) | |
120 | # define mfqt(rd,rs,rt) divu rd,rs,rt | |
121 | # define mfrm(rd,rs,rt) modu rd,rs,rt | |
122 | #else | |
123 | # define $DIVU(rs,rt) $DIVU $zero,rs,rt | |
124 | # define mfqt(rd,rs,rt) mflo rd | |
125 | # define mfrm(rd,rs,rt) mfhi rd | |
126 | #endif | |
127 | ||
da4d239d AP |
128 | .rdata |
129 | .asciiz "mips3.s, Version 1.2" | |
130 | .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" | |
131 | ||
132 | .text | |
133 | .set noat | |
134 | ||
135 | .align 5 | |
136 | .globl bn_mul_add_words | |
137 | .ent bn_mul_add_words | |
138 | bn_mul_add_words: | |
139 | .set noreorder | |
140 | bgtz $a2,bn_mul_add_words_internal | |
141 | move $v0,$zero | |
142 | jr $ra | |
143 | move $a0,$v0 | |
144 | .end bn_mul_add_words | |
145 | ||
146 | .align 5 | |
147 | .ent bn_mul_add_words_internal | |
148 | bn_mul_add_words_internal: | |
149 | ___ | |
150 | $code.=<<___ if ($flavour =~ /nubi/i); | |
151 | .frame $sp,6*$SZREG,$ra | |
152 | .mask 0x8000f008,-$SZREG | |
153 | .set noreorder | |
154 | $PTR_SUB $sp,6*$SZREG | |
155 | $REG_S $ra,5*$SZREG($sp) | |
156 | $REG_S $t3,4*$SZREG($sp) | |
157 | $REG_S $t2,3*$SZREG($sp) | |
158 | $REG_S $t1,2*$SZREG($sp) | |
159 | $REG_S $t0,1*$SZREG($sp) | |
160 | $REG_S $gp,0*$SZREG($sp) | |
161 | ___ | |
162 | $code.=<<___; | |
163 | .set reorder | |
164 | li $minus4,-4 | |
165 | and $ta0,$a2,$minus4 | |
da4d239d AP |
166 | beqz $ta0,.L_bn_mul_add_words_tail |
167 | ||
168 | .L_bn_mul_add_words_loop: | |
0c2adb0a | 169 | $LD $t0,0($a1) |
947716c1 | 170 | $MULTU ($t0,$a3) |
da4d239d AP |
171 | $LD $t1,0($a0) |
172 | $LD $t2,$BNSZ($a1) | |
173 | $LD $t3,$BNSZ($a0) | |
174 | $LD $ta0,2*$BNSZ($a1) | |
175 | $LD $ta1,2*$BNSZ($a0) | |
176 | $ADDU $t1,$v0 | |
177 | sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit | |
178 | # values", but it seems to work fine | |
179 | # even on 64-bit registers. | |
947716c1 AP |
180 | mflo ($at,$t0,$a3) |
181 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
182 | $ADDU $t1,$at |
183 | $ADDU $v0,$t0 | |
947716c1 | 184 | $MULTU ($t2,$a3) |
da4d239d AP |
185 | sltu $at,$t1,$at |
186 | $ST $t1,0($a0) | |
187 | $ADDU $v0,$at | |
188 | ||
189 | $LD $ta2,3*$BNSZ($a1) | |
190 | $LD $ta3,3*$BNSZ($a0) | |
191 | $ADDU $t3,$v0 | |
192 | sltu $v0,$t3,$v0 | |
947716c1 AP |
193 | mflo ($at,$t2,$a3) |
194 | mfhi ($t2,$t2,$a3) | |
da4d239d AP |
195 | $ADDU $t3,$at |
196 | $ADDU $v0,$t2 | |
947716c1 | 197 | $MULTU ($ta0,$a3) |
da4d239d AP |
198 | sltu $at,$t3,$at |
199 | $ST $t3,$BNSZ($a0) | |
200 | $ADDU $v0,$at | |
201 | ||
202 | subu $a2,4 | |
203 | $PTR_ADD $a0,4*$BNSZ | |
204 | $PTR_ADD $a1,4*$BNSZ | |
205 | $ADDU $ta1,$v0 | |
206 | sltu $v0,$ta1,$v0 | |
947716c1 AP |
207 | mflo ($at,$ta0,$a3) |
208 | mfhi ($ta0,$ta0,$a3) | |
da4d239d AP |
209 | $ADDU $ta1,$at |
210 | $ADDU $v0,$ta0 | |
947716c1 | 211 | $MULTU ($ta2,$a3) |
da4d239d AP |
212 | sltu $at,$ta1,$at |
213 | $ST $ta1,-2*$BNSZ($a0) | |
214 | $ADDU $v0,$at | |
215 | ||
216 | ||
217 | and $ta0,$a2,$minus4 | |
218 | $ADDU $ta3,$v0 | |
219 | sltu $v0,$ta3,$v0 | |
947716c1 AP |
220 | mflo ($at,$ta2,$a3) |
221 | mfhi ($ta2,$ta2,$a3) | |
da4d239d AP |
222 | $ADDU $ta3,$at |
223 | $ADDU $v0,$ta2 | |
224 | sltu $at,$ta3,$at | |
225 | $ST $ta3,-$BNSZ($a0) | |
da4d239d | 226 | .set noreorder |
0c2adb0a AP |
227 | bgtz $ta0,.L_bn_mul_add_words_loop |
228 | $ADDU $v0,$at | |
da4d239d AP |
229 | |
230 | beqz $a2,.L_bn_mul_add_words_return | |
231 | nop | |
232 | ||
233 | .L_bn_mul_add_words_tail: | |
234 | .set reorder | |
235 | $LD $t0,0($a1) | |
947716c1 | 236 | $MULTU ($t0,$a3) |
da4d239d AP |
237 | $LD $t1,0($a0) |
238 | subu $a2,1 | |
239 | $ADDU $t1,$v0 | |
240 | sltu $v0,$t1,$v0 | |
947716c1 AP |
241 | mflo ($at,$t0,$a3) |
242 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
243 | $ADDU $t1,$at |
244 | $ADDU $v0,$t0 | |
245 | sltu $at,$t1,$at | |
246 | $ST $t1,0($a0) | |
247 | $ADDU $v0,$at | |
248 | beqz $a2,.L_bn_mul_add_words_return | |
249 | ||
250 | $LD $t0,$BNSZ($a1) | |
947716c1 | 251 | $MULTU ($t0,$a3) |
da4d239d AP |
252 | $LD $t1,$BNSZ($a0) |
253 | subu $a2,1 | |
254 | $ADDU $t1,$v0 | |
255 | sltu $v0,$t1,$v0 | |
947716c1 AP |
256 | mflo ($at,$t0,$a3) |
257 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
258 | $ADDU $t1,$at |
259 | $ADDU $v0,$t0 | |
260 | sltu $at,$t1,$at | |
261 | $ST $t1,$BNSZ($a0) | |
262 | $ADDU $v0,$at | |
263 | beqz $a2,.L_bn_mul_add_words_return | |
264 | ||
265 | $LD $t0,2*$BNSZ($a1) | |
947716c1 | 266 | $MULTU ($t0,$a3) |
da4d239d AP |
267 | $LD $t1,2*$BNSZ($a0) |
268 | $ADDU $t1,$v0 | |
269 | sltu $v0,$t1,$v0 | |
947716c1 AP |
270 | mflo ($at,$t0,$a3) |
271 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
272 | $ADDU $t1,$at |
273 | $ADDU $v0,$t0 | |
274 | sltu $at,$t1,$at | |
275 | $ST $t1,2*$BNSZ($a0) | |
276 | $ADDU $v0,$at | |
277 | ||
278 | .L_bn_mul_add_words_return: | |
279 | .set noreorder | |
280 | ___ | |
281 | $code.=<<___ if ($flavour =~ /nubi/i); | |
282 | $REG_L $t3,4*$SZREG($sp) | |
283 | $REG_L $t2,3*$SZREG($sp) | |
284 | $REG_L $t1,2*$SZREG($sp) | |
285 | $REG_L $t0,1*$SZREG($sp) | |
286 | $REG_L $gp,0*$SZREG($sp) | |
287 | $PTR_ADD $sp,6*$SZREG | |
288 | ___ | |
289 | $code.=<<___; | |
290 | jr $ra | |
291 | move $a0,$v0 | |
66001268 | 292 | .end bn_mul_add_words_internal |
da4d239d AP |
293 | |
294 | .align 5 | |
295 | .globl bn_mul_words | |
296 | .ent bn_mul_words | |
297 | bn_mul_words: | |
298 | .set noreorder | |
299 | bgtz $a2,bn_mul_words_internal | |
300 | move $v0,$zero | |
301 | jr $ra | |
302 | move $a0,$v0 | |
303 | .end bn_mul_words | |
304 | ||
305 | .align 5 | |
306 | .ent bn_mul_words_internal | |
307 | bn_mul_words_internal: | |
308 | ___ | |
309 | $code.=<<___ if ($flavour =~ /nubi/i); | |
310 | .frame $sp,6*$SZREG,$ra | |
311 | .mask 0x8000f008,-$SZREG | |
312 | .set noreorder | |
313 | $PTR_SUB $sp,6*$SZREG | |
314 | $REG_S $ra,5*$SZREG($sp) | |
315 | $REG_S $t3,4*$SZREG($sp) | |
316 | $REG_S $t2,3*$SZREG($sp) | |
317 | $REG_S $t1,2*$SZREG($sp) | |
318 | $REG_S $t0,1*$SZREG($sp) | |
319 | $REG_S $gp,0*$SZREG($sp) | |
320 | ___ | |
321 | $code.=<<___; | |
322 | .set reorder | |
323 | li $minus4,-4 | |
324 | and $ta0,$a2,$minus4 | |
da4d239d AP |
325 | beqz $ta0,.L_bn_mul_words_tail |
326 | ||
327 | .L_bn_mul_words_loop: | |
0c2adb0a | 328 | $LD $t0,0($a1) |
947716c1 | 329 | $MULTU ($t0,$a3) |
da4d239d AP |
330 | $LD $t2,$BNSZ($a1) |
331 | $LD $ta0,2*$BNSZ($a1) | |
332 | $LD $ta2,3*$BNSZ($a1) | |
947716c1 AP |
333 | mflo ($at,$t0,$a3) |
334 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
335 | $ADDU $v0,$at |
336 | sltu $t1,$v0,$at | |
947716c1 | 337 | $MULTU ($t2,$a3) |
da4d239d AP |
338 | $ST $v0,0($a0) |
339 | $ADDU $v0,$t1,$t0 | |
340 | ||
341 | subu $a2,4 | |
342 | $PTR_ADD $a0,4*$BNSZ | |
343 | $PTR_ADD $a1,4*$BNSZ | |
947716c1 AP |
344 | mflo ($at,$t2,$a3) |
345 | mfhi ($t2,$t2,$a3) | |
da4d239d AP |
346 | $ADDU $v0,$at |
347 | sltu $t3,$v0,$at | |
947716c1 | 348 | $MULTU ($ta0,$a3) |
da4d239d AP |
349 | $ST $v0,-3*$BNSZ($a0) |
350 | $ADDU $v0,$t3,$t2 | |
351 | ||
947716c1 AP |
352 | mflo ($at,$ta0,$a3) |
353 | mfhi ($ta0,$ta0,$a3) | |
da4d239d AP |
354 | $ADDU $v0,$at |
355 | sltu $ta1,$v0,$at | |
947716c1 | 356 | $MULTU ($ta2,$a3) |
da4d239d AP |
357 | $ST $v0,-2*$BNSZ($a0) |
358 | $ADDU $v0,$ta1,$ta0 | |
359 | ||
360 | and $ta0,$a2,$minus4 | |
947716c1 AP |
361 | mflo ($at,$ta2,$a3) |
362 | mfhi ($ta2,$ta2,$a3) | |
da4d239d AP |
363 | $ADDU $v0,$at |
364 | sltu $ta3,$v0,$at | |
365 | $ST $v0,-$BNSZ($a0) | |
da4d239d | 366 | .set noreorder |
0c2adb0a AP |
367 | bgtz $ta0,.L_bn_mul_words_loop |
368 | $ADDU $v0,$ta3,$ta2 | |
da4d239d AP |
369 | |
370 | beqz $a2,.L_bn_mul_words_return | |
371 | nop | |
372 | ||
373 | .L_bn_mul_words_tail: | |
374 | .set reorder | |
375 | $LD $t0,0($a1) | |
947716c1 | 376 | $MULTU ($t0,$a3) |
da4d239d | 377 | subu $a2,1 |
947716c1 AP |
378 | mflo ($at,$t0,$a3) |
379 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
380 | $ADDU $v0,$at |
381 | sltu $t1,$v0,$at | |
382 | $ST $v0,0($a0) | |
383 | $ADDU $v0,$t1,$t0 | |
384 | beqz $a2,.L_bn_mul_words_return | |
385 | ||
386 | $LD $t0,$BNSZ($a1) | |
947716c1 | 387 | $MULTU ($t0,$a3) |
da4d239d | 388 | subu $a2,1 |
947716c1 AP |
389 | mflo ($at,$t0,$a3) |
390 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
391 | $ADDU $v0,$at |
392 | sltu $t1,$v0,$at | |
393 | $ST $v0,$BNSZ($a0) | |
394 | $ADDU $v0,$t1,$t0 | |
395 | beqz $a2,.L_bn_mul_words_return | |
396 | ||
397 | $LD $t0,2*$BNSZ($a1) | |
947716c1 AP |
398 | $MULTU ($t0,$a3) |
399 | mflo ($at,$t0,$a3) | |
400 | mfhi ($t0,$t0,$a3) | |
da4d239d AP |
401 | $ADDU $v0,$at |
402 | sltu $t1,$v0,$at | |
403 | $ST $v0,2*$BNSZ($a0) | |
404 | $ADDU $v0,$t1,$t0 | |
405 | ||
406 | .L_bn_mul_words_return: | |
407 | .set noreorder | |
408 | ___ | |
409 | $code.=<<___ if ($flavour =~ /nubi/i); | |
410 | $REG_L $t3,4*$SZREG($sp) | |
411 | $REG_L $t2,3*$SZREG($sp) | |
412 | $REG_L $t1,2*$SZREG($sp) | |
413 | $REG_L $t0,1*$SZREG($sp) | |
414 | $REG_L $gp,0*$SZREG($sp) | |
415 | $PTR_ADD $sp,6*$SZREG | |
416 | ___ | |
417 | $code.=<<___; | |
418 | jr $ra | |
419 | move $a0,$v0 | |
420 | .end bn_mul_words_internal | |
421 | ||
422 | .align 5 | |
423 | .globl bn_sqr_words | |
424 | .ent bn_sqr_words | |
425 | bn_sqr_words: | |
426 | .set noreorder | |
427 | bgtz $a2,bn_sqr_words_internal | |
428 | move $v0,$zero | |
429 | jr $ra | |
430 | move $a0,$v0 | |
431 | .end bn_sqr_words | |
432 | ||
433 | .align 5 | |
434 | .ent bn_sqr_words_internal | |
435 | bn_sqr_words_internal: | |
436 | ___ | |
437 | $code.=<<___ if ($flavour =~ /nubi/i); | |
438 | .frame $sp,6*$SZREG,$ra | |
439 | .mask 0x8000f008,-$SZREG | |
440 | .set noreorder | |
441 | $PTR_SUB $sp,6*$SZREG | |
442 | $REG_S $ra,5*$SZREG($sp) | |
443 | $REG_S $t3,4*$SZREG($sp) | |
444 | $REG_S $t2,3*$SZREG($sp) | |
445 | $REG_S $t1,2*$SZREG($sp) | |
446 | $REG_S $t0,1*$SZREG($sp) | |
447 | $REG_S $gp,0*$SZREG($sp) | |
448 | ___ | |
449 | $code.=<<___; | |
450 | .set reorder | |
451 | li $minus4,-4 | |
452 | and $ta0,$a2,$minus4 | |
da4d239d AP |
453 | beqz $ta0,.L_bn_sqr_words_tail |
454 | ||
455 | .L_bn_sqr_words_loop: | |
0c2adb0a | 456 | $LD $t0,0($a1) |
947716c1 | 457 | $MULTU ($t0,$t0) |
da4d239d AP |
458 | $LD $t2,$BNSZ($a1) |
459 | $LD $ta0,2*$BNSZ($a1) | |
460 | $LD $ta2,3*$BNSZ($a1) | |
947716c1 AP |
461 | mflo ($t1,$t0,$t0) |
462 | mfhi ($t0,$t0,$t0) | |
da4d239d AP |
463 | $ST $t1,0($a0) |
464 | $ST $t0,$BNSZ($a0) | |
465 | ||
947716c1 | 466 | $MULTU ($t2,$t2) |
da4d239d AP |
467 | subu $a2,4 |
468 | $PTR_ADD $a0,8*$BNSZ | |
469 | $PTR_ADD $a1,4*$BNSZ | |
947716c1 AP |
470 | mflo ($t3,$t2,$t2) |
471 | mfhi ($t2,$t2,$t2) | |
da4d239d AP |
472 | $ST $t3,-6*$BNSZ($a0) |
473 | $ST $t2,-5*$BNSZ($a0) | |
474 | ||
947716c1 AP |
475 | $MULTU ($ta0,$ta0) |
476 | mflo ($ta1,$ta0,$ta0) | |
477 | mfhi ($ta0,$ta0,$ta0) | |
da4d239d AP |
478 | $ST $ta1,-4*$BNSZ($a0) |
479 | $ST $ta0,-3*$BNSZ($a0) | |
480 | ||
481 | ||
947716c1 | 482 | $MULTU ($ta2,$ta2) |
da4d239d | 483 | and $ta0,$a2,$minus4 |
947716c1 AP |
484 | mflo ($ta3,$ta2,$ta2) |
485 | mfhi ($ta2,$ta2,$ta2) | |
da4d239d | 486 | $ST $ta3,-2*$BNSZ($a0) |
da4d239d AP |
487 | |
488 | .set noreorder | |
0c2adb0a AP |
489 | bgtz $ta0,.L_bn_sqr_words_loop |
490 | $ST $ta2,-$BNSZ($a0) | |
da4d239d AP |
491 | |
492 | beqz $a2,.L_bn_sqr_words_return | |
493 | nop | |
494 | ||
495 | .L_bn_sqr_words_tail: | |
496 | .set reorder | |
497 | $LD $t0,0($a1) | |
947716c1 | 498 | $MULTU ($t0,$t0) |
da4d239d | 499 | subu $a2,1 |
947716c1 AP |
500 | mflo ($t1,$t0,$t0) |
501 | mfhi ($t0,$t0,$t0) | |
da4d239d AP |
502 | $ST $t1,0($a0) |
503 | $ST $t0,$BNSZ($a0) | |
504 | beqz $a2,.L_bn_sqr_words_return | |
505 | ||
506 | $LD $t0,$BNSZ($a1) | |
947716c1 | 507 | $MULTU ($t0,$t0) |
da4d239d | 508 | subu $a2,1 |
947716c1 AP |
509 | mflo ($t1,$t0,$t0) |
510 | mfhi ($t0,$t0,$t0) | |
da4d239d AP |
511 | $ST $t1,2*$BNSZ($a0) |
512 | $ST $t0,3*$BNSZ($a0) | |
513 | beqz $a2,.L_bn_sqr_words_return | |
514 | ||
515 | $LD $t0,2*$BNSZ($a1) | |
947716c1 AP |
516 | $MULTU ($t0,$t0) |
517 | mflo ($t1,$t0,$t0) | |
518 | mfhi ($t0,$t0,$t0) | |
da4d239d AP |
519 | $ST $t1,4*$BNSZ($a0) |
520 | $ST $t0,5*$BNSZ($a0) | |
521 | ||
522 | .L_bn_sqr_words_return: | |
523 | .set noreorder | |
524 | ___ | |
525 | $code.=<<___ if ($flavour =~ /nubi/i); | |
526 | $REG_L $t3,4*$SZREG($sp) | |
527 | $REG_L $t2,3*$SZREG($sp) | |
528 | $REG_L $t1,2*$SZREG($sp) | |
529 | $REG_L $t0,1*$SZREG($sp) | |
530 | $REG_L $gp,0*$SZREG($sp) | |
531 | $PTR_ADD $sp,6*$SZREG | |
532 | ___ | |
533 | $code.=<<___; | |
534 | jr $ra | |
535 | move $a0,$v0 | |
536 | ||
537 | .end bn_sqr_words_internal | |
538 | ||
539 | .align 5 | |
540 | .globl bn_add_words | |
541 | .ent bn_add_words | |
542 | bn_add_words: | |
543 | .set noreorder | |
544 | bgtz $a3,bn_add_words_internal | |
545 | move $v0,$zero | |
546 | jr $ra | |
547 | move $a0,$v0 | |
548 | .end bn_add_words | |
549 | ||
550 | .align 5 | |
551 | .ent bn_add_words_internal | |
552 | bn_add_words_internal: | |
553 | ___ | |
554 | $code.=<<___ if ($flavour =~ /nubi/i); | |
555 | .frame $sp,6*$SZREG,$ra | |
556 | .mask 0x8000f008,-$SZREG | |
557 | .set noreorder | |
558 | $PTR_SUB $sp,6*$SZREG | |
559 | $REG_S $ra,5*$SZREG($sp) | |
560 | $REG_S $t3,4*$SZREG($sp) | |
561 | $REG_S $t2,3*$SZREG($sp) | |
562 | $REG_S $t1,2*$SZREG($sp) | |
563 | $REG_S $t0,1*$SZREG($sp) | |
564 | $REG_S $gp,0*$SZREG($sp) | |
565 | ___ | |
566 | $code.=<<___; | |
567 | .set reorder | |
568 | li $minus4,-4 | |
569 | and $at,$a3,$minus4 | |
da4d239d AP |
570 | beqz $at,.L_bn_add_words_tail |
571 | ||
572 | .L_bn_add_words_loop: | |
0c2adb0a | 573 | $LD $t0,0($a1) |
da4d239d AP |
574 | $LD $ta0,0($a2) |
575 | subu $a3,4 | |
576 | $LD $t1,$BNSZ($a1) | |
577 | and $at,$a3,$minus4 | |
578 | $LD $t2,2*$BNSZ($a1) | |
579 | $PTR_ADD $a2,4*$BNSZ | |
580 | $LD $t3,3*$BNSZ($a1) | |
581 | $PTR_ADD $a0,4*$BNSZ | |
582 | $LD $ta1,-3*$BNSZ($a2) | |
583 | $PTR_ADD $a1,4*$BNSZ | |
584 | $LD $ta2,-2*$BNSZ($a2) | |
585 | $LD $ta3,-$BNSZ($a2) | |
586 | $ADDU $ta0,$t0 | |
587 | sltu $t8,$ta0,$t0 | |
588 | $ADDU $t0,$ta0,$v0 | |
589 | sltu $v0,$t0,$ta0 | |
590 | $ST $t0,-4*$BNSZ($a0) | |
591 | $ADDU $v0,$t8 | |
592 | ||
593 | $ADDU $ta1,$t1 | |
594 | sltu $t9,$ta1,$t1 | |
595 | $ADDU $t1,$ta1,$v0 | |
596 | sltu $v0,$t1,$ta1 | |
597 | $ST $t1,-3*$BNSZ($a0) | |
598 | $ADDU $v0,$t9 | |
599 | ||
600 | $ADDU $ta2,$t2 | |
601 | sltu $t8,$ta2,$t2 | |
602 | $ADDU $t2,$ta2,$v0 | |
603 | sltu $v0,$t2,$ta2 | |
604 | $ST $t2,-2*$BNSZ($a0) | |
605 | $ADDU $v0,$t8 | |
609b0852 | 606 | |
da4d239d AP |
607 | $ADDU $ta3,$t3 |
608 | sltu $t9,$ta3,$t3 | |
609 | $ADDU $t3,$ta3,$v0 | |
610 | sltu $v0,$t3,$ta3 | |
611 | $ST $t3,-$BNSZ($a0) | |
609b0852 | 612 | |
da4d239d | 613 | .set noreorder |
0c2adb0a AP |
614 | bgtz $at,.L_bn_add_words_loop |
615 | $ADDU $v0,$t9 | |
da4d239d AP |
616 | |
617 | beqz $a3,.L_bn_add_words_return | |
618 | nop | |
619 | ||
620 | .L_bn_add_words_tail: | |
621 | .set reorder | |
622 | $LD $t0,0($a1) | |
623 | $LD $ta0,0($a2) | |
624 | $ADDU $ta0,$t0 | |
625 | subu $a3,1 | |
626 | sltu $t8,$ta0,$t0 | |
627 | $ADDU $t0,$ta0,$v0 | |
628 | sltu $v0,$t0,$ta0 | |
629 | $ST $t0,0($a0) | |
630 | $ADDU $v0,$t8 | |
631 | beqz $a3,.L_bn_add_words_return | |
632 | ||
633 | $LD $t1,$BNSZ($a1) | |
634 | $LD $ta1,$BNSZ($a2) | |
635 | $ADDU $ta1,$t1 | |
636 | subu $a3,1 | |
637 | sltu $t9,$ta1,$t1 | |
638 | $ADDU $t1,$ta1,$v0 | |
639 | sltu $v0,$t1,$ta1 | |
640 | $ST $t1,$BNSZ($a0) | |
641 | $ADDU $v0,$t9 | |
642 | beqz $a3,.L_bn_add_words_return | |
643 | ||
644 | $LD $t2,2*$BNSZ($a1) | |
645 | $LD $ta2,2*$BNSZ($a2) | |
646 | $ADDU $ta2,$t2 | |
647 | sltu $t8,$ta2,$t2 | |
648 | $ADDU $t2,$ta2,$v0 | |
649 | sltu $v0,$t2,$ta2 | |
650 | $ST $t2,2*$BNSZ($a0) | |
651 | $ADDU $v0,$t8 | |
652 | ||
653 | .L_bn_add_words_return: | |
654 | .set noreorder | |
655 | ___ | |
656 | $code.=<<___ if ($flavour =~ /nubi/i); | |
657 | $REG_L $t3,4*$SZREG($sp) | |
658 | $REG_L $t2,3*$SZREG($sp) | |
659 | $REG_L $t1,2*$SZREG($sp) | |
660 | $REG_L $t0,1*$SZREG($sp) | |
661 | $REG_L $gp,0*$SZREG($sp) | |
662 | $PTR_ADD $sp,6*$SZREG | |
663 | ___ | |
664 | $code.=<<___; | |
665 | jr $ra | |
666 | move $a0,$v0 | |
667 | ||
668 | .end bn_add_words_internal | |
669 | ||
670 | .align 5 | |
671 | .globl bn_sub_words | |
672 | .ent bn_sub_words | |
673 | bn_sub_words: | |
674 | .set noreorder | |
675 | bgtz $a3,bn_sub_words_internal | |
676 | move $v0,$zero | |
677 | jr $ra | |
678 | move $a0,$zero | |
679 | .end bn_sub_words | |
680 | ||
681 | .align 5 | |
682 | .ent bn_sub_words_internal | |
683 | bn_sub_words_internal: | |
684 | ___ | |
685 | $code.=<<___ if ($flavour =~ /nubi/i); | |
686 | .frame $sp,6*$SZREG,$ra | |
687 | .mask 0x8000f008,-$SZREG | |
688 | .set noreorder | |
689 | $PTR_SUB $sp,6*$SZREG | |
690 | $REG_S $ra,5*$SZREG($sp) | |
691 | $REG_S $t3,4*$SZREG($sp) | |
692 | $REG_S $t2,3*$SZREG($sp) | |
693 | $REG_S $t1,2*$SZREG($sp) | |
694 | $REG_S $t0,1*$SZREG($sp) | |
695 | $REG_S $gp,0*$SZREG($sp) | |
696 | ___ | |
697 | $code.=<<___; | |
698 | .set reorder | |
699 | li $minus4,-4 | |
700 | and $at,$a3,$minus4 | |
da4d239d AP |
701 | beqz $at,.L_bn_sub_words_tail |
702 | ||
703 | .L_bn_sub_words_loop: | |
0c2adb0a | 704 | $LD $t0,0($a1) |
da4d239d AP |
705 | $LD $ta0,0($a2) |
706 | subu $a3,4 | |
707 | $LD $t1,$BNSZ($a1) | |
708 | and $at,$a3,$minus4 | |
709 | $LD $t2,2*$BNSZ($a1) | |
710 | $PTR_ADD $a2,4*$BNSZ | |
711 | $LD $t3,3*$BNSZ($a1) | |
712 | $PTR_ADD $a0,4*$BNSZ | |
713 | $LD $ta1,-3*$BNSZ($a2) | |
714 | $PTR_ADD $a1,4*$BNSZ | |
715 | $LD $ta2,-2*$BNSZ($a2) | |
716 | $LD $ta3,-$BNSZ($a2) | |
717 | sltu $t8,$t0,$ta0 | |
718 | $SUBU $ta0,$t0,$ta0 | |
719 | $SUBU $t0,$ta0,$v0 | |
720 | sgtu $v0,$t0,$ta0 | |
721 | $ST $t0,-4*$BNSZ($a0) | |
722 | $ADDU $v0,$t8 | |
723 | ||
724 | sltu $t9,$t1,$ta1 | |
725 | $SUBU $ta1,$t1,$ta1 | |
726 | $SUBU $t1,$ta1,$v0 | |
727 | sgtu $v0,$t1,$ta1 | |
728 | $ST $t1,-3*$BNSZ($a0) | |
729 | $ADDU $v0,$t9 | |
730 | ||
731 | ||
732 | sltu $t8,$t2,$ta2 | |
733 | $SUBU $ta2,$t2,$ta2 | |
734 | $SUBU $t2,$ta2,$v0 | |
735 | sgtu $v0,$t2,$ta2 | |
736 | $ST $t2,-2*$BNSZ($a0) | |
737 | $ADDU $v0,$t8 | |
738 | ||
739 | sltu $t9,$t3,$ta3 | |
740 | $SUBU $ta3,$t3,$ta3 | |
741 | $SUBU $t3,$ta3,$v0 | |
742 | sgtu $v0,$t3,$ta3 | |
743 | $ST $t3,-$BNSZ($a0) | |
da4d239d AP |
744 | |
745 | .set noreorder | |
0c2adb0a AP |
746 | bgtz $at,.L_bn_sub_words_loop |
747 | $ADDU $v0,$t9 | |
da4d239d AP |
748 | |
749 | beqz $a3,.L_bn_sub_words_return | |
750 | nop | |
751 | ||
752 | .L_bn_sub_words_tail: | |
753 | .set reorder | |
754 | $LD $t0,0($a1) | |
755 | $LD $ta0,0($a2) | |
756 | subu $a3,1 | |
757 | sltu $t8,$t0,$ta0 | |
758 | $SUBU $ta0,$t0,$ta0 | |
759 | $SUBU $t0,$ta0,$v0 | |
760 | sgtu $v0,$t0,$ta0 | |
761 | $ST $t0,0($a0) | |
762 | $ADDU $v0,$t8 | |
763 | beqz $a3,.L_bn_sub_words_return | |
764 | ||
765 | $LD $t1,$BNSZ($a1) | |
766 | subu $a3,1 | |
767 | $LD $ta1,$BNSZ($a2) | |
768 | sltu $t9,$t1,$ta1 | |
769 | $SUBU $ta1,$t1,$ta1 | |
770 | $SUBU $t1,$ta1,$v0 | |
771 | sgtu $v0,$t1,$ta1 | |
772 | $ST $t1,$BNSZ($a0) | |
773 | $ADDU $v0,$t9 | |
774 | beqz $a3,.L_bn_sub_words_return | |
775 | ||
776 | $LD $t2,2*$BNSZ($a1) | |
777 | $LD $ta2,2*$BNSZ($a2) | |
778 | sltu $t8,$t2,$ta2 | |
779 | $SUBU $ta2,$t2,$ta2 | |
780 | $SUBU $t2,$ta2,$v0 | |
781 | sgtu $v0,$t2,$ta2 | |
782 | $ST $t2,2*$BNSZ($a0) | |
783 | $ADDU $v0,$t8 | |
784 | ||
785 | .L_bn_sub_words_return: | |
786 | .set noreorder | |
787 | ___ | |
788 | $code.=<<___ if ($flavour =~ /nubi/i); | |
789 | $REG_L $t3,4*$SZREG($sp) | |
790 | $REG_L $t2,3*$SZREG($sp) | |
791 | $REG_L $t1,2*$SZREG($sp) | |
792 | $REG_L $t0,1*$SZREG($sp) | |
793 | $REG_L $gp,0*$SZREG($sp) | |
794 | $PTR_ADD $sp,6*$SZREG | |
795 | ___ | |
796 | $code.=<<___; | |
797 | jr $ra | |
798 | move $a0,$v0 | |
66001268 | 799 | .end bn_sub_words_internal |
da4d239d AP |
800 | |
801 | .align 5 | |
802 | .globl bn_div_3_words | |
803 | .ent bn_div_3_words | |
804 | bn_div_3_words: | |
805 | .set noreorder | |
806 | move $a3,$a0 # we know that bn_div_words does not | |
807 | # touch $a3, $ta2, $ta3 and preserves $a2 | |
808 | # so that we can save two arguments | |
809 | # and return address in registers | |
810 | # instead of stack:-) | |
609b0852 | 811 | |
da4d239d AP |
812 | $LD $a0,($a3) |
813 | move $ta2,$a1 | |
814 | bne $a0,$a2,bn_div_3_words_internal | |
815 | $LD $a1,-$BNSZ($a3) | |
816 | li $v0,-1 | |
817 | jr $ra | |
818 | move $a0,$v0 | |
819 | .end bn_div_3_words | |
820 | ||
821 | .align 5 | |
822 | .ent bn_div_3_words_internal | |
823 | bn_div_3_words_internal: | |
824 | ___ | |
825 | $code.=<<___ if ($flavour =~ /nubi/i); | |
826 | .frame $sp,6*$SZREG,$ra | |
827 | .mask 0x8000f008,-$SZREG | |
828 | .set noreorder | |
829 | $PTR_SUB $sp,6*$SZREG | |
830 | $REG_S $ra,5*$SZREG($sp) | |
831 | $REG_S $t3,4*$SZREG($sp) | |
832 | $REG_S $t2,3*$SZREG($sp) | |
833 | $REG_S $t1,2*$SZREG($sp) | |
834 | $REG_S $t0,1*$SZREG($sp) | |
835 | $REG_S $gp,0*$SZREG($sp) | |
836 | ___ | |
837 | $code.=<<___; | |
838 | .set reorder | |
839 | move $ta3,$ra | |
543fd854 | 840 | bal bn_div_words_internal |
da4d239d | 841 | move $ra,$ta3 |
947716c1 | 842 | $MULTU ($ta2,$v0) |
da4d239d AP |
843 | $LD $t2,-2*$BNSZ($a3) |
844 | move $ta0,$zero | |
947716c1 AP |
845 | mfhi ($t1,$ta2,$v0) |
846 | mflo ($t0,$ta2,$v0) | |
da4d239d AP |
847 | sltu $t8,$t1,$a1 |
848 | .L_bn_div_3_words_inner_loop: | |
849 | bnez $t8,.L_bn_div_3_words_inner_loop_done | |
850 | sgeu $at,$t2,$t0 | |
851 | seq $t9,$t1,$a1 | |
852 | and $at,$t9 | |
853 | sltu $t3,$t0,$ta2 | |
854 | $ADDU $a1,$a2 | |
855 | $SUBU $t1,$t3 | |
856 | $SUBU $t0,$ta2 | |
857 | sltu $t8,$t1,$a1 | |
858 | sltu $ta0,$a1,$a2 | |
859 | or $t8,$ta0 | |
860 | .set noreorder | |
0c2adb0a | 861 | beqz $at,.L_bn_div_3_words_inner_loop |
da4d239d | 862 | $SUBU $v0,1 |
0c2adb0a | 863 | $ADDU $v0,1 |
da4d239d AP |
864 | .set reorder |
865 | .L_bn_div_3_words_inner_loop_done: | |
866 | .set noreorder | |
867 | ___ | |
868 | $code.=<<___ if ($flavour =~ /nubi/i); | |
869 | $REG_L $t3,4*$SZREG($sp) | |
870 | $REG_L $t2,3*$SZREG($sp) | |
871 | $REG_L $t1,2*$SZREG($sp) | |
872 | $REG_L $t0,1*$SZREG($sp) | |
873 | $REG_L $gp,0*$SZREG($sp) | |
874 | $PTR_ADD $sp,6*$SZREG | |
875 | ___ | |
876 | $code.=<<___; | |
877 | jr $ra | |
878 | move $a0,$v0 | |
879 | .end bn_div_3_words_internal | |
880 | ||
881 | .align 5 | |
882 | .globl bn_div_words | |
883 | .ent bn_div_words | |
884 | bn_div_words: | |
885 | .set noreorder | |
886 | bnez $a2,bn_div_words_internal | |
887 | li $v0,-1 # I would rather signal div-by-zero | |
888 | # which can be done with 'break 7' | |
889 | jr $ra | |
890 | move $a0,$v0 | |
891 | .end bn_div_words | |
892 | ||
893 | .align 5 | |
894 | .ent bn_div_words_internal | |
895 | bn_div_words_internal: | |
896 | ___ | |
897 | $code.=<<___ if ($flavour =~ /nubi/i); | |
898 | .frame $sp,6*$SZREG,$ra | |
899 | .mask 0x8000f008,-$SZREG | |
900 | .set noreorder | |
901 | $PTR_SUB $sp,6*$SZREG | |
902 | $REG_S $ra,5*$SZREG($sp) | |
903 | $REG_S $t3,4*$SZREG($sp) | |
904 | $REG_S $t2,3*$SZREG($sp) | |
905 | $REG_S $t1,2*$SZREG($sp) | |
906 | $REG_S $t0,1*$SZREG($sp) | |
907 | $REG_S $gp,0*$SZREG($sp) | |
908 | ___ | |
909 | $code.=<<___; | |
910 | move $v1,$zero | |
911 | bltz $a2,.L_bn_div_words_body | |
912 | move $t9,$v1 | |
913 | $SLL $a2,1 | |
914 | bgtz $a2,.-4 | |
915 | addu $t9,1 | |
916 | ||
917 | .set reorder | |
918 | negu $t1,$t9 | |
919 | li $t2,-1 | |
920 | $SLL $t2,$t1 | |
921 | and $t2,$a0 | |
922 | $SRL $at,$a1,$t1 | |
923 | .set noreorder | |
0c2adb0a AP |
924 | beqz $t2,.+12 |
925 | nop | |
da4d239d AP |
926 | break 6 # signal overflow |
927 | .set reorder | |
928 | $SLL $a0,$t9 | |
929 | $SLL $a1,$t9 | |
930 | or $a0,$at | |
931 | ___ | |
932 | $QT=$ta0; | |
933 | $HH=$ta1; | |
934 | $DH=$v1; | |
935 | $code.=<<___; | |
936 | .L_bn_div_words_body: | |
937 | $SRL $DH,$a2,4*$BNSZ # bits | |
938 | sgeu $at,$a0,$a2 | |
939 | .set noreorder | |
0c2adb0a AP |
940 | beqz $at,.+12 |
941 | nop | |
da4d239d AP |
942 | $SUBU $a0,$a2 |
943 | .set reorder | |
944 | ||
945 | li $QT,-1 | |
946 | $SRL $HH,$a0,4*$BNSZ # bits | |
947 | $SRL $QT,4*$BNSZ # q=0xffffffff | |
948 | beq $DH,$HH,.L_bn_div_words_skip_div1 | |
947716c1 AP |
949 | $DIVU ($a0,$DH) |
950 | mfqt ($QT,$a0,$DH) | |
da4d239d | 951 | .L_bn_div_words_skip_div1: |
947716c1 | 952 | $MULTU ($a2,$QT) |
da4d239d AP |
953 | $SLL $t3,$a0,4*$BNSZ # bits |
954 | $SRL $at,$a1,4*$BNSZ # bits | |
955 | or $t3,$at | |
947716c1 AP |
956 | mflo ($t0,$a2,$QT) |
957 | mfhi ($t1,$a2,$QT) | |
da4d239d AP |
958 | .L_bn_div_words_inner_loop1: |
959 | sltu $t2,$t3,$t0 | |
960 | seq $t8,$HH,$t1 | |
961 | sltu $at,$HH,$t1 | |
962 | and $t2,$t8 | |
963 | sltu $v0,$t0,$a2 | |
964 | or $at,$t2 | |
965 | .set noreorder | |
966 | beqz $at,.L_bn_div_words_inner_loop1_done | |
967 | $SUBU $t1,$v0 | |
968 | $SUBU $t0,$a2 | |
969 | b .L_bn_div_words_inner_loop1 | |
970 | $SUBU $QT,1 | |
971 | .set reorder | |
972 | .L_bn_div_words_inner_loop1_done: | |
973 | ||
974 | $SLL $a1,4*$BNSZ # bits | |
975 | $SUBU $a0,$t3,$t0 | |
976 | $SLL $v0,$QT,4*$BNSZ # bits | |
977 | ||
978 | li $QT,-1 | |
979 | $SRL $HH,$a0,4*$BNSZ # bits | |
980 | $SRL $QT,4*$BNSZ # q=0xffffffff | |
981 | beq $DH,$HH,.L_bn_div_words_skip_div2 | |
947716c1 AP |
982 | $DIVU ($a0,$DH) |
983 | mfqt ($QT,$a0,$DH) | |
da4d239d | 984 | .L_bn_div_words_skip_div2: |
947716c1 | 985 | $MULTU ($a2,$QT) |
da4d239d AP |
986 | $SLL $t3,$a0,4*$BNSZ # bits |
987 | $SRL $at,$a1,4*$BNSZ # bits | |
988 | or $t3,$at | |
947716c1 AP |
989 | mflo ($t0,$a2,$QT) |
990 | mfhi ($t1,$a2,$QT) | |
da4d239d AP |
991 | .L_bn_div_words_inner_loop2: |
992 | sltu $t2,$t3,$t0 | |
993 | seq $t8,$HH,$t1 | |
994 | sltu $at,$HH,$t1 | |
995 | and $t2,$t8 | |
996 | sltu $v1,$t0,$a2 | |
997 | or $at,$t2 | |
998 | .set noreorder | |
999 | beqz $at,.L_bn_div_words_inner_loop2_done | |
1000 | $SUBU $t1,$v1 | |
1001 | $SUBU $t0,$a2 | |
1002 | b .L_bn_div_words_inner_loop2 | |
1003 | $SUBU $QT,1 | |
1004 | .set reorder | |
1005 | .L_bn_div_words_inner_loop2_done: | |
1006 | ||
1007 | $SUBU $a0,$t3,$t0 | |
1008 | or $v0,$QT | |
1009 | $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it | |
1010 | $SRL $a2,$t9 # restore $a2 | |
1011 | ||
1012 | .set noreorder | |
1013 | move $a1,$v1 | |
1014 | ___ | |
1015 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1016 | $REG_L $t3,4*$SZREG($sp) | |
1017 | $REG_L $t2,3*$SZREG($sp) | |
1018 | $REG_L $t1,2*$SZREG($sp) | |
1019 | $REG_L $t0,1*$SZREG($sp) | |
1020 | $REG_L $gp,0*$SZREG($sp) | |
1021 | $PTR_ADD $sp,6*$SZREG | |
1022 | ___ | |
1023 | $code.=<<___; | |
1024 | jr $ra | |
1025 | move $a0,$v0 | |
1026 | .end bn_div_words_internal | |
1027 | ___ | |
1028 | undef $HH; undef $QT; undef $DH; | |
1029 | ||
1030 | ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); | |
1031 | ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); | |
1032 | ||
1033 | ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 | |
1034 | ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 | |
1035 | ||
1036 | ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); | |
1037 | ||
# bn_mul_comba8: emit the entry point (.align/.globl/.ent and the label) and
# switch to .set noreorder for the prologue.  The body generated after this
# prologue loads eight words through each of $a1 and $a2 and stores sixteen
# result words through $a0.
1038 | $code.=<<___; | |
1039 | |
1040 | .align 5 | |
1041 | .globl bn_mul_comba8 | |
1042 | .ent bn_mul_comba8 | |
1043 | bn_mul_comba8: | |
1044 | .set noreorder | |
1045 | ___ | |
# NUBI flavours: 12-slot frame that saves $ra, $s0-$s5, $t0-$t3 and $gp.
1046 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1047 | .frame $sp,12*$SZREG,$ra | |
1048 | .mask 0x803ff008,-$SZREG | |
1049 | $PTR_SUB $sp,12*$SZREG | |
1050 | $REG_S $ra,11*$SZREG($sp) | |
1051 | $REG_S $s5,10*$SZREG($sp) | |
1052 | $REG_S $s4,9*$SZREG($sp) | |
1053 | $REG_S $s3,8*$SZREG($sp) | |
1054 | $REG_S $s2,7*$SZREG($sp) | |
1055 | $REG_S $s1,6*$SZREG($sp) | |
1056 | $REG_S $s0,5*$SZREG($sp) | |
1057 | $REG_S $t3,4*$SZREG($sp) | |
1058 | $REG_S $t2,3*$SZREG($sp) | |
1059 | $REG_S $t1,2*$SZREG($sp) | |
1060 | $REG_S $t0,1*$SZREG($sp) | |
1061 | $REG_S $gp,0*$SZREG($sp) | |
1062 | ___ | |
# All other flavours: 6-slot frame that saves only $s0-$s5.
1063 | $code.=<<___ if ($flavour !~ /nubi/i); | |
1064 | .frame $sp,6*$SZREG,$ra | |
1065 | .mask 0x003f0000,-$SZREG | |
1066 | $PTR_SUB $sp,6*$SZREG | |
1067 | $REG_S $s5,5*$SZREG($sp) | |
1068 | $REG_S $s4,4*$SZREG($sp) | |
1069 | $REG_S $s3,3*$SZREG($sp) | |
1070 | $REG_S $s2,2*$SZREG($sp) | |
1071 | $REG_S $s1,1*$SZREG($sp) | |
1072 | $REG_S $s0,0*$SZREG($sp) | |
1073 | ___ | |
1074 | $code.=<<___; | |
1075 | ||
1076 | .set reorder | |
1077 | $LD $a_0,0($a1) # If compiled with -mips3 option on | |
1078 | # R5000 box assembler barks on this | |
1079 | # 1ine with "should not have mult/div | |
1080 | # as last instruction in bb (R10K | |
1081 | # bug)" warning. If anybody out there | |
1082 | # has a clue about how to circumvent | |
1083 | # this do send me a note. | |
1084 | # <appro\@fy.chalmers.se> | |
1085 | ||
1086 | $LD $b_0,0($a2) | |
1087 | $LD $a_1,$BNSZ($a1) | |
1088 | $LD $a_2,2*$BNSZ($a1) | |
947716c1 | 1089 | $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); |
da4d239d AP |
1090 | $LD $a_3,3*$BNSZ($a1) |
1091 | $LD $b_1,$BNSZ($a2) | |
1092 | $LD $b_2,2*$BNSZ($a2) | |
1093 | $LD $b_3,3*$BNSZ($a2) | |
947716c1 AP |
1094 | mflo ($c_1,$a_0,$b_0) |
1095 | mfhi ($c_2,$a_0,$b_0) | |
da4d239d AP |
1096 | |
1097 | $LD $a_4,4*$BNSZ($a1) | |
1098 | $LD $a_5,5*$BNSZ($a1) | |
947716c1 | 1099 | $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); |
da4d239d AP |
1100 | $LD $a_6,6*$BNSZ($a1) |
1101 | $LD $a_7,7*$BNSZ($a1) | |
1102 | $LD $b_4,4*$BNSZ($a2) | |
1103 | $LD $b_5,5*$BNSZ($a2) | |
947716c1 AP |
1104 | mflo ($t_1,$a_0,$b_1) |
1105 | mfhi ($t_2,$a_0,$b_1) | |
da4d239d AP |
1106 | $ADDU $c_2,$t_1 |
1107 | sltu $at,$c_2,$t_1 | |
947716c1 | 1108 | $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); |
da4d239d AP |
1109 | $ADDU $c_3,$t_2,$at |
1110 | $LD $b_6,6*$BNSZ($a2) | |
1111 | $LD $b_7,7*$BNSZ($a2) | |
1112 | $ST $c_1,0($a0) # r[0]=c1; | |
947716c1 AP |
1113 | mflo ($t_1,$a_1,$b_0) |
1114 | mfhi ($t_2,$a_1,$b_0) | |
da4d239d AP |
1115 | $ADDU $c_2,$t_1 |
1116 | sltu $at,$c_2,$t_1 | |
947716c1 | 1117 | $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); |
da4d239d AP |
1118 | $ADDU $t_2,$at |
1119 | $ADDU $c_3,$t_2 | |
1120 | sltu $c_1,$c_3,$t_2 | |
1121 | $ST $c_2,$BNSZ($a0) # r[1]=c2; | |
1122 | ||
947716c1 AP |
1123 | mflo ($t_1,$a_2,$b_0) |
1124 | mfhi ($t_2,$a_2,$b_0) | |
da4d239d AP |
1125 | $ADDU $c_3,$t_1 |
1126 | sltu $at,$c_3,$t_1 | |
947716c1 | 1127 | $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); |
da4d239d AP |
1128 | $ADDU $t_2,$at |
1129 | $ADDU $c_1,$t_2 | |
947716c1 AP |
1130 | mflo ($t_1,$a_1,$b_1) |
1131 | mfhi ($t_2,$a_1,$b_1) | |
da4d239d AP |
1132 | $ADDU $c_3,$t_1 |
1133 | sltu $at,$c_3,$t_1 | |
947716c1 | 1134 | $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); |
da4d239d AP |
1135 | $ADDU $t_2,$at |
1136 | $ADDU $c_1,$t_2 | |
1137 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1138 | mflo ($t_1,$a_0,$b_2) |
1139 | mfhi ($t_2,$a_0,$b_2) | |
da4d239d AP |
1140 | $ADDU $c_3,$t_1 |
1141 | sltu $at,$c_3,$t_1 | |
947716c1 | 1142 | $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); |
da4d239d AP |
1143 | $ADDU $t_2,$at |
1144 | $ADDU $c_1,$t_2 | |
1145 | sltu $at,$c_1,$t_2 | |
1146 | $ADDU $c_2,$at | |
1147 | $ST $c_3,2*$BNSZ($a0) # r[2]=c3; | |
1148 | ||
947716c1 AP |
1149 | mflo ($t_1,$a_0,$b_3) |
1150 | mfhi ($t_2,$a_0,$b_3) | |
da4d239d AP |
1151 | $ADDU $c_1,$t_1 |
1152 | sltu $at,$c_1,$t_1 | |
947716c1 | 1153 | $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); |
da4d239d AP |
1154 | $ADDU $t_2,$at |
1155 | $ADDU $c_2,$t_2 | |
1156 | sltu $c_3,$c_2,$t_2 | |
947716c1 AP |
1157 | mflo ($t_1,$a_1,$b_2) |
1158 | mfhi ($t_2,$a_1,$b_2) | |
da4d239d AP |
1159 | $ADDU $c_1,$t_1 |
1160 | sltu $at,$c_1,$t_1 | |
947716c1 | 1161 | $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); |
da4d239d AP |
1162 | $ADDU $t_2,$at |
1163 | $ADDU $c_2,$t_2 | |
1164 | sltu $at,$c_2,$t_2 | |
1165 | $ADDU $c_3,$at | |
947716c1 AP |
1166 | mflo ($t_1,$a_2,$b_1) |
1167 | mfhi ($t_2,$a_2,$b_1) | |
da4d239d AP |
1168 | $ADDU $c_1,$t_1 |
1169 | sltu $at,$c_1,$t_1 | |
947716c1 | 1170 | $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); |
da4d239d AP |
1171 | $ADDU $t_2,$at |
1172 | $ADDU $c_2,$t_2 | |
1173 | sltu $at,$c_2,$t_2 | |
1174 | $ADDU $c_3,$at | |
947716c1 AP |
1175 | mflo ($t_1,$a_3,$b_0) |
1176 | mfhi ($t_2,$a_3,$b_0) | |
da4d239d AP |
1177 | $ADDU $c_1,$t_1 |
1178 | sltu $at,$c_1,$t_1 | |
947716c1 | 1179 | $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); |
da4d239d AP |
1180 | $ADDU $t_2,$at |
1181 | $ADDU $c_2,$t_2 | |
1182 | sltu $at,$c_2,$t_2 | |
1183 | $ADDU $c_3,$at | |
1184 | $ST $c_1,3*$BNSZ($a0) # r[3]=c1; | |
1185 | ||
947716c1 AP |
1186 | mflo ($t_1,$a_4,$b_0) |
1187 | mfhi ($t_2,$a_4,$b_0) | |
da4d239d AP |
1188 | $ADDU $c_2,$t_1 |
1189 | sltu $at,$c_2,$t_1 | |
947716c1 | 1190 | $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); |
da4d239d AP |
1191 | $ADDU $t_2,$at |
1192 | $ADDU $c_3,$t_2 | |
1193 | sltu $c_1,$c_3,$t_2 | |
947716c1 AP |
1194 | mflo ($t_1,$a_3,$b_1) |
1195 | mfhi ($t_2,$a_3,$b_1) | |
da4d239d AP |
1196 | $ADDU $c_2,$t_1 |
1197 | sltu $at,$c_2,$t_1 | |
947716c1 | 1198 | $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); |
da4d239d AP |
1199 | $ADDU $t_2,$at |
1200 | $ADDU $c_3,$t_2 | |
1201 | sltu $at,$c_3,$t_2 | |
1202 | $ADDU $c_1,$at | |
947716c1 AP |
1203 | mflo ($t_1,$a_2,$b_2) |
1204 | mfhi ($t_2,$a_2,$b_2) | |
da4d239d AP |
1205 | $ADDU $c_2,$t_1 |
1206 | sltu $at,$c_2,$t_1 | |
947716c1 | 1207 | $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); |
da4d239d AP |
1208 | $ADDU $t_2,$at |
1209 | $ADDU $c_3,$t_2 | |
1210 | sltu $at,$c_3,$t_2 | |
1211 | $ADDU $c_1,$at | |
947716c1 AP |
1212 | mflo ($t_1,$a_1,$b_3) |
1213 | mfhi ($t_2,$a_1,$b_3) | |
da4d239d AP |
1214 | $ADDU $c_2,$t_1 |
1215 | sltu $at,$c_2,$t_1 | |
947716c1 | 1216 | $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); |
da4d239d AP |
1217 | $ADDU $t_2,$at |
1218 | $ADDU $c_3,$t_2 | |
1219 | sltu $at,$c_3,$t_2 | |
1220 | $ADDU $c_1,$at | |
947716c1 AP |
1221 | mflo ($t_1,$a_0,$b_4) |
1222 | mfhi ($t_2,$a_0,$b_4) | |
da4d239d AP |
1223 | $ADDU $c_2,$t_1 |
1224 | sltu $at,$c_2,$t_1 | |
947716c1 | 1225 | $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); |
da4d239d AP |
1226 | $ADDU $t_2,$at |
1227 | $ADDU $c_3,$t_2 | |
1228 | sltu $at,$c_3,$t_2 | |
1229 | $ADDU $c_1,$at | |
1230 | $ST $c_2,4*$BNSZ($a0) # r[4]=c2; | |
1231 | ||
947716c1 AP |
1232 | mflo ($t_1,$a_0,$b_5) |
1233 | mfhi ($t_2,$a_0,$b_5) | |
da4d239d AP |
1234 | $ADDU $c_3,$t_1 |
1235 | sltu $at,$c_3,$t_1 | |
947716c1 | 1236 | $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); |
da4d239d AP |
1237 | $ADDU $t_2,$at |
1238 | $ADDU $c_1,$t_2 | |
1239 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1240 | mflo ($t_1,$a_1,$b_4) |
1241 | mfhi ($t_2,$a_1,$b_4) | |
da4d239d AP |
1242 | $ADDU $c_3,$t_1 |
1243 | sltu $at,$c_3,$t_1 | |
947716c1 | 1244 | $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); |
da4d239d AP |
1245 | $ADDU $t_2,$at |
1246 | $ADDU $c_1,$t_2 | |
1247 | sltu $at,$c_1,$t_2 | |
1248 | $ADDU $c_2,$at | |
947716c1 AP |
1249 | mflo ($t_1,$a_2,$b_3) |
1250 | mfhi ($t_2,$a_2,$b_3) | |
da4d239d AP |
1251 | $ADDU $c_3,$t_1 |
1252 | sltu $at,$c_3,$t_1 | |
947716c1 | 1253 | $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); |
da4d239d AP |
1254 | $ADDU $t_2,$at |
1255 | $ADDU $c_1,$t_2 | |
1256 | sltu $at,$c_1,$t_2 | |
1257 | $ADDU $c_2,$at | |
947716c1 AP |
1258 | mflo ($t_1,$a_3,$b_2) |
1259 | mfhi ($t_2,$a_3,$b_2) | |
da4d239d AP |
1260 | $ADDU $c_3,$t_1 |
1261 | sltu $at,$c_3,$t_1 | |
947716c1 | 1262 | $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); |
da4d239d AP |
1263 | $ADDU $t_2,$at |
1264 | $ADDU $c_1,$t_2 | |
1265 | sltu $at,$c_1,$t_2 | |
1266 | $ADDU $c_2,$at | |
947716c1 AP |
1267 | mflo ($t_1,$a_4,$b_1) |
1268 | mfhi ($t_2,$a_4,$b_1) | |
da4d239d AP |
1269 | $ADDU $c_3,$t_1 |
1270 | sltu $at,$c_3,$t_1 | |
947716c1 | 1271 | $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); |
da4d239d AP |
1272 | $ADDU $t_2,$at |
1273 | $ADDU $c_1,$t_2 | |
1274 | sltu $at,$c_1,$t_2 | |
1275 | $ADDU $c_2,$at | |
947716c1 AP |
1276 | mflo ($t_1,$a_5,$b_0) |
1277 | mfhi ($t_2,$a_5,$b_0) | |
da4d239d AP |
1278 | $ADDU $c_3,$t_1 |
1279 | sltu $at,$c_3,$t_1 | |
947716c1 | 1280 | $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); |
da4d239d AP |
1281 | $ADDU $t_2,$at |
1282 | $ADDU $c_1,$t_2 | |
1283 | sltu $at,$c_1,$t_2 | |
1284 | $ADDU $c_2,$at | |
1285 | $ST $c_3,5*$BNSZ($a0) # r[5]=c3; | |
1286 | ||
947716c1 AP |
1287 | mflo ($t_1,$a_6,$b_0) |
1288 | mfhi ($t_2,$a_6,$b_0) | |
da4d239d AP |
1289 | $ADDU $c_1,$t_1 |
1290 | sltu $at,$c_1,$t_1 | |
947716c1 | 1291 | $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); |
da4d239d AP |
1292 | $ADDU $t_2,$at |
1293 | $ADDU $c_2,$t_2 | |
1294 | sltu $c_3,$c_2,$t_2 | |
947716c1 AP |
1295 | mflo ($t_1,$a_5,$b_1) |
1296 | mfhi ($t_2,$a_5,$b_1) | |
da4d239d AP |
1297 | $ADDU $c_1,$t_1 |
1298 | sltu $at,$c_1,$t_1 | |
947716c1 | 1299 | $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); |
da4d239d AP |
1300 | $ADDU $t_2,$at |
1301 | $ADDU $c_2,$t_2 | |
1302 | sltu $at,$c_2,$t_2 | |
1303 | $ADDU $c_3,$at | |
947716c1 AP |
1304 | mflo ($t_1,$a_4,$b_2) |
1305 | mfhi ($t_2,$a_4,$b_2) | |
da4d239d AP |
1306 | $ADDU $c_1,$t_1 |
1307 | sltu $at,$c_1,$t_1 | |
947716c1 | 1308 | $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); |
da4d239d AP |
1309 | $ADDU $t_2,$at |
1310 | $ADDU $c_2,$t_2 | |
1311 | sltu $at,$c_2,$t_2 | |
1312 | $ADDU $c_3,$at | |
947716c1 AP |
1313 | mflo ($t_1,$a_3,$b_3) |
1314 | mfhi ($t_2,$a_3,$b_3) | |
da4d239d AP |
1315 | $ADDU $c_1,$t_1 |
1316 | sltu $at,$c_1,$t_1 | |
947716c1 | 1317 | $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); |
da4d239d AP |
1318 | $ADDU $t_2,$at |
1319 | $ADDU $c_2,$t_2 | |
1320 | sltu $at,$c_2,$t_2 | |
1321 | $ADDU $c_3,$at | |
947716c1 AP |
1322 | mflo ($t_1,$a_2,$b_4) |
1323 | mfhi ($t_2,$a_2,$b_4) | |
da4d239d AP |
1324 | $ADDU $c_1,$t_1 |
1325 | sltu $at,$c_1,$t_1 | |
947716c1 | 1326 | $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); |
da4d239d AP |
1327 | $ADDU $t_2,$at |
1328 | $ADDU $c_2,$t_2 | |
1329 | sltu $at,$c_2,$t_2 | |
1330 | $ADDU $c_3,$at | |
947716c1 AP |
1331 | mflo ($t_1,$a_1,$b_5) |
1332 | mfhi ($t_2,$a_1,$b_5) | |
da4d239d AP |
1333 | $ADDU $c_1,$t_1 |
1334 | sltu $at,$c_1,$t_1 | |
947716c1 | 1335 | $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); |
da4d239d AP |
1336 | $ADDU $t_2,$at |
1337 | $ADDU $c_2,$t_2 | |
1338 | sltu $at,$c_2,$t_2 | |
1339 | $ADDU $c_3,$at | |
947716c1 AP |
1340 | mflo ($t_1,$a_0,$b_6) |
1341 | mfhi ($t_2,$a_0,$b_6) | |
da4d239d AP |
1342 | $ADDU $c_1,$t_1 |
1343 | sltu $at,$c_1,$t_1 | |
947716c1 | 1344 | $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); |
da4d239d AP |
1345 | $ADDU $t_2,$at |
1346 | $ADDU $c_2,$t_2 | |
1347 | sltu $at,$c_2,$t_2 | |
1348 | $ADDU $c_3,$at | |
1349 | $ST $c_1,6*$BNSZ($a0) # r[6]=c1; | |
1350 | ||
947716c1 AP |
1351 | mflo ($t_1,$a_0,$b_7) |
1352 | mfhi ($t_2,$a_0,$b_7) | |
da4d239d AP |
1353 | $ADDU $c_2,$t_1 |
1354 | sltu $at,$c_2,$t_1 | |
947716c1 | 1355 | $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); |
da4d239d AP |
1356 | $ADDU $t_2,$at |
1357 | $ADDU $c_3,$t_2 | |
1358 | sltu $c_1,$c_3,$t_2 | |
947716c1 AP |
1359 | mflo ($t_1,$a_1,$b_6) |
1360 | mfhi ($t_2,$a_1,$b_6) | |
da4d239d AP |
1361 | $ADDU $c_2,$t_1 |
1362 | sltu $at,$c_2,$t_1 | |
947716c1 | 1363 | $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); |
da4d239d AP |
1364 | $ADDU $t_2,$at |
1365 | $ADDU $c_3,$t_2 | |
1366 | sltu $at,$c_3,$t_2 | |
1367 | $ADDU $c_1,$at | |
947716c1 AP |
1368 | mflo ($t_1,$a_2,$b_5) |
1369 | mfhi ($t_2,$a_2,$b_5) | |
da4d239d AP |
1370 | $ADDU $c_2,$t_1 |
1371 | sltu $at,$c_2,$t_1 | |
947716c1 | 1372 | $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); |
da4d239d AP |
1373 | $ADDU $t_2,$at |
1374 | $ADDU $c_3,$t_2 | |
1375 | sltu $at,$c_3,$t_2 | |
1376 | $ADDU $c_1,$at | |
947716c1 AP |
1377 | mflo ($t_1,$a_3,$b_4) |
1378 | mfhi ($t_2,$a_3,$b_4) | |
da4d239d AP |
1379 | $ADDU $c_2,$t_1 |
1380 | sltu $at,$c_2,$t_1 | |
947716c1 | 1381 | $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); |
da4d239d AP |
1382 | $ADDU $t_2,$at |
1383 | $ADDU $c_3,$t_2 | |
1384 | sltu $at,$c_3,$t_2 | |
1385 | $ADDU $c_1,$at | |
947716c1 AP |
1386 | mflo ($t_1,$a_4,$b_3) |
1387 | mfhi ($t_2,$a_4,$b_3) | |
da4d239d AP |
1388 | $ADDU $c_2,$t_1 |
1389 | sltu $at,$c_2,$t_1 | |
947716c1 | 1390 | $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); |
da4d239d AP |
1391 | $ADDU $t_2,$at |
1392 | $ADDU $c_3,$t_2 | |
1393 | sltu $at,$c_3,$t_2 | |
1394 | $ADDU $c_1,$at | |
947716c1 AP |
1395 | mflo ($t_1,$a_5,$b_2) |
1396 | mfhi ($t_2,$a_5,$b_2) | |
da4d239d AP |
1397 | $ADDU $c_2,$t_1 |
1398 | sltu $at,$c_2,$t_1 | |
947716c1 | 1399 | $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); |
da4d239d AP |
1400 | $ADDU $t_2,$at |
1401 | $ADDU $c_3,$t_2 | |
1402 | sltu $at,$c_3,$t_2 | |
1403 | $ADDU $c_1,$at | |
947716c1 AP |
1404 | mflo ($t_1,$a_6,$b_1) |
1405 | mfhi ($t_2,$a_6,$b_1) | |
da4d239d AP |
1406 | $ADDU $c_2,$t_1 |
1407 | sltu $at,$c_2,$t_1 | |
947716c1 | 1408 | $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); |
da4d239d AP |
1409 | $ADDU $t_2,$at |
1410 | $ADDU $c_3,$t_2 | |
1411 | sltu $at,$c_3,$t_2 | |
1412 | $ADDU $c_1,$at | |
947716c1 AP |
1413 | mflo ($t_1,$a_7,$b_0) |
1414 | mfhi ($t_2,$a_7,$b_0) | |
da4d239d AP |
1415 | $ADDU $c_2,$t_1 |
1416 | sltu $at,$c_2,$t_1 | |
947716c1 | 1417 | $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); |
da4d239d AP |
1418 | $ADDU $t_2,$at |
1419 | $ADDU $c_3,$t_2 | |
1420 | sltu $at,$c_3,$t_2 | |
1421 | $ADDU $c_1,$at | |
1422 | $ST $c_2,7*$BNSZ($a0) # r[7]=c2; | |
1423 | ||
947716c1 AP |
1424 | mflo ($t_1,$a_7,$b_1) |
1425 | mfhi ($t_2,$a_7,$b_1) | |
da4d239d AP |
1426 | $ADDU $c_3,$t_1 |
1427 | sltu $at,$c_3,$t_1 | |
947716c1 | 1428 | $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); |
da4d239d AP |
1429 | $ADDU $t_2,$at |
1430 | $ADDU $c_1,$t_2 | |
1431 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1432 | mflo ($t_1,$a_6,$b_2) |
1433 | mfhi ($t_2,$a_6,$b_2) | |
da4d239d AP |
1434 | $ADDU $c_3,$t_1 |
1435 | sltu $at,$c_3,$t_1 | |
947716c1 | 1436 | $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); |
da4d239d AP |
1437 | $ADDU $t_2,$at |
1438 | $ADDU $c_1,$t_2 | |
1439 | sltu $at,$c_1,$t_2 | |
1440 | $ADDU $c_2,$at | |
947716c1 AP |
1441 | mflo ($t_1,$a_5,$b_3) |
1442 | mfhi ($t_2,$a_5,$b_3) | |
da4d239d AP |
1443 | $ADDU $c_3,$t_1 |
1444 | sltu $at,$c_3,$t_1 | |
947716c1 | 1445 | $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2); |
da4d239d AP |
1446 | $ADDU $t_2,$at |
1447 | $ADDU $c_1,$t_2 | |
1448 | sltu $at,$c_1,$t_2 | |
1449 | $ADDU $c_2,$at | |
947716c1 AP |
1450 | mflo ($t_1,$a_4,$b_4) |
1451 | mfhi ($t_2,$a_4,$b_4) | |
da4d239d AP |
1452 | $ADDU $c_3,$t_1 |
1453 | sltu $at,$c_3,$t_1 | |
947716c1 | 1454 | $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2); |
da4d239d AP |
1455 | $ADDU $t_2,$at |
1456 | $ADDU $c_1,$t_2 | |
1457 | sltu $at,$c_1,$t_2 | |
1458 | $ADDU $c_2,$at | |
947716c1 AP |
1459 | mflo ($t_1,$a_3,$b_5) |
1460 | mfhi ($t_2,$a_3,$b_5) | |
da4d239d AP |
1461 | $ADDU $c_3,$t_1 |
1462 | sltu $at,$c_3,$t_1 | |
947716c1 | 1463 | $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); |
da4d239d AP |
1464 | $ADDU $t_2,$at |
1465 | $ADDU $c_1,$t_2 | |
1466 | sltu $at,$c_1,$t_2 | |
1467 | $ADDU $c_2,$at | |
947716c1 AP |
1468 | mflo ($t_1,$a_2,$b_6) |
1469 | mfhi ($t_2,$a_2,$b_6) | |
da4d239d AP |
1470 | $ADDU $c_3,$t_1 |
1471 | sltu $at,$c_3,$t_1 | |
947716c1 | 1472 | $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); |
da4d239d AP |
1473 | $ADDU $t_2,$at |
1474 | $ADDU $c_1,$t_2 | |
1475 | sltu $at,$c_1,$t_2 | |
1476 | $ADDU $c_2,$at | |
947716c1 AP |
1477 | mflo ($t_1,$a_1,$b_7) |
1478 | mfhi ($t_2,$a_1,$b_7) | |
da4d239d AP |
1479 | $ADDU $c_3,$t_1 |
1480 | sltu $at,$c_3,$t_1 | |
947716c1 | 1481 | $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); |
da4d239d AP |
1482 | $ADDU $t_2,$at |
1483 | $ADDU $c_1,$t_2 | |
1484 | sltu $at,$c_1,$t_2 | |
1485 | $ADDU $c_2,$at | |
1486 | $ST $c_3,8*$BNSZ($a0) # r[8]=c3; | |
1487 | ||
947716c1 AP |
1488 | mflo ($t_1,$a_2,$b_7) |
1489 | mfhi ($t_2,$a_2,$b_7) | |
da4d239d AP |
1490 | $ADDU $c_1,$t_1 |
1491 | sltu $at,$c_1,$t_1 | |
947716c1 | 1492 | $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); |
da4d239d AP |
1493 | $ADDU $t_2,$at |
1494 | $ADDU $c_2,$t_2 | |
1495 | sltu $c_3,$c_2,$t_2 | |
947716c1 AP |
1496 | mflo ($t_1,$a_3,$b_6) |
1497 | mfhi ($t_2,$a_3,$b_6) | |
da4d239d AP |
1498 | $ADDU $c_1,$t_1 |
1499 | sltu $at,$c_1,$t_1 | |
947716c1 | 1500 | $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3); |
da4d239d AP |
1501 | $ADDU $t_2,$at |
1502 | $ADDU $c_2,$t_2 | |
1503 | sltu $at,$c_2,$t_2 | |
1504 | $ADDU $c_3,$at | |
947716c1 AP |
1505 | mflo ($t_1,$a_4,$b_5) |
1506 | mfhi ($t_2,$a_4,$b_5) | |
da4d239d AP |
1507 | $ADDU $c_1,$t_1 |
1508 | sltu $at,$c_1,$t_1 | |
947716c1 | 1509 | $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3); |
da4d239d AP |
1510 | $ADDU $t_2,$at |
1511 | $ADDU $c_2,$t_2 | |
1512 | sltu $at,$c_2,$t_2 | |
1513 | $ADDU $c_3,$at | |
947716c1 AP |
1514 | mflo ($t_1,$a_5,$b_4) |
1515 | mfhi ($t_2,$a_5,$b_4) | |
da4d239d AP |
1516 | $ADDU $c_1,$t_1 |
1517 | sltu $at,$c_1,$t_1 | |
947716c1 | 1518 | $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3); |
da4d239d AP |
1519 | $ADDU $t_2,$at |
1520 | $ADDU $c_2,$t_2 | |
1521 | sltu $at,$c_2,$t_2 | |
1522 | $ADDU $c_3,$at | |
947716c1 AP |
1523 | mflo ($t_1,$a_6,$b_3) |
1524 | mfhi ($t_2,$a_6,$b_3) | |
da4d239d AP |
1525 | $ADDU $c_1,$t_1 |
1526 | sltu $at,$c_1,$t_1 | |
947716c1 | 1527 | $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3); |
da4d239d AP |
1528 | $ADDU $t_2,$at |
1529 | $ADDU $c_2,$t_2 | |
1530 | sltu $at,$c_2,$t_2 | |
1531 | $ADDU $c_3,$at | |
947716c1 AP |
1532 | mflo ($t_1,$a_7,$b_2) |
1533 | mfhi ($t_2,$a_7,$b_2) | |
da4d239d AP |
1534 | $ADDU $c_1,$t_1 |
1535 | sltu $at,$c_1,$t_1 | |
947716c1 | 1536 | $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1); |
da4d239d AP |
1537 | $ADDU $t_2,$at |
1538 | $ADDU $c_2,$t_2 | |
1539 | sltu $at,$c_2,$t_2 | |
1540 | $ADDU $c_3,$at | |
1541 | $ST $c_1,9*$BNSZ($a0) # r[9]=c1; | |
1542 | ||
947716c1 AP |
1543 | mflo ($t_1,$a_7,$b_3) |
1544 | mfhi ($t_2,$a_7,$b_3) | |
da4d239d AP |
1545 | $ADDU $c_2,$t_1 |
1546 | sltu $at,$c_2,$t_1 | |
947716c1 | 1547 | $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1); |
da4d239d AP |
1548 | $ADDU $t_2,$at |
1549 | $ADDU $c_3,$t_2 | |
1550 | sltu $c_1,$c_3,$t_2 | |
947716c1 AP |
1551 | mflo ($t_1,$a_6,$b_4) |
1552 | mfhi ($t_2,$a_6,$b_4) | |
da4d239d AP |
1553 | $ADDU $c_2,$t_1 |
1554 | sltu $at,$c_2,$t_1 | |
947716c1 | 1555 | $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1); |
da4d239d AP |
1556 | $ADDU $t_2,$at |
1557 | $ADDU $c_3,$t_2 | |
1558 | sltu $at,$c_3,$t_2 | |
1559 | $ADDU $c_1,$at | |
947716c1 AP |
1560 | mflo ($t_1,$a_5,$b_5) |
1561 | mfhi ($t_2,$a_5,$b_5) | |
da4d239d AP |
1562 | $ADDU $c_2,$t_1 |
1563 | sltu $at,$c_2,$t_1 | |
947716c1 | 1564 | $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1); |
da4d239d AP |
1565 | $ADDU $t_2,$at |
1566 | $ADDU $c_3,$t_2 | |
1567 | sltu $at,$c_3,$t_2 | |
1568 | $ADDU $c_1,$at | |
947716c1 AP |
1569 | mflo ($t_1,$a_4,$b_6) |
1570 | mfhi ($t_2,$a_4,$b_6) | |
da4d239d AP |
1571 | $ADDU $c_2,$t_1 |
1572 | sltu $at,$c_2,$t_1 | |
947716c1 | 1573 | $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1); |
da4d239d AP |
1574 | $ADDU $t_2,$at |
1575 | $ADDU $c_3,$t_2 | |
1576 | sltu $at,$c_3,$t_2 | |
1577 | $ADDU $c_1,$at | |
947716c1 AP |
1578 | mflo ($t_1,$a_3,$b_7) |
1579 | mfhi ($t_2,$a_3,$b_7) | |
da4d239d AP |
1580 | $ADDU $c_2,$t_1 |
1581 | sltu $at,$c_2,$t_1 | |
947716c1 | 1582 | $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2); |
da4d239d AP |
1583 | $ADDU $t_2,$at |
1584 | $ADDU $c_3,$t_2 | |
1585 | sltu $at,$c_3,$t_2 | |
1586 | $ADDU $c_1,$at | |
1587 | $ST $c_2,10*$BNSZ($a0) # r[10]=c2; | |
1588 | ||
947716c1 AP |
1589 | mflo ($t_1,$a_4,$b_7) |
1590 | mfhi ($t_2,$a_4,$b_7) | |
da4d239d AP |
1591 | $ADDU $c_3,$t_1 |
1592 | sltu $at,$c_3,$t_1 | |
947716c1 | 1593 | $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2); |
da4d239d AP |
1594 | $ADDU $t_2,$at |
1595 | $ADDU $c_1,$t_2 | |
1596 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1597 | mflo ($t_1,$a_5,$b_6) |
1598 | mfhi ($t_2,$a_5,$b_6) | |
da4d239d AP |
1599 | $ADDU $c_3,$t_1 |
1600 | sltu $at,$c_3,$t_1 | |
947716c1 | 1601 | $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2); |
da4d239d AP |
1602 | $ADDU $t_2,$at |
1603 | $ADDU $c_1,$t_2 | |
1604 | sltu $at,$c_1,$t_2 | |
1605 | $ADDU $c_2,$at | |
947716c1 AP |
1606 | mflo ($t_1,$a_6,$b_5) |
1607 | mfhi ($t_2,$a_6,$b_5) | |
da4d239d AP |
1608 | $ADDU $c_3,$t_1 |
1609 | sltu $at,$c_3,$t_1 | |
947716c1 | 1610 | $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2); |
da4d239d AP |
1611 | $ADDU $t_2,$at |
1612 | $ADDU $c_1,$t_2 | |
1613 | sltu $at,$c_1,$t_2 | |
1614 | $ADDU $c_2,$at | |
947716c1 AP |
1615 | mflo ($t_1,$a_7,$b_4) |
1616 | mfhi ($t_2,$a_7,$b_4) | |
da4d239d AP |
1617 | $ADDU $c_3,$t_1 |
1618 | sltu $at,$c_3,$t_1 | |
947716c1 | 1619 | $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3); |
da4d239d AP |
1620 | $ADDU $t_2,$at |
1621 | $ADDU $c_1,$t_2 | |
1622 | sltu $at,$c_1,$t_2 | |
1623 | $ADDU $c_2,$at | |
1624 | $ST $c_3,11*$BNSZ($a0) # r[11]=c3; | |
1625 | ||
947716c1 AP |
1626 | mflo ($t_1,$a_7,$b_5) |
1627 | mfhi ($t_2,$a_7,$b_5) | |
da4d239d AP |
1628 | $ADDU $c_1,$t_1 |
1629 | sltu $at,$c_1,$t_1 | |
947716c1 | 1630 | $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3); |
da4d239d AP |
1631 | $ADDU $t_2,$at |
1632 | $ADDU $c_2,$t_2 | |
1633 | sltu $c_3,$c_2,$t_2 | |
947716c1 AP |
1634 | mflo ($t_1,$a_6,$b_6) |
1635 | mfhi ($t_2,$a_6,$b_6) | |
da4d239d AP |
1636 | $ADDU $c_1,$t_1 |
1637 | sltu $at,$c_1,$t_1 | |
947716c1 | 1638 | $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3); |
da4d239d AP |
1639 | $ADDU $t_2,$at |
1640 | $ADDU $c_2,$t_2 | |
1641 | sltu $at,$c_2,$t_2 | |
1642 | $ADDU $c_3,$at | |
947716c1 AP |
1643 | mflo ($t_1,$a_5,$b_7) |
1644 | mfhi ($t_2,$a_5,$b_7) | |
da4d239d AP |
1645 | $ADDU $c_1,$t_1 |
1646 | sltu $at,$c_1,$t_1 | |
947716c1 | 1647 | $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1); |
da4d239d AP |
1648 | $ADDU $t_2,$at |
1649 | $ADDU $c_2,$t_2 | |
1650 | sltu $at,$c_2,$t_2 | |
1651 | $ADDU $c_3,$at | |
1652 | $ST $c_1,12*$BNSZ($a0) # r[12]=c1; | |
1653 | ||
947716c1 AP |
1654 | mflo ($t_1,$a_6,$b_7) |
1655 | mfhi ($t_2,$a_6,$b_7) | |
da4d239d AP |
1656 | $ADDU $c_2,$t_1 |
1657 | sltu $at,$c_2,$t_1 | |
947716c1 | 1658 | $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1); |
da4d239d AP |
1659 | $ADDU $t_2,$at |
1660 | $ADDU $c_3,$t_2 | |
1661 | sltu $c_1,$c_3,$t_2 | |
947716c1 AP |
1662 | mflo ($t_1,$a_7,$b_6) |
1663 | mfhi ($t_2,$a_7,$b_6) | |
da4d239d AP |
1664 | $ADDU $c_2,$t_1 |
1665 | sltu $at,$c_2,$t_1 | |
947716c1 | 1666 | $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2); |
da4d239d AP |
1667 | $ADDU $t_2,$at |
1668 | $ADDU $c_3,$t_2 | |
1669 | sltu $at,$c_3,$t_2 | |
1670 | $ADDU $c_1,$at | |
1671 | $ST $c_2,13*$BNSZ($a0) # r[13]=c2; | |
1672 | ||
947716c1 AP |
1673 | mflo ($t_1,$a_7,$b_7) |
1674 | mfhi ($t_2,$a_7,$b_7) | |
da4d239d AP |
1675 | $ADDU $c_3,$t_1 |
1676 | sltu $at,$c_3,$t_1 | |
1677 | $ADDU $t_2,$at | |
1678 | $ADDU $c_1,$t_2 | |
1679 | $ST $c_3,14*$BNSZ($a0) # r[14]=c3; | |
1680 | $ST $c_1,15*$BNSZ($a0) # r[15]=c1; | |
1681 | ||
1682 | .set noreorder | |
1683 | ___ | |
# bn_mul_comba8 epilogue, NUBI flavours: reload $s0-$s5, $t0-$t3 and $gp from
# the 12-slot frame ($ra is not reloaded) and return.  The generated code is
# in .set noreorder mode here, so the $PTR_ADD that releases the frame is
# placed right after "jr $ra", i.e. in its delay slot.
1684 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1685 | $REG_L $s5,10*$SZREG($sp) | |
1686 | $REG_L $s4,9*$SZREG($sp) | |
1687 | $REG_L $s3,8*$SZREG($sp) | |
1688 | $REG_L $s2,7*$SZREG($sp) | |
1689 | $REG_L $s1,6*$SZREG($sp) | |
1690 | $REG_L $s0,5*$SZREG($sp) | |
1691 | $REG_L $t3,4*$SZREG($sp) | |
1692 | $REG_L $t2,3*$SZREG($sp) | |
1693 | $REG_L $t1,2*$SZREG($sp) | |
1694 | $REG_L $t0,1*$SZREG($sp) | |
1695 | $REG_L $gp,0*$SZREG($sp) | |
1696 | jr $ra | |
1697 | $PTR_ADD $sp,12*$SZREG | |
1698 | ___ | |
# All other flavours: reload $s0-$s5 from the 6-slot frame and return the
# same way (frame released by the instruction following "jr $ra").
1699 | $code.=<<___ if ($flavour !~ /nubi/i); | |
1700 | $REG_L $s5,5*$SZREG($sp) | |
1701 | $REG_L $s4,4*$SZREG($sp) | |
1702 | $REG_L $s3,3*$SZREG($sp) | |
1703 | $REG_L $s2,2*$SZREG($sp) | |
1704 | $REG_L $s1,1*$SZREG($sp) | |
1705 | $REG_L $s0,0*$SZREG($sp) | |
1706 | jr $ra | |
1707 | $PTR_ADD $sp,6*$SZREG | |
1708 | ___ | |
# Close bn_mul_comba8 and open bn_mul_comba4.  The body generated after this
# prologue loads four words through each of $a1 and $a2 and stores eight
# result words through $a0.
1709 | $code.=<<___; | |
1710 | .end bn_mul_comba8 | |
1711 | |
1712 | .align 5 | |
1713 | .globl bn_mul_comba4 | |
1714 | .ent bn_mul_comba4 | |
1715 | bn_mul_comba4: | |
1716 | ___ | |
# NUBI flavours only: 6-slot frame that saves $ra, $t0-$t3 and $gp.  No
# prologue is emitted for the other flavours.
1717 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1718 | .frame $sp,6*$SZREG,$ra | |
1719 | .mask 0x8000f008,-$SZREG | |
1720 | .set noreorder | |
1721 | $PTR_SUB $sp,6*$SZREG | |
1722 | $REG_S $ra,5*$SZREG($sp) | |
1723 | $REG_S $t3,4*$SZREG($sp) | |
1724 | $REG_S $t2,3*$SZREG($sp) | |
1725 | $REG_S $t1,2*$SZREG($sp) | |
1726 | $REG_S $t0,1*$SZREG($sp) | |
1727 | $REG_S $gp,0*$SZREG($sp) | |
1728 | ___ | |
1729 | $code.=<<___; | |
1730 | .set reorder | |
1731 | $LD $a_0,0($a1) | |
1732 | $LD $b_0,0($a2) | |
1733 | $LD $a_1,$BNSZ($a1) | |
1734 | $LD $a_2,2*$BNSZ($a1) | |
947716c1 | 1735 | $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); |
da4d239d AP |
1736 | $LD $a_3,3*$BNSZ($a1) |
1737 | $LD $b_1,$BNSZ($a2) | |
1738 | $LD $b_2,2*$BNSZ($a2) | |
1739 | $LD $b_3,3*$BNSZ($a2) | |
947716c1 AP |
1740 | mflo ($c_1,$a_0,$b_0) |
1741 | mfhi ($c_2,$a_0,$b_0) | |
da4d239d AP |
1742 | $ST $c_1,0($a0) |
1743 | ||
947716c1 AP |
1744 | $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); |
1745 | mflo ($t_1,$a_0,$b_1) | |
1746 | mfhi ($t_2,$a_0,$b_1) | |
da4d239d AP |
1747 | $ADDU $c_2,$t_1 |
1748 | sltu $at,$c_2,$t_1 | |
947716c1 | 1749 | $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); |
da4d239d | 1750 | $ADDU $c_3,$t_2,$at |
947716c1 AP |
1751 | mflo ($t_1,$a_1,$b_0) |
1752 | mfhi ($t_2,$a_1,$b_0) | |
da4d239d AP |
1753 | $ADDU $c_2,$t_1 |
1754 | sltu $at,$c_2,$t_1 | |
947716c1 | 1755 | $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); |
da4d239d AP |
1756 | $ADDU $t_2,$at |
1757 | $ADDU $c_3,$t_2 | |
1758 | sltu $c_1,$c_3,$t_2 | |
1759 | $ST $c_2,$BNSZ($a0) | |
1760 | ||
947716c1 AP |
1761 | mflo ($t_1,$a_2,$b_0) |
1762 | mfhi ($t_2,$a_2,$b_0) | |
da4d239d AP |
1763 | $ADDU $c_3,$t_1 |
1764 | sltu $at,$c_3,$t_1 | |
947716c1 | 1765 | $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); |
da4d239d AP |
1766 | $ADDU $t_2,$at |
1767 | $ADDU $c_1,$t_2 | |
947716c1 AP |
1768 | mflo ($t_1,$a_1,$b_1) |
1769 | mfhi ($t_2,$a_1,$b_1) | |
da4d239d AP |
1770 | $ADDU $c_3,$t_1 |
1771 | sltu $at,$c_3,$t_1 | |
947716c1 | 1772 | $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); |
da4d239d AP |
1773 | $ADDU $t_2,$at |
1774 | $ADDU $c_1,$t_2 | |
1775 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1776 | mflo ($t_1,$a_0,$b_2) |
1777 | mfhi ($t_2,$a_0,$b_2) | |
da4d239d AP |
1778 | $ADDU $c_3,$t_1 |
1779 | sltu $at,$c_3,$t_1 | |
947716c1 | 1780 | $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); |
da4d239d AP |
1781 | $ADDU $t_2,$at |
1782 | $ADDU $c_1,$t_2 | |
1783 | sltu $at,$c_1,$t_2 | |
1784 | $ADDU $c_2,$at | |
1785 | $ST $c_3,2*$BNSZ($a0) | |
1786 | ||
947716c1 AP |
1787 | mflo ($t_1,$a_0,$b_3) |
1788 | mfhi ($t_2,$a_0,$b_3) | |
da4d239d AP |
1789 | $ADDU $c_1,$t_1 |
1790 | sltu $at,$c_1,$t_1 | |
947716c1 | 1791 | $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); |
da4d239d AP |
1792 | $ADDU $t_2,$at |
1793 | $ADDU $c_2,$t_2 | |
1794 | sltu $c_3,$c_2,$t_2 | |
947716c1 AP |
1795 | mflo ($t_1,$a_1,$b_2) |
1796 | mfhi ($t_2,$a_1,$b_2) | |
da4d239d AP |
1797 | $ADDU $c_1,$t_1 |
1798 | sltu $at,$c_1,$t_1 | |
947716c1 | 1799 | $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); |
da4d239d AP |
1800 | $ADDU $t_2,$at |
1801 | $ADDU $c_2,$t_2 | |
1802 | sltu $at,$c_2,$t_2 | |
1803 | $ADDU $c_3,$at | |
947716c1 AP |
1804 | mflo ($t_1,$a_2,$b_1) |
1805 | mfhi ($t_2,$a_2,$b_1) | |
da4d239d AP |
1806 | $ADDU $c_1,$t_1 |
1807 | sltu $at,$c_1,$t_1 | |
947716c1 | 1808 | $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); |
da4d239d AP |
1809 | $ADDU $t_2,$at |
1810 | $ADDU $c_2,$t_2 | |
1811 | sltu $at,$c_2,$t_2 | |
1812 | $ADDU $c_3,$at | |
947716c1 AP |
1813 | mflo ($t_1,$a_3,$b_0) |
1814 | mfhi ($t_2,$a_3,$b_0) | |
da4d239d AP |
1815 | $ADDU $c_1,$t_1 |
1816 | sltu $at,$c_1,$t_1 | |
947716c1 | 1817 | $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); |
da4d239d AP |
1818 | $ADDU $t_2,$at |
1819 | $ADDU $c_2,$t_2 | |
1820 | sltu $at,$c_2,$t_2 | |
1821 | $ADDU $c_3,$at | |
1822 | $ST $c_1,3*$BNSZ($a0) | |
1823 | ||
947716c1 AP |
1824 | mflo ($t_1,$a_3,$b_1) |
1825 | mfhi ($t_2,$a_3,$b_1) | |
da4d239d AP |
1826 | $ADDU $c_2,$t_1 |
1827 | sltu $at,$c_2,$t_1 | |
947716c1 | 1828 | $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); |
da4d239d AP |
1829 | $ADDU $t_2,$at |
1830 | $ADDU $c_3,$t_2 | |
1831 | sltu $c_1,$c_3,$t_2 | |
947716c1 AP |
1832 | mflo ($t_1,$a_2,$b_2) |
1833 | mfhi ($t_2,$a_2,$b_2) | |
da4d239d AP |
1834 | $ADDU $c_2,$t_1 |
1835 | sltu $at,$c_2,$t_1 | |
947716c1 | 1836 | $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); |
da4d239d AP |
1837 | $ADDU $t_2,$at |
1838 | $ADDU $c_3,$t_2 | |
1839 | sltu $at,$c_3,$t_2 | |
1840 | $ADDU $c_1,$at | |
947716c1 AP |
1841 | mflo ($t_1,$a_1,$b_3) |
1842 | mfhi ($t_2,$a_1,$b_3) | |
da4d239d AP |
1843 | $ADDU $c_2,$t_1 |
1844 | sltu $at,$c_2,$t_1 | |
947716c1 | 1845 | $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); |
da4d239d AP |
1846 | $ADDU $t_2,$at |
1847 | $ADDU $c_3,$t_2 | |
1848 | sltu $at,$c_3,$t_2 | |
1849 | $ADDU $c_1,$at | |
1850 | $ST $c_2,4*$BNSZ($a0) | |
1851 | ||
947716c1 AP |
1852 | mflo ($t_1,$a_2,$b_3) |
1853 | mfhi ($t_2,$a_2,$b_3) | |
da4d239d AP |
1854 | $ADDU $c_3,$t_1 |
1855 | sltu $at,$c_3,$t_1 | |
947716c1 | 1856 | $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); |
da4d239d AP |
1857 | $ADDU $t_2,$at |
1858 | $ADDU $c_1,$t_2 | |
1859 | sltu $c_2,$c_1,$t_2 | |
947716c1 AP |
1860 | mflo ($t_1,$a_3,$b_2) |
1861 | mfhi ($t_2,$a_3,$b_2) | |
da4d239d AP |
1862 | $ADDU $c_3,$t_1 |
1863 | sltu $at,$c_3,$t_1 | |
947716c1 | 1864 | $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); |
da4d239d AP |
1865 | $ADDU $t_2,$at |
1866 | $ADDU $c_1,$t_2 | |
1867 | sltu $at,$c_1,$t_2 | |
1868 | $ADDU $c_2,$at | |
1869 | $ST $c_3,5*$BNSZ($a0) | |
1870 | ||
947716c1 AP |
1871 | mflo ($t_1,$a_3,$b_3) |
1872 | mfhi ($t_2,$a_3,$b_3) | |
da4d239d AP |
1873 | $ADDU $c_1,$t_1 |
1874 | sltu $at,$c_1,$t_1 | |
1875 | $ADDU $t_2,$at | |
1876 | $ADDU $c_2,$t_2 | |
1877 | $ST $c_1,6*$BNSZ($a0) | |
1878 | $ST $c_2,7*$BNSZ($a0) | |
1879 | ||
1880 | .set noreorder | |
1881 | ___ | |
# bn_mul_comba4 epilogue, NUBI flavours only: reload $t0-$t3 and $gp and
# release the 6-slot frame ($ra is not reloaded).
1882 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1883 | $REG_L $t3,4*$SZREG($sp) | |
1884 | $REG_L $t2,3*$SZREG($sp) | |
1885 | $REG_L $t1,2*$SZREG($sp) | |
1886 | $REG_L $t0,1*$SZREG($sp) | |
1887 | $REG_L $gp,0*$SZREG($sp) | |
1888 | $PTR_ADD $sp,6*$SZREG | |
1889 | ___ | |
# Common return sequence; the generated code is in .set noreorder mode here,
# so the explicit nop fills the slot after "jr $ra".
1890 | $code.=<<___; | |
1891 | jr $ra | |
1892 | nop | |
1893 | .end bn_mul_comba4 | |
1894 | ___ | |
1895 | ||
# Re-point a[4..7] at the registers that held b[0..3] in the multiplication
# routines; bn_sqr_comba8 below loads a[4]..a[7] into them.
1896 | ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); | |
1897 | ||
a7a44ba5 AP |
# add_c2($hi,$lo,$c0,$c1,$c2,$warm,$an,$bn)
#
# Append to $code the instruction sequence that adds the double-word value
# ($hi:$lo) *twice* into the three-word accumulator ($c2:$c1:$c0), while
# starting the multiplication needed by the next step.
#
#   $hi, $lo    registers holding the high/low half of the product to add;
#               both are clobbered as carry scratch and, on exit, receive
#               the result of the forward multiplication (mfhi/mflo);
#   $c0..$c2    accumulator registers, least significant word first;
#   $warm       false on the first call for a given accumulator rotation:
#               $c2 has no carry to keep yet and is overwritten; true on
#               later calls: the carry is added to the existing $c2;
#   $an, $bn    operands of the "forward multiplication", whose result is
#               consumed by the *next* step.
#
# The "$MULTU ($an,$bn)" and "mflo/mfhi (...)" operand forms are emitted
# verbatim; presumably a later pass outside this sub rewrites them into real
# instructions - TODO confirm.
#
# No prototype is declared: the sub takes eight arguments, so the former
# empty "()" prototype only allowed "&add_c2(...)" calls, which keep working
# unchanged.
sub add_c2 {
    my ($hi, $lo, $c0, $c1, $c2, $warm, $an, $bn) = @_;

    # c0 += 2*lo, collecting both carries; c1 += hi + first carry;
    # hi += second carry (ready to be added to c1 below).
    $code .= <<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___

    if ($warm) {
	# $c2 already carries state: accumulate the carry out of $c1.
	$code .= <<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
    } else {
	# First use of this accumulator rotation: initialise $c2.
	$code .= <<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
    }

    # Fold in the last carry and fetch the forward product.
    $code .= <<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}
1932 | ||
da4d239d AP |
# Start of the bn_sqr_comba8 generator: append the alignment directive,
# the .globl/.ent declarations and the entry label to $code.
1933 | $code.=<<___; |
1934 | |
1935 | .align 5 | |
1936 | .globl bn_sqr_comba8 | |
1937 | .ent bn_sqr_comba8 | |
1938 | bn_sqr_comba8: | |
1939 | ___ | |
# bn_sqr_comba8 prologue, emitted only when $flavour matches /nubi/i:
# open a 6*$SZREG stack frame and save $ra, $t3..$t0 and $gp in it.
1940 | $code.=<<___ if ($flavour =~ /nubi/i); | |
1941 | .frame $sp,6*$SZREG,$ra | |
1942 | .mask 0x8000f008,-$SZREG | |
1943 | .set noreorder | |
1944 | $PTR_SUB $sp,6*$SZREG | |
1945 | $REG_S $ra,5*$SZREG($sp) | |
1946 | $REG_S $t3,4*$SZREG($sp) | |
1947 | $REG_S $t2,3*$SZREG($sp) | |
1948 | $REG_S $t1,2*$SZREG($sp) | |
1949 | $REG_S $t0,1*$SZREG($sp) | |
1950 | $REG_S $gp,0*$SZREG($sp) | |
1951 | ___ | |
# bn_sqr_comba8, first part of the body:
#  - load the eight input words at offsets 0..7*$BNSZ from $a1 into
#    $a_0..$a_7 (the last four loads are issued after the first $MULTU);
#  - a[0]*a[0] goes to ($c_2:$c_1) and its low word is stored at 0($a0);
#  - a[0]*a[1] is doubled with shifts: slt captures the top bit of each
#    half before the $SLL ($c_1 receives the bit shifted out of the high
#    half, $a2 is used as scratch for the bit moved from low to high);
#  - the word at $BNSZ($a0) is stored and the a[2]*a[0] product is left in
#    ($t_2:$t_1) for the add_c2 call that follows this heredoc.
1952 | $code.=<<___; | |
1953 | .set reorder | |
1954 | $LD $a_0,0($a1) | |
1955 | $LD $a_1,$BNSZ($a1) | |
1956 | $LD $a_2,2*$BNSZ($a1) | |
1957 | $LD $a_3,3*$BNSZ($a1) | |
1958 | |
947716c1 | 1959 | $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); |
da4d239d AP |
1960 | $LD $a_4,4*$BNSZ($a1) |
1961 | $LD $a_5,5*$BNSZ($a1) | |
1962 | $LD $a_6,6*$BNSZ($a1) | |
1963 | $LD $a_7,7*$BNSZ($a1) | |
947716c1 AP |
1964 | mflo ($c_1,$a_0,$a_0) |
1965 | mfhi ($c_2,$a_0,$a_0) | |
da4d239d AP |
1966 | $ST $c_1,0($a0) |
1967 | |
947716c1 AP |
1968 | $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); |
1969 | mflo ($t_1,$a_0,$a_1) | |
1970 | mfhi ($t_2,$a_0,$a_1) | |
da4d239d AP |
1971 | slt $c_1,$t_2,$zero |
1972 | $SLL $t_2,1 | |
947716c1 | 1973 | $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); |
da4d239d AP |
1974 | slt $a2,$t_1,$zero |
1975 | $ADDU $t_2,$a2 | |
1976 | $SLL $t_1,1 | |
1977 | $ADDU $c_2,$t_1 | |
1978 | sltu $at,$c_2,$t_1 | |
1979 | $ADDU $c_3,$t_2,$at | |
1980 | $ST $c_2,$BNSZ($a0) | |
947716c1 AP |
1981 | mflo ($t_1,$a_2,$a_0) |
1982 | mfhi ($t_2,$a_2,$a_0) | |
a7a44ba5 AP |
1983 | ___ |
# bn_sqr_comba8, accumulation chain for the result words at offsets
# 2*$BNSZ .. 12*$BNSZ from $a0.
#
# Each add_c2 call adds the product currently held in ($t_2:$t_1) twice to
# the accumulator named by its 3rd..5th arguments and starts the
# multiplication of its last two arguments; the trailing comment on every
# call names that forward multiplication, not the product being added.
# The 6th argument ($warm) is 0 on the first call after a result word has
# been stored and 1 on the remaining calls before the next store.
#
# The heredocs in between add a product only once (these follow the
# forward multiplications of the form a[i]*a[i]), store finished result
# words with $ST and start the next multiplication with $MULTU.
1984 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
1985 | $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); | |
1986 | $code.=<<___; | |
da4d239d AP |
1987 | $ADDU $c_3,$t_1 |
1988 | sltu $at,$c_3,$t_1 | |
947716c1 | 1989 | $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); |
da4d239d AP |
1990 | $ADDU $t_2,$at |
1991 | $ADDU $c_1,$t_2 | |
1992 | sltu $at,$c_1,$t_2 | |
1993 | $ADDU $c_2,$at | |
1994 | $ST $c_3,2*$BNSZ($a0) | |
947716c1 AP |
1995 | mflo ($t_1,$a_0,$a_3) |
1996 | mfhi ($t_2,$a_0,$a_3) | |
a7a44ba5 AP |
1997 | ___ |
1998 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | |
1999 | $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); | |
2000 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2001 | $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); | |
2002 | $code.=<<___; | |
da4d239d | 2003 | $ST $c_1,3*$BNSZ($a0) |
a7a44ba5 AP |
2004 | ___ |
2005 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | |
2006 | $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); | |
2007 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | |
2008 | $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); | |
2009 | $code.=<<___; | |
da4d239d AP |
2010 | $ADDU $c_2,$t_1 |
2011 | sltu $at,$c_2,$t_1 | |
947716c1 | 2012 | $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2); |
da4d239d AP |
2013 | $ADDU $t_2,$at |
2014 | $ADDU $c_3,$t_2 | |
2015 | sltu $at,$c_3,$t_2 | |
2016 | $ADDU $c_1,$at | |
2017 | $ST $c_2,4*$BNSZ($a0) | |
947716c1 AP |
2018 | mflo ($t_1,$a_0,$a_5) |
2019 | mfhi ($t_2,$a_0,$a_5) | |
a7a44ba5 AP |
2020 | ___ |
2021 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
2022 | $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); | |
2023 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | |
2024 | $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); | |
2025 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | |
2026 | $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3); | |
2027 | $code.=<<___; | |
da4d239d | 2028 | $ST $c_3,5*$BNSZ($a0) |
a7a44ba5 AP |
2029 | ___ |
2030 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | |
2031 | $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); | |
2032 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2033 | $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); | |
2034 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2035 | $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); | |
2036 | $code.=<<___; | |
da4d239d AP |
2037 | $ADDU $c_1,$t_1 |
2038 | sltu $at,$c_1,$t_1 | |
947716c1 | 2039 | $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1); |
da4d239d AP |
2040 | $ADDU $t_2,$at |
2041 | $ADDU $c_2,$t_2 | |
2042 | sltu $at,$c_2,$t_2 | |
2043 | $ADDU $c_3,$at | |
2044 | $ST $c_1,6*$BNSZ($a0) | |
947716c1 AP |
2045 | mflo ($t_1,$a_0,$a_7) |
2046 | mfhi ($t_2,$a_0,$a_7) | |
a7a44ba5 AP |
2047 | ___ |
2048 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | |
2049 | $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); | |
2050 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | |
2051 | $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); | |
2052 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | |
2053 | $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); | |
2054 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | |
2055 | $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); | |
2056 | $code.=<<___; | |
da4d239d | 2057 | $ST $c_2,7*$BNSZ($a0) |
a7a44ba5 AP |
2058 | ___ |
2059 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
2060 | $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); | |
2061 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | |
2062 | $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); | |
2063 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | |
2064 | $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); | |
2065 | $code.=<<___; | |
da4d239d AP |
2066 | $ADDU $c_3,$t_1 |
2067 | sltu $at,$c_3,$t_1 | |
947716c1 | 2068 | $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3); |
da4d239d AP |
2069 | $ADDU $t_2,$at |
2070 | $ADDU $c_1,$t_2 | |
2071 | sltu $at,$c_1,$t_2 | |
2072 | $ADDU $c_2,$at | |
2073 | $ST $c_3,8*$BNSZ($a0) | |
947716c1 AP |
2074 | mflo ($t_1,$a_2,$a_7) |
2075 | mfhi ($t_2,$a_2,$a_7) | |
a7a44ba5 AP |
2076 | ___ |
2077 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | |
2078 | $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); | |
2079 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2080 | $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); | |
2081 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2082 | $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); | |
2083 | $code.=<<___; | |
da4d239d | 2084 | $ST $c_1,9*$BNSZ($a0) |
a7a44ba5 AP |
2085 | ___ |
2086 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | |
2087 | $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); | |
2088 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, | |
2089 | $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); | |
2090 | $code.=<<___; | |
da4d239d AP |
2091 | $ADDU $c_2,$t_1 |
2092 | sltu $at,$c_2,$t_1 | |
947716c1 | 2093 | $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2); |
da4d239d AP |
2094 | $ADDU $t_2,$at |
2095 | $ADDU $c_3,$t_2 | |
2096 | sltu $at,$c_3,$t_2 | |
2097 | $ADDU $c_1,$at | |
2098 | $ST $c_2,10*$BNSZ($a0) | |
947716c1 AP |
2099 | mflo ($t_1,$a_4,$a_7) |
2100 | mfhi ($t_2,$a_4,$a_7) | |
a7a44ba5 AP |
2101 | ___ |
2102 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
2103 | $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); | |
2104 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, | |
2105 | $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); | |
2106 | $code.=<<___; | |
da4d239d | 2107 | $ST $c_3,11*$BNSZ($a0) |
a7a44ba5 AP |
2108 | ___ |
2109 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | |
2110 | $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); | |
2111 | $code.=<<___; | |
da4d239d AP |
2112 | $ADDU $c_1,$t_1 |
2113 | sltu $at,$c_1,$t_1 | |
947716c1 | 2114 | $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1); |
da4d239d AP |
2115 | $ADDU $t_2,$at |
2116 | $ADDU $c_2,$t_2 | |
2117 | sltu $at,$c_2,$t_2 | |
2118 | $ADDU $c_3,$at | |
2119 | $ST $c_1,12*$BNSZ($a0) | |
947716c1 AP |
2120 | mflo ($t_1,$a_6,$a_7) |
2121 | mfhi ($t_2,$a_6,$a_7) | |
a7a44ba5 AP |
2122 | ___ |
2123 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | |
2124 | $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); | |
# bn_sqr_comba8, end of the body: store the word at 13*$BNSZ($a0), add the
# last product in ($t_2:$t_1) once into ($c_1:$c_3), store the words at
# 14*$BNSZ and 15*$BNSZ, then switch the assembler to ".set noreorder".
2125 | $code.=<<___; | |
da4d239d AP |
2126 | $ST $c_2,13*$BNSZ($a0) |
2127 | |
da4d239d AP |
2128 | $ADDU $c_3,$t_1 |
2129 | sltu $at,$c_3,$t_1 | |
2130 | $ADDU $t_2,$at | |
2131 | $ADDU $c_1,$t_2 | |
2132 | $ST $c_3,14*$BNSZ($a0) | |
2133 | $ST $c_1,15*$BNSZ($a0) | |
2134 | |
2135 | .set noreorder | |
2136 | ___ | |
# bn_sqr_comba8 epilogue, emitted only when $flavour matches /nubi/i:
# reload $t3..$t0 and $gp from the frame and release its 6*$SZREG bytes.
# Note that no reload of $ra is emitted here.
2137 | $code.=<<___ if ($flavour =~ /nubi/i); | |
2138 | $REG_L $t3,4*$SZREG($sp) | |
2139 | $REG_L $t2,3*$SZREG($sp) | |
2140 | $REG_L $t1,2*$SZREG($sp) | |
2141 | $REG_L $t0,1*$SZREG($sp) | |
2142 | $REG_L $gp,0*$SZREG($sp) | |
2143 | $PTR_ADD $sp,6*$SZREG | |
2144 | ___ | |
# One heredoc covers two things: the return of bn_sqr_comba8 ("jr $ra"
# followed by "nop", then ".end"), and the start of bn_sqr_comba4
# (alignment, .globl/.ent declarations and the entry label).
2145 | $code.=<<___; | |
2146 | jr $ra | |
2147 | nop | |
2148 | .end bn_sqr_comba8 | |
2149 | |
2150 | .align 5 | |
2151 | .globl bn_sqr_comba4 | |
2152 | .ent bn_sqr_comba4 | |
2153 | bn_sqr_comba4: | |
2154 | ___ | |
# bn_sqr_comba4 prologue, emitted only when $flavour matches /nubi/i:
# open a 6*$SZREG stack frame and save $ra, $t3..$t0 and $gp in it.
2155 | $code.=<<___ if ($flavour =~ /nubi/i); | |
2156 | .frame $sp,6*$SZREG,$ra | |
2157 | .mask 0x8000f008,-$SZREG | |
2158 | .set noreorder | |
2159 | $PTR_SUB $sp,6*$SZREG | |
2160 | $REG_S $ra,5*$SZREG($sp) | |
2161 | $REG_S $t3,4*$SZREG($sp) | |
2162 | $REG_S $t2,3*$SZREG($sp) | |
2163 | $REG_S $t1,2*$SZREG($sp) | |
2164 | $REG_S $t0,1*$SZREG($sp) | |
2165 | $REG_S $gp,0*$SZREG($sp) | |
2166 | ___ | |
# bn_sqr_comba4, first part of the body:
#  - load the four input words at offsets 0..3*$BNSZ from $a1 into
#    $a_0..$a_3 (the last two loads are issued after the first $MULTU);
#  - a[0]*a[0] goes to ($c_2:$c_1) and its low word is stored at 0($a0);
#  - a[0]*a[1] is doubled with shifts: slt captures the top bit of each
#    half before the $SLL ($c_1 receives the bit shifted out of the high
#    half, $a2 is used as scratch for the bit moved from low to high);
#  - the word at $BNSZ($a0) is stored and the a[2]*a[0] product is left in
#    ($t_2:$t_1) for the add_c2 call that follows this heredoc.
2167 | $code.=<<___; | |
2168 | .set reorder | |
2169 | $LD $a_0,0($a1) | |
2170 | $LD $a_1,$BNSZ($a1) | |
947716c1 | 2171 | $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); |
da4d239d AP |
2172 | $LD $a_2,2*$BNSZ($a1) |
2173 | $LD $a_3,3*$BNSZ($a1) | |
947716c1 AP |
2174 | mflo ($c_1,$a_0,$a_0) |
2175 | mfhi ($c_2,$a_0,$a_0) | |
da4d239d AP |
2176 | $ST $c_1,0($a0) |
2177 | |
947716c1 AP |
2178 | $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); |
2179 | mflo ($t_1,$a_0,$a_1) | |
2180 | mfhi ($t_2,$a_0,$a_1) | |
da4d239d AP |
2181 | slt $c_1,$t_2,$zero |
2182 | $SLL $t_2,1 | |
947716c1 | 2183 | $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); |
da4d239d AP |
2184 | slt $a2,$t_1,$zero |
2185 | $ADDU $t_2,$a2 | |
2186 | $SLL $t_1,1 | |
2187 | $ADDU $c_2,$t_1 | |
2188 | sltu $at,$c_2,$t_1 | |
2189 | $ADDU $c_3,$t_2,$at | |
2190 | $ST $c_2,$BNSZ($a0) | |
947716c1 AP |
2191 | mflo ($t_1,$a_2,$a_0) |
2192 | mfhi ($t_2,$a_2,$a_0) | |
a7a44ba5 AP |
2193 | ___ |
# bn_sqr_comba4, accumulation chain for the result words at offsets
# 2*$BNSZ .. 4*$BNSZ from $a0.
#
# Each add_c2 call adds the product currently held in ($t_2:$t_1) twice to
# the accumulator named by its 3rd..5th arguments and starts the
# multiplication of its last two arguments; the trailing comment on every
# call names that forward multiplication, not the product being added.
# The 6th argument ($warm) is 0 on the first call after a result word has
# been stored and 1 on the remaining calls before the next store.
#
# The heredocs in between add a product only once (these follow the
# forward multiplications of the form a[i]*a[i]), store finished result
# words with $ST and start the next multiplication with $MULTU.
2194 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
2195 | $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); | |
2196 | $code.=<<___; | |
da4d239d AP |
2197 | $ADDU $c_3,$t_1 |
2198 | sltu $at,$c_3,$t_1 | |
947716c1 | 2199 | $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); |
da4d239d AP |
2200 | $ADDU $t_2,$at |
2201 | $ADDU $c_1,$t_2 | |
2202 | sltu $at,$c_1,$t_2 | |
2203 | $ADDU $c_2,$at | |
2204 | $ST $c_3,2*$BNSZ($a0) | |
947716c1 AP |
2205 | mflo ($t_1,$a_0,$a_3) |
2206 | mfhi ($t_2,$a_0,$a_3) | |
a7a44ba5 AP |
2207 | ___ |
2208 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, | |
2209 | $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); | |
2210 | &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, | |
2211 | $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); | |
2212 | $code.=<<___; | |
da4d239d | 2213 | $ST $c_1,3*$BNSZ($a0) |
a7a44ba5 AP |
2214 | ___ |
2215 | &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, | |
2216 | $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); | |
2217 | $code.=<<___; | |
da4d239d AP |
2218 | $ADDU $c_2,$t_1 |
2219 | sltu $at,$c_2,$t_1 | |
947716c1 | 2220 | $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2); |
da4d239d AP |
2221 | $ADDU $t_2,$at |
2222 | $ADDU $c_3,$t_2 | |
2223 | sltu $at,$c_3,$t_2 | |
2224 | $ADDU $c_1,$at | |
2225 | $ST $c_2,4*$BNSZ($a0) | |
947716c1 AP |
2226 | mflo ($t_1,$a_2,$a_3) |
2227 | mfhi ($t_2,$a_2,$a_3) | |
a7a44ba5 AP |
2228 | ___ |
2229 | &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, | |
2230 | $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); | |
# bn_sqr_comba4, end of the body: store the word at 5*$BNSZ($a0), add the
# last product in ($t_2:$t_1) once into ($c_2:$c_1), store the words at
# 6*$BNSZ and 7*$BNSZ, then switch the assembler to ".set noreorder".
2231 | $code.=<<___; | |
da4d239d AP |
2232 | $ST $c_3,5*$BNSZ($a0) |
2233 | |
da4d239d AP |
2234 | $ADDU $c_1,$t_1 |
2235 | sltu $at,$c_1,$t_1 | |
2236 | $ADDU $t_2,$at | |
2237 | $ADDU $c_2,$t_2 | |
2238 | $ST $c_1,6*$BNSZ($a0) | |
2239 | $ST $c_2,7*$BNSZ($a0) | |
2240 | |
2241 | .set noreorder | |
2242 | ___ | |
# bn_sqr_comba4 epilogue, emitted only when $flavour matches /nubi/i:
# reload $t3..$t0 and $gp from the frame and release its 6*$SZREG bytes.
# Note that no reload of $ra is emitted here.
2243 | $code.=<<___ if ($flavour =~ /nubi/i); | |
2244 | $REG_L $t3,4*$SZREG($sp) | |
2245 | $REG_L $t2,3*$SZREG($sp) | |
2246 | $REG_L $t1,2*$SZREG($sp) | |
2247 | $REG_L $t0,1*$SZREG($sp) | |
2248 | $REG_L $gp,0*$SZREG($sp) | |
2249 | $PTR_ADD $sp,6*$SZREG | |
2250 | ___ | |
# Return from bn_sqr_comba4 ("jr $ra" followed by "nop") and close the
# function with ".end".
2251 | $code.=<<___; | |
2252 | jr $ra | |
2253 | nop | |
2254 | .end bn_sqr_comba4 | |
2255 | ___ | |
# Write the accumulated assembly source to STDOUT.  STDOUT is buffered, so
# a failed write (full disk, closed pipe) may only be reported by close();
# check it so that a truncated output file is not mistaken for success.
print $code;
close STDOUT or die "error closing STDOUT: $!";