]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/mips.pl
Update copyright year
[thirdparty/openssl.git] / crypto / bn / asm / mips.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
33388b44 2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
367ace68 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
da4d239d
AP
9#
10# ====================================================================
e3713c36 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
da4d239d
AP
12# project.
13#
14# Rights for redistribution and usage in source and binary forms are
389c09fa 15# granted according to the License. Warranty of any kind is disclaimed.
da4d239d
AP
16# ====================================================================
17
18
19# July 1999
20#
21# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
22#
23# The module is designed to work with either of the "new" MIPS ABI(5),
60250017 24# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
da4d239d
AP
25# IRIX 5.x not only because it doesn't support new ABIs but also
26# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
27# 64-bit instructions (daddu, dmultu, etc.) found below would only
28# cause an illegal instruction exception:-(
29#
30# In addition the code depends on preprocessor flags set up by MIPSpro
31# compiler driver (either as or cc) and therefore (probably?) can't be
32# compiled by the GNU assembler. GNU C driver manages fine though...
33# I mean as long as -mmips-as is specified or is the default option,
34# because then it simply invokes /usr/bin/as which in turn takes
35# perfect care of the preprocessor definitions. Another neat feature
36# offered by the MIPSpro assembler is an optimization pass. This gave
37# me the opportunity to have the code looking more regular as all those
38# architecture dependent instruction rescheduling details were left to
39# the assembler. Cool, huh?
40#
41# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
42# goes way over 3 times faster!
43#
e3713c36 44# <appro@openssl.org>
da4d239d
AP
45
46# October 2010
47#
48# Adapt the module even for 32-bit ABIs and other OSes. The former was
49# achieved by mechanical replacement of 64-bit arithmetic instructions
50# such as dmultu, daddu, etc. with their 32-bit counterparts and
51# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
52# >3x performance improvement naturally does not apply to 32-bit code
53# [because there is no instruction 32-bit compiler can't use], one
54# has to be content with a 40-85% improvement depending on benchmark and
55# key length, more for longer keys.
56
1aa89a7a
RL
# Command-line handling.  pop and shift are used without an explicit array at
# file scope, so they consume entries of @ARGV.
57# $output is the last argument if it looks like a file (it has an extension)
58# $flavour is the first argument if it doesn't look like a file
# $output stays undef when the last argument does not end in ".<word chars>";
# $flavour falls back to "o32" when there is no first argument or when that
# argument contains a dot.
59$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
60$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
da4d239d
AP
61
# Pick the instruction mnemonics and sizes that are interpolated into the
# assembly text below.  Any flavour matching /64|n32/ (case-insensitive) gets
# the 64-bit "d"-prefixed instructions; every other flavour gets the 32-bit
# ones.
#   $LD/$ST          load/store of one bignum word
#   $MULTU/$DIVU     unsigned multiply/divide
#   $ADDU/$SUBU      word add/subtract
#   $SRL/$SLL        logical shifts
#   $BNSZ            byte size of one bignum word (used to scale offsets)
#   $PTR_ADD/$PTR_SUB  pointer and stack pointer arithmetic
#   $SZREG, $REG_S/$REG_L  slot size and store/load used for register saves
62if ($flavour =~ /64|n32/i) {
63 $LD="ld";
64 $ST="sd";
65 $MULTU="dmultu";
66 $DIVU="ddivu";
67 $ADDU="daddu";
68 $SUBU="dsubu";
69 $SRL="dsrl";
70 $SLL="dsll";
71 $BNSZ=8;
72 $PTR_ADD="daddu";
73 $PTR_SUB="dsubu";
74 $SZREG=8;
75 $REG_S="sd";
76 $REG_L="ld";
77} else {
78 $LD="lw";
79 $ST="sw";
80 $MULTU="multu";
81 $DIVU="divu";
82 $ADDU="addu";
83 $SUBU="subu";
84 $SRL="srl";
85 $SLL="sll";
86 $BNSZ=4;
87 $PTR_ADD="addu";
88 $PTR_SUB="subu";
89 $SZREG=4;
90 $REG_S="sw";
91 $REG_L="lw";
# 32-bit flavours only: the generated text starts with ".set mips2", wrapped
# in a preprocessor guard that drops it when __mips_isa_rev is 6 or higher.
# Note that $code is first assigned here; the 64-bit branch leaves it unset
# until the first ".=" below.
1b9c5f2e 92 $code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
da4d239d
AP
93}
94
1aa89a7a
RL
# Send the generated assembly to $output when one was given; otherwise keep
# writing to the inherited STDOUT.  The three-argument form of open keeps
# characters in the file name from being read as part of the mode, and a
# failed open is fatal so that a build cannot silently continue without its
# output file.
if ($output) {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
96
da4d239d
AP
# Each variable below holds the textual register name ("$" followed by the
# register number) that gets interpolated into the assembly heredocs.
97# Below is N32/64 register layout used in the original module.
98#
99($zero,$at,$v0,$v1)=map("\$$_",(0..3));
100($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
101($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
102($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
103($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
# $ta0..$ta3 are simply second names for $a4..$a7 (registers 8..11).
104($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
105#
106# No special adaptation is required for O32. NUBI on the other hand
107# is treated by saving/restoring ($v1,$t0..$t3).
108
# Under NUBI the name $gp is re-pointed at $v1 (register 3), so the "$gp"
# slot in the NUBI prologues/epilogues below actually saves and restores
# register 3.
109$gp=$v1 if ($flavour =~ /nubi/i);
110
# $minus4 also lives in $v1; the word-loop routines load it with -4 and AND
# it with the word count to test for at least four remaining words.
111$minus4=$v1;
112
# Common preamble of the generated file.  It includes mips_arch.h and defines
# preprocessor wrappers so the division code further down is written once:
#   - MIPS64R6 / MIPS32R6: the two-operand divide macro (ddivu or divu)
#     expands to nothing, and mfqt / mfrm expand to the three-operand
#     divide / modulo instructions that produce quotient / remainder directly.
#   - otherwise: the divide macro issues the divide with $zero as destination
#     and mfqt / mfrm fetch the results with mflo / mfhi.
# The parenthesised operand lists used with $MULTU, mflo and mfhi in the code
# below are presumably handled by macros from mips_arch.h - confirm there.
# The preamble then emits two identification strings into .rdata, switches to
# .text and sets noat, since the code below uses $at explicitly.
113$code.=<<___;
947716c1
AP
114#include "mips_arch.h"
115
116#if defined(_MIPS_ARCH_MIPS64R6)
117# define ddivu(rs,rt)
118# define mfqt(rd,rs,rt) ddivu rd,rs,rt
119# define mfrm(rd,rs,rt) dmodu rd,rs,rt
120#elif defined(_MIPS_ARCH_MIPS32R6)
121# define divu(rs,rt)
122# define mfqt(rd,rs,rt) divu rd,rs,rt
123# define mfrm(rd,rs,rt) modu rd,rs,rt
124#else
125# define $DIVU(rs,rt) $DIVU $zero,rs,rt
126# define mfqt(rd,rs,rt) mflo rd
127# define mfrm(rd,rs,rt) mfhi rd
128#endif
129
da4d239d
AP
130.rdata
131.asciiz "mips3.s, Version 1.2"
132.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
133
134.text
135.set noat
136
137.align 5
138.globl bn_mul_add_words
139.ent bn_mul_add_words
140bn_mul_add_words:
141 .set noreorder
142 bgtz $a2,bn_mul_add_words_internal
143 move $v0,$zero
144 jr $ra
145 move $a0,$v0
146.end bn_mul_add_words
147
148.align 5
149.ent bn_mul_add_words_internal
150bn_mul_add_words_internal:
151___
# bn_mul_add_words: multiply-accumulate over an array of words.
#   $a0  word array that is read, updated and stored back in place
#   $a1  word array that is only read
#   $a2  number of words to process
#   $a3  multiplier word
#   $v0  running carry and return value
# (presumably rp, ap, num and w of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns 0 unless $a2 is greater than zero.
# Its "move $v0,$zero" sits in the branch delay slot, so the internal body
# always starts with a zero carry.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
152$code.=<<___ if ($flavour =~ /nubi/i);
153 .frame $sp,6*$SZREG,$ra
154 .mask 0x8000f008,-$SZREG
155 .set noreorder
156 $PTR_SUB $sp,6*$SZREG
157 $REG_S $ra,5*$SZREG($sp)
158 $REG_S $t3,4*$SZREG($sp)
159 $REG_S $t2,3*$SZREG($sp)
160 $REG_S $t1,2*$SZREG($sp)
161 $REG_S $t0,1*$SZREG($sp)
162 $REG_S $gp,0*$SZREG($sp)
163___
# Main body.  $minus4 is loaded with -4 so that "count AND -4" is non-zero
# exactly when at least four words remain.  The loop handles four words per
# pass, with the multiply for the next word issued before the current one is
# finished.  For every word the incoming carry and the low product half are
# added to the destination word, and the high half plus the two carry-outs
# (the sltu results) form the next carry.  The tail repeats the same step for
# the remaining one to three words.
164$code.=<<___;
165 .set reorder
166 li $minus4,-4
167 and $ta0,$a2,$minus4
da4d239d
AP
168 beqz $ta0,.L_bn_mul_add_words_tail
169
170.L_bn_mul_add_words_loop:
0c2adb0a 171 $LD $t0,0($a1)
947716c1 172 $MULTU ($t0,$a3)
da4d239d
AP
173 $LD $t1,0($a0)
174 $LD $t2,$BNSZ($a1)
175 $LD $t3,$BNSZ($a0)
176 $LD $ta0,2*$BNSZ($a1)
177 $LD $ta1,2*$BNSZ($a0)
178 $ADDU $t1,$v0
179 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
180 # values", but it seems to work fine
181 # even on 64-bit registers.
947716c1
AP
182 mflo ($at,$t0,$a3)
183 mfhi ($t0,$t0,$a3)
da4d239d
AP
184 $ADDU $t1,$at
185 $ADDU $v0,$t0
947716c1 186 $MULTU ($t2,$a3)
da4d239d
AP
187 sltu $at,$t1,$at
188 $ST $t1,0($a0)
189 $ADDU $v0,$at
190
191 $LD $ta2,3*$BNSZ($a1)
192 $LD $ta3,3*$BNSZ($a0)
193 $ADDU $t3,$v0
194 sltu $v0,$t3,$v0
947716c1
AP
195 mflo ($at,$t2,$a3)
196 mfhi ($t2,$t2,$a3)
da4d239d
AP
197 $ADDU $t3,$at
198 $ADDU $v0,$t2
947716c1 199 $MULTU ($ta0,$a3)
da4d239d
AP
200 sltu $at,$t3,$at
201 $ST $t3,$BNSZ($a0)
202 $ADDU $v0,$at
203
204 subu $a2,4
205 $PTR_ADD $a0,4*$BNSZ
206 $PTR_ADD $a1,4*$BNSZ
207 $ADDU $ta1,$v0
208 sltu $v0,$ta1,$v0
947716c1
AP
209 mflo ($at,$ta0,$a3)
210 mfhi ($ta0,$ta0,$a3)
da4d239d
AP
211 $ADDU $ta1,$at
212 $ADDU $v0,$ta0
947716c1 213 $MULTU ($ta2,$a3)
da4d239d
AP
214 sltu $at,$ta1,$at
215 $ST $ta1,-2*$BNSZ($a0)
216 $ADDU $v0,$at
217
218
219 and $ta0,$a2,$minus4
220 $ADDU $ta3,$v0
221 sltu $v0,$ta3,$v0
947716c1
AP
222 mflo ($at,$ta2,$a3)
223 mfhi ($ta2,$ta2,$a3)
da4d239d
AP
224 $ADDU $ta3,$at
225 $ADDU $v0,$ta2
226 sltu $at,$ta3,$at
227 $ST $ta3,-$BNSZ($a0)
da4d239d 228 .set noreorder
0c2adb0a
AP
229 bgtz $ta0,.L_bn_mul_add_words_loop
230 $ADDU $v0,$at
da4d239d
AP
231
232 beqz $a2,.L_bn_mul_add_words_return
233 nop
234
235.L_bn_mul_add_words_tail:
236 .set reorder
237 $LD $t0,0($a1)
947716c1 238 $MULTU ($t0,$a3)
da4d239d
AP
239 $LD $t1,0($a0)
240 subu $a2,1
241 $ADDU $t1,$v0
242 sltu $v0,$t1,$v0
947716c1
AP
243 mflo ($at,$t0,$a3)
244 mfhi ($t0,$t0,$a3)
da4d239d
AP
245 $ADDU $t1,$at
246 $ADDU $v0,$t0
247 sltu $at,$t1,$at
248 $ST $t1,0($a0)
249 $ADDU $v0,$at
250 beqz $a2,.L_bn_mul_add_words_return
251
252 $LD $t0,$BNSZ($a1)
947716c1 253 $MULTU ($t0,$a3)
da4d239d
AP
254 $LD $t1,$BNSZ($a0)
255 subu $a2,1
256 $ADDU $t1,$v0
257 sltu $v0,$t1,$v0
947716c1
AP
258 mflo ($at,$t0,$a3)
259 mfhi ($t0,$t0,$a3)
da4d239d
AP
260 $ADDU $t1,$at
261 $ADDU $v0,$t0
262 sltu $at,$t1,$at
263 $ST $t1,$BNSZ($a0)
264 $ADDU $v0,$at
265 beqz $a2,.L_bn_mul_add_words_return
266
267 $LD $t0,2*$BNSZ($a1)
947716c1 268 $MULTU ($t0,$a3)
da4d239d
AP
269 $LD $t1,2*$BNSZ($a0)
270 $ADDU $t1,$v0
271 sltu $v0,$t1,$v0
947716c1
AP
272 mflo ($at,$t0,$a3)
273 mfhi ($t0,$t0,$a3)
da4d239d
AP
274 $ADDU $t1,$at
275 $ADDU $v0,$t0
276 sltu $at,$t1,$at
277 $ST $t1,2*$BNSZ($a0)
278 $ADDU $v0,$at
279
280.L_bn_mul_add_words_return:
281 .set noreorder
282___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
283$code.=<<___ if ($flavour =~ /nubi/i);
284 $REG_L $t3,4*$SZREG($sp)
285 $REG_L $t2,3*$SZREG($sp)
286 $REG_L $t1,2*$SZREG($sp)
287 $REG_L $t0,1*$SZREG($sp)
288 $REG_L $gp,0*$SZREG($sp)
289 $PTR_ADD $sp,6*$SZREG
290___
# Return the carry in $v0; the jr delay slot also copies it to $a0.
291$code.=<<___;
292 jr $ra
293 move $a0,$v0
66001268 294.end bn_mul_add_words_internal
da4d239d
AP
295
296.align 5
297.globl bn_mul_words
298.ent bn_mul_words
299bn_mul_words:
300 .set noreorder
301 bgtz $a2,bn_mul_words_internal
302 move $v0,$zero
303 jr $ra
304 move $a0,$v0
305.end bn_mul_words
306
307.align 5
308.ent bn_mul_words_internal
309bn_mul_words_internal:
310___
# bn_mul_words: multiply an array of words by one word.
#   $a0  word array that receives the result (only stored to)
#   $a1  word array that is read
#   $a2  number of words to process
#   $a3  multiplier word
#   $v0  running carry and return value
# (presumably rp, ap, num and w of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns 0 unless $a2 is greater than zero and
# zeroes $v0 in the branch delay slot before entering the internal body.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
311$code.=<<___ if ($flavour =~ /nubi/i);
312 .frame $sp,6*$SZREG,$ra
313 .mask 0x8000f008,-$SZREG
314 .set noreorder
315 $PTR_SUB $sp,6*$SZREG
316 $REG_S $ra,5*$SZREG($sp)
317 $REG_S $t3,4*$SZREG($sp)
318 $REG_S $t2,3*$SZREG($sp)
319 $REG_S $t1,2*$SZREG($sp)
320 $REG_S $t0,1*$SZREG($sp)
321 $REG_S $gp,0*$SZREG($sp)
322___
# Main body.  Four words per loop pass while "count AND -4" is non-zero, then
# a tail for the remaining one to three words.  For every word the low
# product half is added to the carry and stored; the next carry is the high
# product half plus the carry-out of that addition (the sltu result).
323$code.=<<___;
324 .set reorder
325 li $minus4,-4
326 and $ta0,$a2,$minus4
da4d239d
AP
327 beqz $ta0,.L_bn_mul_words_tail
328
329.L_bn_mul_words_loop:
0c2adb0a 330 $LD $t0,0($a1)
947716c1 331 $MULTU ($t0,$a3)
da4d239d
AP
332 $LD $t2,$BNSZ($a1)
333 $LD $ta0,2*$BNSZ($a1)
334 $LD $ta2,3*$BNSZ($a1)
947716c1
AP
335 mflo ($at,$t0,$a3)
336 mfhi ($t0,$t0,$a3)
da4d239d
AP
337 $ADDU $v0,$at
338 sltu $t1,$v0,$at
947716c1 339 $MULTU ($t2,$a3)
da4d239d
AP
340 $ST $v0,0($a0)
341 $ADDU $v0,$t1,$t0
342
343 subu $a2,4
344 $PTR_ADD $a0,4*$BNSZ
345 $PTR_ADD $a1,4*$BNSZ
947716c1
AP
346 mflo ($at,$t2,$a3)
347 mfhi ($t2,$t2,$a3)
da4d239d
AP
348 $ADDU $v0,$at
349 sltu $t3,$v0,$at
947716c1 350 $MULTU ($ta0,$a3)
da4d239d
AP
351 $ST $v0,-3*$BNSZ($a0)
352 $ADDU $v0,$t3,$t2
353
947716c1
AP
354 mflo ($at,$ta0,$a3)
355 mfhi ($ta0,$ta0,$a3)
da4d239d
AP
356 $ADDU $v0,$at
357 sltu $ta1,$v0,$at
947716c1 358 $MULTU ($ta2,$a3)
da4d239d
AP
359 $ST $v0,-2*$BNSZ($a0)
360 $ADDU $v0,$ta1,$ta0
361
362 and $ta0,$a2,$minus4
947716c1
AP
363 mflo ($at,$ta2,$a3)
364 mfhi ($ta2,$ta2,$a3)
da4d239d
AP
365 $ADDU $v0,$at
366 sltu $ta3,$v0,$at
367 $ST $v0,-$BNSZ($a0)
da4d239d 368 .set noreorder
0c2adb0a
AP
369 bgtz $ta0,.L_bn_mul_words_loop
370 $ADDU $v0,$ta3,$ta2
da4d239d
AP
371
372 beqz $a2,.L_bn_mul_words_return
373 nop
374
375.L_bn_mul_words_tail:
376 .set reorder
377 $LD $t0,0($a1)
947716c1 378 $MULTU ($t0,$a3)
da4d239d 379 subu $a2,1
947716c1
AP
380 mflo ($at,$t0,$a3)
381 mfhi ($t0,$t0,$a3)
da4d239d
AP
382 $ADDU $v0,$at
383 sltu $t1,$v0,$at
384 $ST $v0,0($a0)
385 $ADDU $v0,$t1,$t0
386 beqz $a2,.L_bn_mul_words_return
387
388 $LD $t0,$BNSZ($a1)
947716c1 389 $MULTU ($t0,$a3)
da4d239d 390 subu $a2,1
947716c1
AP
391 mflo ($at,$t0,$a3)
392 mfhi ($t0,$t0,$a3)
da4d239d
AP
393 $ADDU $v0,$at
394 sltu $t1,$v0,$at
395 $ST $v0,$BNSZ($a0)
396 $ADDU $v0,$t1,$t0
397 beqz $a2,.L_bn_mul_words_return
398
399 $LD $t0,2*$BNSZ($a1)
947716c1
AP
400 $MULTU ($t0,$a3)
401 mflo ($at,$t0,$a3)
402 mfhi ($t0,$t0,$a3)
da4d239d
AP
403 $ADDU $v0,$at
404 sltu $t1,$v0,$at
405 $ST $v0,2*$BNSZ($a0)
406 $ADDU $v0,$t1,$t0
407
408.L_bn_mul_words_return:
409 .set noreorder
410___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
411$code.=<<___ if ($flavour =~ /nubi/i);
412 $REG_L $t3,4*$SZREG($sp)
413 $REG_L $t2,3*$SZREG($sp)
414 $REG_L $t1,2*$SZREG($sp)
415 $REG_L $t0,1*$SZREG($sp)
416 $REG_L $gp,0*$SZREG($sp)
417 $PTR_ADD $sp,6*$SZREG
418___
# Return the carry in $v0; the jr delay slot also copies it to $a0.
419$code.=<<___;
420 jr $ra
421 move $a0,$v0
422.end bn_mul_words_internal
423
424.align 5
425.globl bn_sqr_words
426.ent bn_sqr_words
427bn_sqr_words:
428 .set noreorder
429 bgtz $a2,bn_sqr_words_internal
430 move $v0,$zero
431 jr $ra
432 move $a0,$v0
433.end bn_sqr_words
434
435.align 5
436.ent bn_sqr_words_internal
437bn_sqr_words_internal:
438___
# bn_sqr_words: square each word of an array.
#   $a0  output word array; receives two words per input word
#   $a1  input word array
#   $a2  number of input words
# (presumably rp, ap and num of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns unless $a2 is greater than zero and
# zeroes $v0 in the branch delay slot.  The body never writes $v0, so that
# zero is what the return sequence hands back.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
439$code.=<<___ if ($flavour =~ /nubi/i);
440 .frame $sp,6*$SZREG,$ra
441 .mask 0x8000f008,-$SZREG
442 .set noreorder
443 $PTR_SUB $sp,6*$SZREG
444 $REG_S $ra,5*$SZREG($sp)
445 $REG_S $t3,4*$SZREG($sp)
446 $REG_S $t2,3*$SZREG($sp)
447 $REG_S $t1,2*$SZREG($sp)
448 $REG_S $t0,1*$SZREG($sp)
449 $REG_S $gp,0*$SZREG($sp)
450___
# Main body.  Four input words per loop pass while "count AND -4" is
# non-zero, then a tail for the remaining one to three words.  Each input
# word is multiplied by itself and the low half, then the high half, of the
# product are stored to consecutive output words.  The output pointer
# therefore advances by eight words per pass while the input pointer advances
# by four.
451$code.=<<___;
452 .set reorder
453 li $minus4,-4
454 and $ta0,$a2,$minus4
da4d239d
AP
455 beqz $ta0,.L_bn_sqr_words_tail
456
457.L_bn_sqr_words_loop:
0c2adb0a 458 $LD $t0,0($a1)
947716c1 459 $MULTU ($t0,$t0)
da4d239d
AP
460 $LD $t2,$BNSZ($a1)
461 $LD $ta0,2*$BNSZ($a1)
462 $LD $ta2,3*$BNSZ($a1)
947716c1
AP
463 mflo ($t1,$t0,$t0)
464 mfhi ($t0,$t0,$t0)
da4d239d
AP
465 $ST $t1,0($a0)
466 $ST $t0,$BNSZ($a0)
467
947716c1 468 $MULTU ($t2,$t2)
da4d239d
AP
469 subu $a2,4
470 $PTR_ADD $a0,8*$BNSZ
471 $PTR_ADD $a1,4*$BNSZ
947716c1
AP
472 mflo ($t3,$t2,$t2)
473 mfhi ($t2,$t2,$t2)
da4d239d
AP
474 $ST $t3,-6*$BNSZ($a0)
475 $ST $t2,-5*$BNSZ($a0)
476
947716c1
AP
477 $MULTU ($ta0,$ta0)
478 mflo ($ta1,$ta0,$ta0)
479 mfhi ($ta0,$ta0,$ta0)
da4d239d
AP
480 $ST $ta1,-4*$BNSZ($a0)
481 $ST $ta0,-3*$BNSZ($a0)
482
483
947716c1 484 $MULTU ($ta2,$ta2)
da4d239d 485 and $ta0,$a2,$minus4
947716c1
AP
486 mflo ($ta3,$ta2,$ta2)
487 mfhi ($ta2,$ta2,$ta2)
da4d239d 488 $ST $ta3,-2*$BNSZ($a0)
da4d239d
AP
489
490 .set noreorder
0c2adb0a
AP
491 bgtz $ta0,.L_bn_sqr_words_loop
492 $ST $ta2,-$BNSZ($a0)
da4d239d
AP
493
494 beqz $a2,.L_bn_sqr_words_return
495 nop
496
497.L_bn_sqr_words_tail:
498 .set reorder
499 $LD $t0,0($a1)
947716c1 500 $MULTU ($t0,$t0)
da4d239d 501 subu $a2,1
947716c1
AP
502 mflo ($t1,$t0,$t0)
503 mfhi ($t0,$t0,$t0)
da4d239d
AP
504 $ST $t1,0($a0)
505 $ST $t0,$BNSZ($a0)
506 beqz $a2,.L_bn_sqr_words_return
507
508 $LD $t0,$BNSZ($a1)
947716c1 509 $MULTU ($t0,$t0)
da4d239d 510 subu $a2,1
947716c1
AP
511 mflo ($t1,$t0,$t0)
512 mfhi ($t0,$t0,$t0)
da4d239d
AP
513 $ST $t1,2*$BNSZ($a0)
514 $ST $t0,3*$BNSZ($a0)
515 beqz $a2,.L_bn_sqr_words_return
516
517 $LD $t0,2*$BNSZ($a1)
947716c1
AP
518 $MULTU ($t0,$t0)
519 mflo ($t1,$t0,$t0)
520 mfhi ($t0,$t0,$t0)
da4d239d
AP
521 $ST $t1,4*$BNSZ($a0)
522 $ST $t0,5*$BNSZ($a0)
523
524.L_bn_sqr_words_return:
525 .set noreorder
526___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
527$code.=<<___ if ($flavour =~ /nubi/i);
528 $REG_L $t3,4*$SZREG($sp)
529 $REG_L $t2,3*$SZREG($sp)
530 $REG_L $t1,2*$SZREG($sp)
531 $REG_L $t0,1*$SZREG($sp)
532 $REG_L $gp,0*$SZREG($sp)
533 $PTR_ADD $sp,6*$SZREG
534___
# Return; the jr delay slot copies $v0 to $a0.
535$code.=<<___;
536 jr $ra
537 move $a0,$v0
538
539.end bn_sqr_words_internal
540
541.align 5
542.globl bn_add_words
543.ent bn_add_words
544bn_add_words:
545 .set noreorder
546 bgtz $a3,bn_add_words_internal
547 move $v0,$zero
548 jr $ra
549 move $a0,$v0
550.end bn_add_words
551
552.align 5
553.ent bn_add_words_internal
554bn_add_words_internal:
555___
# bn_add_words: add two word arrays with carry propagation.
#   $a0  word array that receives the sums
#   $a1  first source word array
#   $a2  second source word array
#   $a3  number of words to process
#   $v0  running carry and return value
# (presumably rp, ap, bp and num of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns 0 unless $a3 is greater than zero and
# zeroes $v0 in the branch delay slot before entering the internal body.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
556$code.=<<___ if ($flavour =~ /nubi/i);
557 .frame $sp,6*$SZREG,$ra
558 .mask 0x8000f008,-$SZREG
559 .set noreorder
560 $PTR_SUB $sp,6*$SZREG
561 $REG_S $ra,5*$SZREG($sp)
562 $REG_S $t3,4*$SZREG($sp)
563 $REG_S $t2,3*$SZREG($sp)
564 $REG_S $t1,2*$SZREG($sp)
565 $REG_S $t0,1*$SZREG($sp)
566 $REG_S $gp,0*$SZREG($sp)
567___
# Main body.  Four words per loop pass while "count AND -4" (kept in $at) is
# non-zero, then a tail for the remaining one to three words.  For every word
# the two source words are added first and the carry-out is taken by sltu
# into $t8 or $t9.  The incoming carry is then added, and the new carry is
# the sum of both carry-outs.
568$code.=<<___;
569 .set reorder
570 li $minus4,-4
571 and $at,$a3,$minus4
da4d239d
AP
572 beqz $at,.L_bn_add_words_tail
573
574.L_bn_add_words_loop:
0c2adb0a 575 $LD $t0,0($a1)
da4d239d
AP
576 $LD $ta0,0($a2)
577 subu $a3,4
578 $LD $t1,$BNSZ($a1)
579 and $at,$a3,$minus4
580 $LD $t2,2*$BNSZ($a1)
581 $PTR_ADD $a2,4*$BNSZ
582 $LD $t3,3*$BNSZ($a1)
583 $PTR_ADD $a0,4*$BNSZ
584 $LD $ta1,-3*$BNSZ($a2)
585 $PTR_ADD $a1,4*$BNSZ
586 $LD $ta2,-2*$BNSZ($a2)
587 $LD $ta3,-$BNSZ($a2)
588 $ADDU $ta0,$t0
589 sltu $t8,$ta0,$t0
590 $ADDU $t0,$ta0,$v0
591 sltu $v0,$t0,$ta0
592 $ST $t0,-4*$BNSZ($a0)
593 $ADDU $v0,$t8
594
595 $ADDU $ta1,$t1
596 sltu $t9,$ta1,$t1
597 $ADDU $t1,$ta1,$v0
598 sltu $v0,$t1,$ta1
599 $ST $t1,-3*$BNSZ($a0)
600 $ADDU $v0,$t9
601
602 $ADDU $ta2,$t2
603 sltu $t8,$ta2,$t2
604 $ADDU $t2,$ta2,$v0
605 sltu $v0,$t2,$ta2
606 $ST $t2,-2*$BNSZ($a0)
607 $ADDU $v0,$t8
609b0852 608
da4d239d
AP
609 $ADDU $ta3,$t3
610 sltu $t9,$ta3,$t3
611 $ADDU $t3,$ta3,$v0
612 sltu $v0,$t3,$ta3
613 $ST $t3,-$BNSZ($a0)
609b0852 614
da4d239d 615 .set noreorder
0c2adb0a
AP
616 bgtz $at,.L_bn_add_words_loop
617 $ADDU $v0,$t9
da4d239d
AP
618
619 beqz $a3,.L_bn_add_words_return
620 nop
621
622.L_bn_add_words_tail:
623 .set reorder
624 $LD $t0,0($a1)
625 $LD $ta0,0($a2)
626 $ADDU $ta0,$t0
627 subu $a3,1
628 sltu $t8,$ta0,$t0
629 $ADDU $t0,$ta0,$v0
630 sltu $v0,$t0,$ta0
631 $ST $t0,0($a0)
632 $ADDU $v0,$t8
633 beqz $a3,.L_bn_add_words_return
634
635 $LD $t1,$BNSZ($a1)
636 $LD $ta1,$BNSZ($a2)
637 $ADDU $ta1,$t1
638 subu $a3,1
639 sltu $t9,$ta1,$t1
640 $ADDU $t1,$ta1,$v0
641 sltu $v0,$t1,$ta1
642 $ST $t1,$BNSZ($a0)
643 $ADDU $v0,$t9
644 beqz $a3,.L_bn_add_words_return
645
646 $LD $t2,2*$BNSZ($a1)
647 $LD $ta2,2*$BNSZ($a2)
648 $ADDU $ta2,$t2
649 sltu $t8,$ta2,$t2
650 $ADDU $t2,$ta2,$v0
651 sltu $v0,$t2,$ta2
652 $ST $t2,2*$BNSZ($a0)
653 $ADDU $v0,$t8
654
655.L_bn_add_words_return:
656 .set noreorder
657___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
658$code.=<<___ if ($flavour =~ /nubi/i);
659 $REG_L $t3,4*$SZREG($sp)
660 $REG_L $t2,3*$SZREG($sp)
661 $REG_L $t1,2*$SZREG($sp)
662 $REG_L $t0,1*$SZREG($sp)
663 $REG_L $gp,0*$SZREG($sp)
664 $PTR_ADD $sp,6*$SZREG
665___
# Return the carry in $v0; the jr delay slot also copies it to $a0.
666$code.=<<___;
667 jr $ra
668 move $a0,$v0
669
670.end bn_add_words_internal
671
672.align 5
673.globl bn_sub_words
674.ent bn_sub_words
675bn_sub_words:
676 .set noreorder
677 bgtz $a3,bn_sub_words_internal
678 move $v0,$zero
679 jr $ra
680 move $a0,$zero
681.end bn_sub_words
682
683.align 5
684.ent bn_sub_words_internal
685bn_sub_words_internal:
686___
# bn_sub_words: subtract one word array from another with borrow propagation.
#   $a0  word array that receives the differences
#   $a1  minuend word array
#   $a2  subtrahend word array
#   $a3  number of words to process
#   $v0  running borrow and return value
# (presumably rp, ap, bp and num of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns 0 unless $a3 is greater than zero and
# zeroes $v0 in the branch delay slot.  Its early return sets $a0 from $zero
# rather than from $v0; both are zero at that point.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
687$code.=<<___ if ($flavour =~ /nubi/i);
688 .frame $sp,6*$SZREG,$ra
689 .mask 0x8000f008,-$SZREG
690 .set noreorder
691 $PTR_SUB $sp,6*$SZREG
692 $REG_S $ra,5*$SZREG($sp)
693 $REG_S $t3,4*$SZREG($sp)
694 $REG_S $t2,3*$SZREG($sp)
695 $REG_S $t1,2*$SZREG($sp)
696 $REG_S $t0,1*$SZREG($sp)
697 $REG_S $gp,0*$SZREG($sp)
698___
# Main body.  Four words per loop pass while "count AND -4" (kept in $at) is
# non-zero, then a tail for the remaining one to three words.  For every word
# sltu first records in $t8 or $t9 whether the minuend word is below the
# subtrahend word.  The words are subtracted, then the incoming borrow is
# subtracted.  The new borrow is that first flag plus the sgtu result, which
# is set when subtracting the borrow wrapped around.
699$code.=<<___;
700 .set reorder
701 li $minus4,-4
702 and $at,$a3,$minus4
da4d239d
AP
703 beqz $at,.L_bn_sub_words_tail
704
705.L_bn_sub_words_loop:
0c2adb0a 706 $LD $t0,0($a1)
da4d239d
AP
707 $LD $ta0,0($a2)
708 subu $a3,4
709 $LD $t1,$BNSZ($a1)
710 and $at,$a3,$minus4
711 $LD $t2,2*$BNSZ($a1)
712 $PTR_ADD $a2,4*$BNSZ
713 $LD $t3,3*$BNSZ($a1)
714 $PTR_ADD $a0,4*$BNSZ
715 $LD $ta1,-3*$BNSZ($a2)
716 $PTR_ADD $a1,4*$BNSZ
717 $LD $ta2,-2*$BNSZ($a2)
718 $LD $ta3,-$BNSZ($a2)
719 sltu $t8,$t0,$ta0
720 $SUBU $ta0,$t0,$ta0
721 $SUBU $t0,$ta0,$v0
722 sgtu $v0,$t0,$ta0
723 $ST $t0,-4*$BNSZ($a0)
724 $ADDU $v0,$t8
725
726 sltu $t9,$t1,$ta1
727 $SUBU $ta1,$t1,$ta1
728 $SUBU $t1,$ta1,$v0
729 sgtu $v0,$t1,$ta1
730 $ST $t1,-3*$BNSZ($a0)
731 $ADDU $v0,$t9
732
733
734 sltu $t8,$t2,$ta2
735 $SUBU $ta2,$t2,$ta2
736 $SUBU $t2,$ta2,$v0
737 sgtu $v0,$t2,$ta2
738 $ST $t2,-2*$BNSZ($a0)
739 $ADDU $v0,$t8
740
741 sltu $t9,$t3,$ta3
742 $SUBU $ta3,$t3,$ta3
743 $SUBU $t3,$ta3,$v0
744 sgtu $v0,$t3,$ta3
745 $ST $t3,-$BNSZ($a0)
da4d239d
AP
746
747 .set noreorder
0c2adb0a
AP
748 bgtz $at,.L_bn_sub_words_loop
749 $ADDU $v0,$t9
da4d239d
AP
750
751 beqz $a3,.L_bn_sub_words_return
752 nop
753
754.L_bn_sub_words_tail:
755 .set reorder
756 $LD $t0,0($a1)
757 $LD $ta0,0($a2)
758 subu $a3,1
759 sltu $t8,$t0,$ta0
760 $SUBU $ta0,$t0,$ta0
761 $SUBU $t0,$ta0,$v0
762 sgtu $v0,$t0,$ta0
763 $ST $t0,0($a0)
764 $ADDU $v0,$t8
765 beqz $a3,.L_bn_sub_words_return
766
767 $LD $t1,$BNSZ($a1)
768 subu $a3,1
769 $LD $ta1,$BNSZ($a2)
770 sltu $t9,$t1,$ta1
771 $SUBU $ta1,$t1,$ta1
772 $SUBU $t1,$ta1,$v0
773 sgtu $v0,$t1,$ta1
774 $ST $t1,$BNSZ($a0)
775 $ADDU $v0,$t9
776 beqz $a3,.L_bn_sub_words_return
777
778 $LD $t2,2*$BNSZ($a1)
779 $LD $ta2,2*$BNSZ($a2)
780 sltu $t8,$t2,$ta2
781 $SUBU $ta2,$t2,$ta2
782 $SUBU $t2,$ta2,$v0
783 sgtu $v0,$t2,$ta2
784 $ST $t2,2*$BNSZ($a0)
785 $ADDU $v0,$t8
786
787.L_bn_sub_words_return:
788 .set noreorder
789___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
790$code.=<<___ if ($flavour =~ /nubi/i);
791 $REG_L $t3,4*$SZREG($sp)
792 $REG_L $t2,3*$SZREG($sp)
793 $REG_L $t1,2*$SZREG($sp)
794 $REG_L $t0,1*$SZREG($sp)
795 $REG_L $gp,0*$SZREG($sp)
796 $PTR_ADD $sp,6*$SZREG
797___
# Return the borrow in $v0; the jr delay slot also copies it to $a0.
798$code.=<<___;
799 jr $ra
800 move $a0,$v0
66001268 801.end bn_sub_words_internal
da4d239d 802
b34446cc
AP
803#if 0
804/*
805 * The bn_div_3_words entry point is re-used for constant-time interface.
c2969ff6 806 * Implementation is retained as historical reference.
b34446cc 807 */
da4d239d
AP
808.align 5
809.globl bn_div_3_words
810.ent bn_div_3_words
811bn_div_3_words:
812 .set noreorder
813 move $a3,$a0 # we know that bn_div_words does not
814 # touch $a3, $ta2, $ta3 and preserves $a2
815 # so that we can save two arguments
816 # and return address in registers
817 # instead of stack:-)
609b0852 818
da4d239d
AP
819 $LD $a0,($a3)
820 move $ta2,$a1
821 bne $a0,$a2,bn_div_3_words_internal
822 $LD $a1,-$BNSZ($a3)
823 li $v0,-1
824 jr $ra
825 move $a0,$v0
826.end bn_div_3_words
827
828.align 5
829.ent bn_div_3_words_internal
830bn_div_3_words_internal:
831___
# bn_div_3_words sits between "#if 0" and "#endif" in the generated text, so
# the preprocessor drops it; it is still generated, NUBI parts included.
# As emitted above, the stub keeps its first argument (a pointer) in $a3 and
# its second in $ta2.  It loads the word at the pointer into $a0 and, in the
# branch delay slot, the word just below it into $a1.  It returns -1 when the
# first loaded word equals $a2 and otherwise enters the internal body.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
832$code.=<<___ if ($flavour =~ /nubi/i);
833 .frame $sp,6*$SZREG,$ra
834 .mask 0x8000f008,-$SZREG
835 .set noreorder
836 $PTR_SUB $sp,6*$SZREG
837 $REG_S $ra,5*$SZREG($sp)
838 $REG_S $t3,4*$SZREG($sp)
839 $REG_S $t2,3*$SZREG($sp)
840 $REG_S $t1,2*$SZREG($sp)
841 $REG_S $t0,1*$SZREG($sp)
842 $REG_S $gp,0*$SZREG($sp)
843___
# Body: park $ra in $ta3 around the bal to bn_div_words_internal, which
# leaves a quotient estimate in $v0.  Multiply that estimate by the word kept
# in $ta2, then loop, decrementing $v0 and adjusting the product and the
# running remainder in $a1.  The decrement sits in the delay slot of the
# loop-back branch, so the add that follows undoes the extra decrement when
# the loop falls through.
844$code.=<<___;
845 .set reorder
846 move $ta3,$ra
543fd854 847 bal bn_div_words_internal
da4d239d 848 move $ra,$ta3
947716c1 849 $MULTU ($ta2,$v0)
da4d239d
AP
850 $LD $t2,-2*$BNSZ($a3)
851 move $ta0,$zero
947716c1
AP
852 mfhi ($t1,$ta2,$v0)
853 mflo ($t0,$ta2,$v0)
da4d239d
AP
854 sltu $t8,$t1,$a1
855.L_bn_div_3_words_inner_loop:
856 bnez $t8,.L_bn_div_3_words_inner_loop_done
857 sgeu $at,$t2,$t0
858 seq $t9,$t1,$a1
859 and $at,$t9
860 sltu $t3,$t0,$ta2
861 $ADDU $a1,$a2
862 $SUBU $t1,$t3
863 $SUBU $t0,$ta2
864 sltu $t8,$t1,$a1
865 sltu $ta0,$a1,$a2
866 or $t8,$ta0
867 .set noreorder
0c2adb0a 868 beqz $at,.L_bn_div_3_words_inner_loop
da4d239d 869 $SUBU $v0,1
0c2adb0a 870 $ADDU $v0,1
da4d239d
AP
871 .set reorder
872.L_bn_div_3_words_inner_loop_done:
873 .set noreorder
874___
# NUBI only: reload the saved temporaries and release the frame.  $ra was
# stored by the prologue but is not reloaded here.
875$code.=<<___ if ($flavour =~ /nubi/i);
876 $REG_L $t3,4*$SZREG($sp)
877 $REG_L $t2,3*$SZREG($sp)
878 $REG_L $t1,2*$SZREG($sp)
879 $REG_L $t0,1*$SZREG($sp)
880 $REG_L $gp,0*$SZREG($sp)
881 $PTR_ADD $sp,6*$SZREG
882___
# Return the quotient in $v0; the jr delay slot also copies it to $a0.
883$code.=<<___;
884 jr $ra
885 move $a0,$v0
886.end bn_div_3_words_internal
b34446cc 887#endif
da4d239d
AP
888
889.align 5
890.globl bn_div_words
891.ent bn_div_words
892bn_div_words:
893 .set noreorder
894 bnez $a2,bn_div_words_internal
895 li $v0,-1 # I would rather signal div-by-zero
896 # which can be done with 'break 7'
897 jr $ra
898 move $a0,$v0
899.end bn_div_words
900
901.align 5
902.ent bn_div_words_internal
903bn_div_words_internal:
904___
# bn_div_words: divide the double word $a0:$a1 (high:low) by the word $a2.
#   $v0  quotient (also copied to $a0 on return)
#   $v1  remainder (also copied to $a1 on return)
# (presumably h, l and d of the C routine in bn_asm.c - confirm there)
# The public stub emitted above returns -1 when the divisor in $a2 is zero
# and enters the internal body otherwise.
#
# NUBI only: build a six-slot frame and save $ra, $t3..$t0 and $gp.
905$code.=<<___ if ($flavour =~ /nubi/i);
906 .frame $sp,6*$SZREG,$ra
907 .mask 0x8000f008,-$SZREG
908 .set noreorder
909 $PTR_SUB $sp,6*$SZREG
910 $REG_S $ra,5*$SZREG($sp)
911 $REG_S $t3,4*$SZREG($sp)
912 $REG_S $t2,3*$SZREG($sp)
913 $REG_S $t1,2*$SZREG($sp)
914 $REG_S $t0,1*$SZREG($sp)
915 $REG_S $gp,0*$SZREG($sp)
916___
# Normalisation.  If the top bit of the divisor is already set, jump straight
# to the body with a shift count of zero in $t9.  Otherwise shift the divisor
# left one bit at a time until its top bit is set, counting the shifts in $t9
# (the increment sits in the delay slot of the "bgtz ...,.-4" loop branch).
# The dividend is then shifted left by the same amount.  If any of the bits
# that would be shifted out of the high word are set, "break 6" is executed.
# The ".+12" branches skip the single instruction that follows their delay
# slot.
917$code.=<<___;
918 move $v1,$zero
919 bltz $a2,.L_bn_div_words_body
920 move $t9,$v1
921 $SLL $a2,1
922 bgtz $a2,.-4
923 addu $t9,1
924
925 .set reorder
926 negu $t1,$t9
927 li $t2,-1
928 $SLL $t2,$t1
929 and $t2,$a0
930 $SRL $at,$a1,$t1
931 .set noreorder
0c2adb0a
AP
932 beqz $t2,.+12
933 nop
da4d239d
AP
934 break 6 # signal overflow
935 .set reorder
936 $SLL $a0,$t9
937 $SLL $a1,$t9
938 or $a0,$at
939___
# Register names for the division body: $QT is the current half-word quotient
# estimate, $HH the upper half of the running high word, and $DH the upper
# half of the normalised divisor.  $DH lives in $v1, which is reused for the
# remainder at the end.
940$QT=$ta0;
941$HH=$ta1;
942$DH=$v1;
# Division body, run twice, once per quotient half.  The estimate $QT starts
# as the all-ones half word and is replaced by (high word / $DH) unless $HH
# equals $DH.  The inner loop then decrements $QT and subtracts the divisor
# from the product divisor*$QT while that product is too large.  The first
# half is shifted into the upper half of $v0 and the second half is ORed in.
# Finally the remainder is shifted back right by $t9 into $v1 and the divisor
# in $a2 is restored by the same shift.
943$code.=<<___;
944.L_bn_div_words_body:
945 $SRL $DH,$a2,4*$BNSZ # bits
946 sgeu $at,$a0,$a2
947 .set noreorder
0c2adb0a
AP
948 beqz $at,.+12
949 nop
da4d239d
AP
950 $SUBU $a0,$a2
951 .set reorder
952
953 li $QT,-1
954 $SRL $HH,$a0,4*$BNSZ # bits
955 $SRL $QT,4*$BNSZ # q=0xffffffff
956 beq $DH,$HH,.L_bn_div_words_skip_div1
947716c1
AP
957 $DIVU ($a0,$DH)
958 mfqt ($QT,$a0,$DH)
da4d239d 959.L_bn_div_words_skip_div1:
947716c1 960 $MULTU ($a2,$QT)
da4d239d
AP
961 $SLL $t3,$a0,4*$BNSZ # bits
962 $SRL $at,$a1,4*$BNSZ # bits
963 or $t3,$at
947716c1
AP
964 mflo ($t0,$a2,$QT)
965 mfhi ($t1,$a2,$QT)
da4d239d
AP
966.L_bn_div_words_inner_loop1:
967 sltu $t2,$t3,$t0
968 seq $t8,$HH,$t1
969 sltu $at,$HH,$t1
970 and $t2,$t8
971 sltu $v0,$t0,$a2
972 or $at,$t2
973 .set noreorder
974 beqz $at,.L_bn_div_words_inner_loop1_done
975 $SUBU $t1,$v0
976 $SUBU $t0,$a2
977 b .L_bn_div_words_inner_loop1
978 $SUBU $QT,1
979 .set reorder
980.L_bn_div_words_inner_loop1_done:
981
982 $SLL $a1,4*$BNSZ # bits
983 $SUBU $a0,$t3,$t0
984 $SLL $v0,$QT,4*$BNSZ # bits
985
986 li $QT,-1
987 $SRL $HH,$a0,4*$BNSZ # bits
988 $SRL $QT,4*$BNSZ # q=0xffffffff
989 beq $DH,$HH,.L_bn_div_words_skip_div2
947716c1
AP
990 $DIVU ($a0,$DH)
991 mfqt ($QT,$a0,$DH)
da4d239d 992.L_bn_div_words_skip_div2:
947716c1 993 $MULTU ($a2,$QT)
da4d239d
AP
994 $SLL $t3,$a0,4*$BNSZ # bits
995 $SRL $at,$a1,4*$BNSZ # bits
996 or $t3,$at
947716c1
AP
997 mflo ($t0,$a2,$QT)
998 mfhi ($t1,$a2,$QT)
da4d239d
AP
999.L_bn_div_words_inner_loop2:
1000 sltu $t2,$t3,$t0
1001 seq $t8,$HH,$t1
1002 sltu $at,$HH,$t1
1003 and $t2,$t8
1004 sltu $v1,$t0,$a2
1005 or $at,$t2
1006 .set noreorder
1007 beqz $at,.L_bn_div_words_inner_loop2_done
1008 $SUBU $t1,$v1
1009 $SUBU $t0,$a2
1010 b .L_bn_div_words_inner_loop2
1011 $SUBU $QT,1
1012 .set reorder
1013.L_bn_div_words_inner_loop2_done:
1014
1015 $SUBU $a0,$t3,$t0
1016 or $v0,$QT
1017 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1018 $SRL $a2,$t9 # restore $a2
1019
1020 .set noreorder
1021 move $a1,$v1
1022___
# NUBI only: reload the saved temporaries and release the frame.  The "$gp"
# slot is register 3 under NUBI, so this reload overwrites the remainder that
# was just placed in $v1; the copy made in $a1 is not affected.
1023$code.=<<___ if ($flavour =~ /nubi/i);
1024 $REG_L $t3,4*$SZREG($sp)
1025 $REG_L $t2,3*$SZREG($sp)
1026 $REG_L $t1,2*$SZREG($sp)
1027 $REG_L $t0,1*$SZREG($sp)
1028 $REG_L $gp,0*$SZREG($sp)
1029 $PTR_ADD $sp,6*$SZREG
1030___
# Return the quotient in $v0; the jr delay slot also copies it to $a0.
1031$code.=<<___;
1032 jr $ra
1033 move $a0,$v0
1034.end bn_div_words_internal
1035___
# The division-only register names are dropped so later code cannot use them
# by accident.
1036undef $HH; undef $QT; undef $DH;
1037
# Register names for the comba multiplication code that follows: $a_N and
# $b_N hold the loaded input words a[N] and b[N].  Words 0..3 use the
# temporaries $t0..$t3 and $ta0..$ta3.
1038($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1039($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1040
# Words 4..6 use $s0..$s5, which the comba prologue saves on the stack.  The
# last word of each input reuses the register that held the input pointer.
1041($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1042($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1043
# $t_1/$t_2 receive the low/high halves of each product; $c_1..$c_3 are the
# three rotating accumulator words named in the mul_add_c comments.
1044($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1045
1046$code.=<<___;
1047
1048.align 5
1049.globl bn_mul_comba8
1050.ent bn_mul_comba8
1051bn_mul_comba8:
1052 .set noreorder
1053___
1054$code.=<<___ if ($flavour =~ /nubi/i);
1055 .frame $sp,12*$SZREG,$ra
1056 .mask 0x803ff008,-$SZREG
1057 $PTR_SUB $sp,12*$SZREG
1058 $REG_S $ra,11*$SZREG($sp)
1059 $REG_S $s5,10*$SZREG($sp)
1060 $REG_S $s4,9*$SZREG($sp)
1061 $REG_S $s3,8*$SZREG($sp)
1062 $REG_S $s2,7*$SZREG($sp)
1063 $REG_S $s1,6*$SZREG($sp)
1064 $REG_S $s0,5*$SZREG($sp)
1065 $REG_S $t3,4*$SZREG($sp)
1066 $REG_S $t2,3*$SZREG($sp)
1067 $REG_S $t1,2*$SZREG($sp)
1068 $REG_S $t0,1*$SZREG($sp)
1069 $REG_S $gp,0*$SZREG($sp)
1070___
1071$code.=<<___ if ($flavour !~ /nubi/i);
1072 .frame $sp,6*$SZREG,$ra
1073 .mask 0x003f0000,-$SZREG
1074 $PTR_SUB $sp,6*$SZREG
1075 $REG_S $s5,5*$SZREG($sp)
1076 $REG_S $s4,4*$SZREG($sp)
1077 $REG_S $s3,3*$SZREG($sp)
1078 $REG_S $s2,2*$SZREG($sp)
1079 $REG_S $s1,1*$SZREG($sp)
1080 $REG_S $s0,0*$SZREG($sp)
1081___
1082$code.=<<___;
1083
1084 .set reorder
1085 $LD $a_0,0($a1) # If compiled with -mips3 option on
1086 # R5000 box assembler barks on this
1087 # 1ine with "should not have mult/div
1088 # as last instruction in bb (R10K
1089 # bug)" warning. If anybody out there
1090 # has a clue about how to circumvent
1091 # this do send me a note.
1092 # <appro\@fy.chalmers.se>
1093
1094 $LD $b_0,0($a2)
1095 $LD $a_1,$BNSZ($a1)
1096 $LD $a_2,2*$BNSZ($a1)
947716c1 1097 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1098 $LD $a_3,3*$BNSZ($a1)
1099 $LD $b_1,$BNSZ($a2)
1100 $LD $b_2,2*$BNSZ($a2)
1101 $LD $b_3,3*$BNSZ($a2)
947716c1
AP
1102 mflo ($c_1,$a_0,$b_0)
1103 mfhi ($c_2,$a_0,$b_0)
da4d239d
AP
1104
1105 $LD $a_4,4*$BNSZ($a1)
1106 $LD $a_5,5*$BNSZ($a1)
947716c1 1107 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
da4d239d
AP
1108 $LD $a_6,6*$BNSZ($a1)
1109 $LD $a_7,7*$BNSZ($a1)
1110 $LD $b_4,4*$BNSZ($a2)
1111 $LD $b_5,5*$BNSZ($a2)
947716c1
AP
1112 mflo ($t_1,$a_0,$b_1)
1113 mfhi ($t_2,$a_0,$b_1)
da4d239d
AP
1114 $ADDU $c_2,$t_1
1115 sltu $at,$c_2,$t_1
947716c1 1116 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
da4d239d
AP
1117 $ADDU $c_3,$t_2,$at
1118 $LD $b_6,6*$BNSZ($a2)
1119 $LD $b_7,7*$BNSZ($a2)
1120 $ST $c_1,0($a0) # r[0]=c1;
947716c1
AP
1121 mflo ($t_1,$a_1,$b_0)
1122 mfhi ($t_2,$a_1,$b_0)
da4d239d
AP
1123 $ADDU $c_2,$t_1
1124 sltu $at,$c_2,$t_1
947716c1 1125 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
da4d239d
AP
1126 $ADDU $t_2,$at
1127 $ADDU $c_3,$t_2
1128 sltu $c_1,$c_3,$t_2
1129 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1130
947716c1
AP
1131 mflo ($t_1,$a_2,$b_0)
1132 mfhi ($t_2,$a_2,$b_0)
da4d239d
AP
1133 $ADDU $c_3,$t_1
1134 sltu $at,$c_3,$t_1
947716c1 1135 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
da4d239d
AP
1136 $ADDU $t_2,$at
1137 $ADDU $c_1,$t_2
947716c1
AP
1138 mflo ($t_1,$a_1,$b_1)
1139 mfhi ($t_2,$a_1,$b_1)
da4d239d
AP
1140 $ADDU $c_3,$t_1
1141 sltu $at,$c_3,$t_1
947716c1 1142 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
da4d239d
AP
1143 $ADDU $t_2,$at
1144 $ADDU $c_1,$t_2
1145 sltu $c_2,$c_1,$t_2
947716c1
AP
1146 mflo ($t_1,$a_0,$b_2)
1147 mfhi ($t_2,$a_0,$b_2)
da4d239d
AP
1148 $ADDU $c_3,$t_1
1149 sltu $at,$c_3,$t_1
947716c1 1150 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
da4d239d
AP
1151 $ADDU $t_2,$at
1152 $ADDU $c_1,$t_2
1153 sltu $at,$c_1,$t_2
1154 $ADDU $c_2,$at
1155 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1156
947716c1
AP
1157 mflo ($t_1,$a_0,$b_3)
1158 mfhi ($t_2,$a_0,$b_3)
da4d239d
AP
1159 $ADDU $c_1,$t_1
1160 sltu $at,$c_1,$t_1
947716c1 1161 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
da4d239d
AP
1162 $ADDU $t_2,$at
1163 $ADDU $c_2,$t_2
1164 sltu $c_3,$c_2,$t_2
947716c1
AP
1165 mflo ($t_1,$a_1,$b_2)
1166 mfhi ($t_2,$a_1,$b_2)
da4d239d
AP
1167 $ADDU $c_1,$t_1
1168 sltu $at,$c_1,$t_1
947716c1 1169 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
da4d239d
AP
1170 $ADDU $t_2,$at
1171 $ADDU $c_2,$t_2
1172 sltu $at,$c_2,$t_2
1173 $ADDU $c_3,$at
947716c1
AP
1174 mflo ($t_1,$a_2,$b_1)
1175 mfhi ($t_2,$a_2,$b_1)
da4d239d
AP
1176 $ADDU $c_1,$t_1
1177 sltu $at,$c_1,$t_1
947716c1 1178 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
da4d239d
AP
1179 $ADDU $t_2,$at
1180 $ADDU $c_2,$t_2
1181 sltu $at,$c_2,$t_2
1182 $ADDU $c_3,$at
947716c1
AP
1183 mflo ($t_1,$a_3,$b_0)
1184 mfhi ($t_2,$a_3,$b_0)
da4d239d
AP
1185 $ADDU $c_1,$t_1
1186 sltu $at,$c_1,$t_1
947716c1 1187 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
da4d239d
AP
1188 $ADDU $t_2,$at
1189 $ADDU $c_2,$t_2
1190 sltu $at,$c_2,$t_2
1191 $ADDU $c_3,$at
1192 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1193
947716c1
AP
1194 mflo ($t_1,$a_4,$b_0)
1195 mfhi ($t_2,$a_4,$b_0)
da4d239d
AP
1196 $ADDU $c_2,$t_1
1197 sltu $at,$c_2,$t_1
947716c1 1198 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
da4d239d
AP
1199 $ADDU $t_2,$at
1200 $ADDU $c_3,$t_2
1201 sltu $c_1,$c_3,$t_2
947716c1
AP
1202 mflo ($t_1,$a_3,$b_1)
1203 mfhi ($t_2,$a_3,$b_1)
da4d239d
AP
1204 $ADDU $c_2,$t_1
1205 sltu $at,$c_2,$t_1
947716c1 1206 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
da4d239d
AP
1207 $ADDU $t_2,$at
1208 $ADDU $c_3,$t_2
1209 sltu $at,$c_3,$t_2
1210 $ADDU $c_1,$at
947716c1
AP
1211 mflo ($t_1,$a_2,$b_2)
1212 mfhi ($t_2,$a_2,$b_2)
da4d239d
AP
1213 $ADDU $c_2,$t_1
1214 sltu $at,$c_2,$t_1
947716c1 1215 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
da4d239d
AP
1216 $ADDU $t_2,$at
1217 $ADDU $c_3,$t_2
1218 sltu $at,$c_3,$t_2
1219 $ADDU $c_1,$at
947716c1
AP
1220 mflo ($t_1,$a_1,$b_3)
1221 mfhi ($t_2,$a_1,$b_3)
da4d239d
AP
1222 $ADDU $c_2,$t_1
1223 sltu $at,$c_2,$t_1
947716c1 1224 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
da4d239d
AP
1225 $ADDU $t_2,$at
1226 $ADDU $c_3,$t_2
1227 sltu $at,$c_3,$t_2
1228 $ADDU $c_1,$at
947716c1
AP
1229 mflo ($t_1,$a_0,$b_4)
1230 mfhi ($t_2,$a_0,$b_4)
da4d239d
AP
1231 $ADDU $c_2,$t_1
1232 sltu $at,$c_2,$t_1
947716c1 1233 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
da4d239d
AP
1234 $ADDU $t_2,$at
1235 $ADDU $c_3,$t_2
1236 sltu $at,$c_3,$t_2
1237 $ADDU $c_1,$at
1238 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1239
947716c1
AP
1240 mflo ($t_1,$a_0,$b_5)
1241 mfhi ($t_2,$a_0,$b_5)
da4d239d
AP
1242 $ADDU $c_3,$t_1
1243 sltu $at,$c_3,$t_1
947716c1 1244 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
da4d239d
AP
1245 $ADDU $t_2,$at
1246 $ADDU $c_1,$t_2
1247 sltu $c_2,$c_1,$t_2
947716c1
AP
1248 mflo ($t_1,$a_1,$b_4)
1249 mfhi ($t_2,$a_1,$b_4)
da4d239d
AP
1250 $ADDU $c_3,$t_1
1251 sltu $at,$c_3,$t_1
947716c1 1252 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
da4d239d
AP
1253 $ADDU $t_2,$at
1254 $ADDU $c_1,$t_2
1255 sltu $at,$c_1,$t_2
1256 $ADDU $c_2,$at
947716c1
AP
1257 mflo ($t_1,$a_2,$b_3)
1258 mfhi ($t_2,$a_2,$b_3)
da4d239d
AP
1259 $ADDU $c_3,$t_1
1260 sltu $at,$c_3,$t_1
947716c1 1261 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
da4d239d
AP
1262 $ADDU $t_2,$at
1263 $ADDU $c_1,$t_2
1264 sltu $at,$c_1,$t_2
1265 $ADDU $c_2,$at
947716c1
AP
1266 mflo ($t_1,$a_3,$b_2)
1267 mfhi ($t_2,$a_3,$b_2)
da4d239d
AP
1268 $ADDU $c_3,$t_1
1269 sltu $at,$c_3,$t_1
947716c1 1270 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
da4d239d
AP
1271 $ADDU $t_2,$at
1272 $ADDU $c_1,$t_2
1273 sltu $at,$c_1,$t_2
1274 $ADDU $c_2,$at
947716c1
AP
1275 mflo ($t_1,$a_4,$b_1)
1276 mfhi ($t_2,$a_4,$b_1)
da4d239d
AP
1277 $ADDU $c_3,$t_1
1278 sltu $at,$c_3,$t_1
947716c1 1279 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
da4d239d
AP
1280 $ADDU $t_2,$at
1281 $ADDU $c_1,$t_2
1282 sltu $at,$c_1,$t_2
1283 $ADDU $c_2,$at
947716c1
AP
1284 mflo ($t_1,$a_5,$b_0)
1285 mfhi ($t_2,$a_5,$b_0)
da4d239d
AP
1286 $ADDU $c_3,$t_1
1287 sltu $at,$c_3,$t_1
947716c1 1288 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
da4d239d
AP
1289 $ADDU $t_2,$at
1290 $ADDU $c_1,$t_2
1291 sltu $at,$c_1,$t_2
1292 $ADDU $c_2,$at
1293 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1294
947716c1
AP
1295 mflo ($t_1,$a_6,$b_0)
1296 mfhi ($t_2,$a_6,$b_0)
da4d239d
AP
1297 $ADDU $c_1,$t_1
1298 sltu $at,$c_1,$t_1
947716c1 1299 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
da4d239d
AP
1300 $ADDU $t_2,$at
1301 $ADDU $c_2,$t_2
1302 sltu $c_3,$c_2,$t_2
947716c1
AP
1303 mflo ($t_1,$a_5,$b_1)
1304 mfhi ($t_2,$a_5,$b_1)
da4d239d
AP
1305 $ADDU $c_1,$t_1
1306 sltu $at,$c_1,$t_1
947716c1 1307 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
da4d239d
AP
1308 $ADDU $t_2,$at
1309 $ADDU $c_2,$t_2
1310 sltu $at,$c_2,$t_2
1311 $ADDU $c_3,$at
947716c1
AP
1312 mflo ($t_1,$a_4,$b_2)
1313 mfhi ($t_2,$a_4,$b_2)
da4d239d
AP
1314 $ADDU $c_1,$t_1
1315 sltu $at,$c_1,$t_1
947716c1 1316 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
da4d239d
AP
1317 $ADDU $t_2,$at
1318 $ADDU $c_2,$t_2
1319 sltu $at,$c_2,$t_2
1320 $ADDU $c_3,$at
947716c1
AP
1321 mflo ($t_1,$a_3,$b_3)
1322 mfhi ($t_2,$a_3,$b_3)
da4d239d
AP
1323 $ADDU $c_1,$t_1
1324 sltu $at,$c_1,$t_1
947716c1 1325 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
da4d239d
AP
1326 $ADDU $t_2,$at
1327 $ADDU $c_2,$t_2
1328 sltu $at,$c_2,$t_2
1329 $ADDU $c_3,$at
947716c1
AP
1330 mflo ($t_1,$a_2,$b_4)
1331 mfhi ($t_2,$a_2,$b_4)
da4d239d
AP
1332 $ADDU $c_1,$t_1
1333 sltu $at,$c_1,$t_1
947716c1 1334 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
da4d239d
AP
1335 $ADDU $t_2,$at
1336 $ADDU $c_2,$t_2
1337 sltu $at,$c_2,$t_2
1338 $ADDU $c_3,$at
947716c1
AP
1339 mflo ($t_1,$a_1,$b_5)
1340 mfhi ($t_2,$a_1,$b_5)
da4d239d
AP
1341 $ADDU $c_1,$t_1
1342 sltu $at,$c_1,$t_1
947716c1 1343 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
da4d239d
AP
1344 $ADDU $t_2,$at
1345 $ADDU $c_2,$t_2
1346 sltu $at,$c_2,$t_2
1347 $ADDU $c_3,$at
947716c1
AP
1348 mflo ($t_1,$a_0,$b_6)
1349 mfhi ($t_2,$a_0,$b_6)
da4d239d
AP
1350 $ADDU $c_1,$t_1
1351 sltu $at,$c_1,$t_1
947716c1 1352 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
da4d239d
AP
1353 $ADDU $t_2,$at
1354 $ADDU $c_2,$t_2
1355 sltu $at,$c_2,$t_2
1356 $ADDU $c_3,$at
1357 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1358
947716c1
AP
1359 mflo ($t_1,$a_0,$b_7)
1360 mfhi ($t_2,$a_0,$b_7)
da4d239d
AP
1361 $ADDU $c_2,$t_1
1362 sltu $at,$c_2,$t_1
947716c1 1363 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
da4d239d
AP
1364 $ADDU $t_2,$at
1365 $ADDU $c_3,$t_2
1366 sltu $c_1,$c_3,$t_2
947716c1
AP
1367 mflo ($t_1,$a_1,$b_6)
1368 mfhi ($t_2,$a_1,$b_6)
da4d239d
AP
1369 $ADDU $c_2,$t_1
1370 sltu $at,$c_2,$t_1
947716c1 1371 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
da4d239d
AP
1372 $ADDU $t_2,$at
1373 $ADDU $c_3,$t_2
1374 sltu $at,$c_3,$t_2
1375 $ADDU $c_1,$at
947716c1
AP
1376 mflo ($t_1,$a_2,$b_5)
1377 mfhi ($t_2,$a_2,$b_5)
da4d239d
AP
1378 $ADDU $c_2,$t_1
1379 sltu $at,$c_2,$t_1
947716c1 1380 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
da4d239d
AP
1381 $ADDU $t_2,$at
1382 $ADDU $c_3,$t_2
1383 sltu $at,$c_3,$t_2
1384 $ADDU $c_1,$at
947716c1
AP
1385 mflo ($t_1,$a_3,$b_4)
1386 mfhi ($t_2,$a_3,$b_4)
da4d239d
AP
1387 $ADDU $c_2,$t_1
1388 sltu $at,$c_2,$t_1
947716c1 1389 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
da4d239d
AP
1390 $ADDU $t_2,$at
1391 $ADDU $c_3,$t_2
1392 sltu $at,$c_3,$t_2
1393 $ADDU $c_1,$at
947716c1
AP
1394 mflo ($t_1,$a_4,$b_3)
1395 mfhi ($t_2,$a_4,$b_3)
da4d239d
AP
1396 $ADDU $c_2,$t_1
1397 sltu $at,$c_2,$t_1
947716c1 1398 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
da4d239d
AP
1399 $ADDU $t_2,$at
1400 $ADDU $c_3,$t_2
1401 sltu $at,$c_3,$t_2
1402 $ADDU $c_1,$at
947716c1
AP
1403 mflo ($t_1,$a_5,$b_2)
1404 mfhi ($t_2,$a_5,$b_2)
da4d239d
AP
1405 $ADDU $c_2,$t_1
1406 sltu $at,$c_2,$t_1
947716c1 1407 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
da4d239d
AP
1408 $ADDU $t_2,$at
1409 $ADDU $c_3,$t_2
1410 sltu $at,$c_3,$t_2
1411 $ADDU $c_1,$at
947716c1
AP
1412 mflo ($t_1,$a_6,$b_1)
1413 mfhi ($t_2,$a_6,$b_1)
da4d239d
AP
1414 $ADDU $c_2,$t_1
1415 sltu $at,$c_2,$t_1
947716c1 1416 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
da4d239d
AP
1417 $ADDU $t_2,$at
1418 $ADDU $c_3,$t_2
1419 sltu $at,$c_3,$t_2
1420 $ADDU $c_1,$at
947716c1
AP
1421 mflo ($t_1,$a_7,$b_0)
1422 mfhi ($t_2,$a_7,$b_0)
da4d239d
AP
1423 $ADDU $c_2,$t_1
1424 sltu $at,$c_2,$t_1
947716c1 1425 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
da4d239d
AP
1426 $ADDU $t_2,$at
1427 $ADDU $c_3,$t_2
1428 sltu $at,$c_3,$t_2
1429 $ADDU $c_1,$at
1430 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1431
947716c1
AP
1432 mflo ($t_1,$a_7,$b_1)
1433 mfhi ($t_2,$a_7,$b_1)
da4d239d
AP
1434 $ADDU $c_3,$t_1
1435 sltu $at,$c_3,$t_1
947716c1 1436 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
da4d239d
AP
1437 $ADDU $t_2,$at
1438 $ADDU $c_1,$t_2
1439 sltu $c_2,$c_1,$t_2
947716c1
AP
1440 mflo ($t_1,$a_6,$b_2)
1441 mfhi ($t_2,$a_6,$b_2)
da4d239d
AP
1442 $ADDU $c_3,$t_1
1443 sltu $at,$c_3,$t_1
947716c1 1444 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
da4d239d
AP
1445 $ADDU $t_2,$at
1446 $ADDU $c_1,$t_2
1447 sltu $at,$c_1,$t_2
1448 $ADDU $c_2,$at
947716c1
AP
1449 mflo ($t_1,$a_5,$b_3)
1450 mfhi ($t_2,$a_5,$b_3)
da4d239d
AP
1451 $ADDU $c_3,$t_1
1452 sltu $at,$c_3,$t_1
947716c1 1453 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
da4d239d
AP
1454 $ADDU $t_2,$at
1455 $ADDU $c_1,$t_2
1456 sltu $at,$c_1,$t_2
1457 $ADDU $c_2,$at
947716c1
AP
1458 mflo ($t_1,$a_4,$b_4)
1459 mfhi ($t_2,$a_4,$b_4)
da4d239d
AP
1460 $ADDU $c_3,$t_1
1461 sltu $at,$c_3,$t_1
947716c1 1462 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
da4d239d
AP
1463 $ADDU $t_2,$at
1464 $ADDU $c_1,$t_2
1465 sltu $at,$c_1,$t_2
1466 $ADDU $c_2,$at
947716c1
AP
1467 mflo ($t_1,$a_3,$b_5)
1468 mfhi ($t_2,$a_3,$b_5)
da4d239d
AP
1469 $ADDU $c_3,$t_1
1470 sltu $at,$c_3,$t_1
947716c1 1471 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
da4d239d
AP
1472 $ADDU $t_2,$at
1473 $ADDU $c_1,$t_2
1474 sltu $at,$c_1,$t_2
1475 $ADDU $c_2,$at
947716c1
AP
1476 mflo ($t_1,$a_2,$b_6)
1477 mfhi ($t_2,$a_2,$b_6)
da4d239d
AP
1478 $ADDU $c_3,$t_1
1479 sltu $at,$c_3,$t_1
947716c1 1480 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
da4d239d
AP
1481 $ADDU $t_2,$at
1482 $ADDU $c_1,$t_2
1483 sltu $at,$c_1,$t_2
1484 $ADDU $c_2,$at
947716c1
AP
1485 mflo ($t_1,$a_1,$b_7)
1486 mfhi ($t_2,$a_1,$b_7)
da4d239d
AP
1487 $ADDU $c_3,$t_1
1488 sltu $at,$c_3,$t_1
947716c1 1489 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
da4d239d
AP
1490 $ADDU $t_2,$at
1491 $ADDU $c_1,$t_2
1492 sltu $at,$c_1,$t_2
1493 $ADDU $c_2,$at
1494 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1495
947716c1
AP
1496 mflo ($t_1,$a_2,$b_7)
1497 mfhi ($t_2,$a_2,$b_7)
da4d239d
AP
1498 $ADDU $c_1,$t_1
1499 sltu $at,$c_1,$t_1
947716c1 1500 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
da4d239d
AP
1501 $ADDU $t_2,$at
1502 $ADDU $c_2,$t_2
1503 sltu $c_3,$c_2,$t_2
947716c1
AP
1504 mflo ($t_1,$a_3,$b_6)
1505 mfhi ($t_2,$a_3,$b_6)
da4d239d
AP
1506 $ADDU $c_1,$t_1
1507 sltu $at,$c_1,$t_1
947716c1 1508 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
da4d239d
AP
1509 $ADDU $t_2,$at
1510 $ADDU $c_2,$t_2
1511 sltu $at,$c_2,$t_2
1512 $ADDU $c_3,$at
947716c1
AP
1513 mflo ($t_1,$a_4,$b_5)
1514 mfhi ($t_2,$a_4,$b_5)
da4d239d
AP
1515 $ADDU $c_1,$t_1
1516 sltu $at,$c_1,$t_1
947716c1 1517 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
da4d239d
AP
1518 $ADDU $t_2,$at
1519 $ADDU $c_2,$t_2
1520 sltu $at,$c_2,$t_2
1521 $ADDU $c_3,$at
947716c1
AP
1522 mflo ($t_1,$a_5,$b_4)
1523 mfhi ($t_2,$a_5,$b_4)
da4d239d
AP
1524 $ADDU $c_1,$t_1
1525 sltu $at,$c_1,$t_1
947716c1 1526 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
da4d239d
AP
1527 $ADDU $t_2,$at
1528 $ADDU $c_2,$t_2
1529 sltu $at,$c_2,$t_2
1530 $ADDU $c_3,$at
947716c1
AP
1531 mflo ($t_1,$a_6,$b_3)
1532 mfhi ($t_2,$a_6,$b_3)
da4d239d
AP
1533 $ADDU $c_1,$t_1
1534 sltu $at,$c_1,$t_1
947716c1 1535 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
da4d239d
AP
1536 $ADDU $t_2,$at
1537 $ADDU $c_2,$t_2
1538 sltu $at,$c_2,$t_2
1539 $ADDU $c_3,$at
947716c1
AP
1540 mflo ($t_1,$a_7,$b_2)
1541 mfhi ($t_2,$a_7,$b_2)
da4d239d
AP
1542 $ADDU $c_1,$t_1
1543 sltu $at,$c_1,$t_1
947716c1 1544 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
da4d239d
AP
1545 $ADDU $t_2,$at
1546 $ADDU $c_2,$t_2
1547 sltu $at,$c_2,$t_2
1548 $ADDU $c_3,$at
1549 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1550
947716c1
AP
1551 mflo ($t_1,$a_7,$b_3)
1552 mfhi ($t_2,$a_7,$b_3)
da4d239d
AP
1553 $ADDU $c_2,$t_1
1554 sltu $at,$c_2,$t_1
947716c1 1555 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
da4d239d
AP
1556 $ADDU $t_2,$at
1557 $ADDU $c_3,$t_2
1558 sltu $c_1,$c_3,$t_2
947716c1
AP
1559 mflo ($t_1,$a_6,$b_4)
1560 mfhi ($t_2,$a_6,$b_4)
da4d239d
AP
1561 $ADDU $c_2,$t_1
1562 sltu $at,$c_2,$t_1
947716c1 1563 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
da4d239d
AP
1564 $ADDU $t_2,$at
1565 $ADDU $c_3,$t_2
1566 sltu $at,$c_3,$t_2
1567 $ADDU $c_1,$at
947716c1
AP
1568 mflo ($t_1,$a_5,$b_5)
1569 mfhi ($t_2,$a_5,$b_5)
da4d239d
AP
1570 $ADDU $c_2,$t_1
1571 sltu $at,$c_2,$t_1
947716c1 1572 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
da4d239d
AP
1573 $ADDU $t_2,$at
1574 $ADDU $c_3,$t_2
1575 sltu $at,$c_3,$t_2
1576 $ADDU $c_1,$at
947716c1
AP
1577 mflo ($t_1,$a_4,$b_6)
1578 mfhi ($t_2,$a_4,$b_6)
da4d239d
AP
1579 $ADDU $c_2,$t_1
1580 sltu $at,$c_2,$t_1
947716c1 1581 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
da4d239d
AP
1582 $ADDU $t_2,$at
1583 $ADDU $c_3,$t_2
1584 sltu $at,$c_3,$t_2
1585 $ADDU $c_1,$at
947716c1
AP
1586 mflo ($t_1,$a_3,$b_7)
1587 mfhi ($t_2,$a_3,$b_7)
da4d239d
AP
1588 $ADDU $c_2,$t_1
1589 sltu $at,$c_2,$t_1
947716c1 1590 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
da4d239d
AP
1591 $ADDU $t_2,$at
1592 $ADDU $c_3,$t_2
1593 sltu $at,$c_3,$t_2
1594 $ADDU $c_1,$at
1595 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1596
947716c1
AP
1597 mflo ($t_1,$a_4,$b_7)
1598 mfhi ($t_2,$a_4,$b_7)
da4d239d
AP
1599 $ADDU $c_3,$t_1
1600 sltu $at,$c_3,$t_1
947716c1 1601 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
da4d239d
AP
1602 $ADDU $t_2,$at
1603 $ADDU $c_1,$t_2
1604 sltu $c_2,$c_1,$t_2
947716c1
AP
1605 mflo ($t_1,$a_5,$b_6)
1606 mfhi ($t_2,$a_5,$b_6)
da4d239d
AP
1607 $ADDU $c_3,$t_1
1608 sltu $at,$c_3,$t_1
947716c1 1609 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
da4d239d
AP
1610 $ADDU $t_2,$at
1611 $ADDU $c_1,$t_2
1612 sltu $at,$c_1,$t_2
1613 $ADDU $c_2,$at
947716c1
AP
1614 mflo ($t_1,$a_6,$b_5)
1615 mfhi ($t_2,$a_6,$b_5)
da4d239d
AP
1616 $ADDU $c_3,$t_1
1617 sltu $at,$c_3,$t_1
947716c1 1618 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
da4d239d
AP
1619 $ADDU $t_2,$at
1620 $ADDU $c_1,$t_2
1621 sltu $at,$c_1,$t_2
1622 $ADDU $c_2,$at
947716c1
AP
1623 mflo ($t_1,$a_7,$b_4)
1624 mfhi ($t_2,$a_7,$b_4)
da4d239d
AP
1625 $ADDU $c_3,$t_1
1626 sltu $at,$c_3,$t_1
947716c1 1627 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
da4d239d
AP
1628 $ADDU $t_2,$at
1629 $ADDU $c_1,$t_2
1630 sltu $at,$c_1,$t_2
1631 $ADDU $c_2,$at
1632 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1633
947716c1
AP
1634 mflo ($t_1,$a_7,$b_5)
1635 mfhi ($t_2,$a_7,$b_5)
da4d239d
AP
1636 $ADDU $c_1,$t_1
1637 sltu $at,$c_1,$t_1
947716c1 1638 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
da4d239d
AP
1639 $ADDU $t_2,$at
1640 $ADDU $c_2,$t_2
1641 sltu $c_3,$c_2,$t_2
947716c1
AP
1642 mflo ($t_1,$a_6,$b_6)
1643 mfhi ($t_2,$a_6,$b_6)
da4d239d
AP
1644 $ADDU $c_1,$t_1
1645 sltu $at,$c_1,$t_1
947716c1 1646 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
da4d239d
AP
1647 $ADDU $t_2,$at
1648 $ADDU $c_2,$t_2
1649 sltu $at,$c_2,$t_2
1650 $ADDU $c_3,$at
947716c1
AP
1651 mflo ($t_1,$a_5,$b_7)
1652 mfhi ($t_2,$a_5,$b_7)
da4d239d
AP
1653 $ADDU $c_1,$t_1
1654 sltu $at,$c_1,$t_1
947716c1 1655 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
da4d239d
AP
1656 $ADDU $t_2,$at
1657 $ADDU $c_2,$t_2
1658 sltu $at,$c_2,$t_2
1659 $ADDU $c_3,$at
1660 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1661
947716c1
AP
1662 mflo ($t_1,$a_6,$b_7)
1663 mfhi ($t_2,$a_6,$b_7)
da4d239d
AP
1664 $ADDU $c_2,$t_1
1665 sltu $at,$c_2,$t_1
947716c1 1666 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
da4d239d
AP
1667 $ADDU $t_2,$at
1668 $ADDU $c_3,$t_2
1669 sltu $c_1,$c_3,$t_2
947716c1
AP
1670 mflo ($t_1,$a_7,$b_6)
1671 mfhi ($t_2,$a_7,$b_6)
da4d239d
AP
1672 $ADDU $c_2,$t_1
1673 sltu $at,$c_2,$t_1
947716c1 1674 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
da4d239d
AP
1675 $ADDU $t_2,$at
1676 $ADDU $c_3,$t_2
1677 sltu $at,$c_3,$t_2
1678 $ADDU $c_1,$at
1679 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1680
947716c1
AP
1681 mflo ($t_1,$a_7,$b_7)
1682 mfhi ($t_2,$a_7,$b_7)
da4d239d
AP
1683 $ADDU $c_3,$t_1
1684 sltu $at,$c_3,$t_1
1685 $ADDU $t_2,$at
1686 $ADDU $c_1,$t_2
1687 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1688 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1689
1690 .set noreorder
1691___
1692$code.=<<___ if ($flavour =~ /nubi/i);
1693 $REG_L $s5,10*$SZREG($sp)
1694 $REG_L $s4,9*$SZREG($sp)
1695 $REG_L $s3,8*$SZREG($sp)
1696 $REG_L $s2,7*$SZREG($sp)
1697 $REG_L $s1,6*$SZREG($sp)
1698 $REG_L $s0,5*$SZREG($sp)
1699 $REG_L $t3,4*$SZREG($sp)
1700 $REG_L $t2,3*$SZREG($sp)
1701 $REG_L $t1,2*$SZREG($sp)
1702 $REG_L $t0,1*$SZREG($sp)
1703 $REG_L $gp,0*$SZREG($sp)
1704 jr $ra
1705 $PTR_ADD $sp,12*$SZREG
1706___
1707$code.=<<___ if ($flavour !~ /nubi/i);
1708 $REG_L $s5,5*$SZREG($sp)
1709 $REG_L $s4,4*$SZREG($sp)
1710 $REG_L $s3,3*$SZREG($sp)
1711 $REG_L $s2,2*$SZREG($sp)
1712 $REG_L $s1,1*$SZREG($sp)
1713 $REG_L $s0,0*$SZREG($sp)
1714 jr $ra
1715 $PTR_ADD $sp,6*$SZREG
1716___
1717$code.=<<___;
1718.end bn_mul_comba8
1719
1720.align 5
1721.globl bn_mul_comba4
1722.ent bn_mul_comba4
1723bn_mul_comba4:
1724___
1725$code.=<<___ if ($flavour =~ /nubi/i);
1726 .frame $sp,6*$SZREG,$ra
1727 .mask 0x8000f008,-$SZREG
1728 .set noreorder
1729 $PTR_SUB $sp,6*$SZREG
1730 $REG_S $ra,5*$SZREG($sp)
1731 $REG_S $t3,4*$SZREG($sp)
1732 $REG_S $t2,3*$SZREG($sp)
1733 $REG_S $t1,2*$SZREG($sp)
1734 $REG_S $t0,1*$SZREG($sp)
1735 $REG_S $gp,0*$SZREG($sp)
1736___
1737$code.=<<___;
1738 .set reorder
1739 $LD $a_0,0($a1)
1740 $LD $b_0,0($a2)
1741 $LD $a_1,$BNSZ($a1)
1742 $LD $a_2,2*$BNSZ($a1)
947716c1 1743 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1744 $LD $a_3,3*$BNSZ($a1)
1745 $LD $b_1,$BNSZ($a2)
1746 $LD $b_2,2*$BNSZ($a2)
1747 $LD $b_3,3*$BNSZ($a2)
947716c1
AP
1748 mflo ($c_1,$a_0,$b_0)
1749 mfhi ($c_2,$a_0,$b_0)
da4d239d
AP
1750 $ST $c_1,0($a0)
1751
947716c1
AP
1752 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1753 mflo ($t_1,$a_0,$b_1)
1754 mfhi ($t_2,$a_0,$b_1)
da4d239d
AP
1755 $ADDU $c_2,$t_1
1756 sltu $at,$c_2,$t_1
947716c1 1757 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
da4d239d 1758 $ADDU $c_3,$t_2,$at
947716c1
AP
1759 mflo ($t_1,$a_1,$b_0)
1760 mfhi ($t_2,$a_1,$b_0)
da4d239d
AP
1761 $ADDU $c_2,$t_1
1762 sltu $at,$c_2,$t_1
947716c1 1763 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
da4d239d
AP
1764 $ADDU $t_2,$at
1765 $ADDU $c_3,$t_2
1766 sltu $c_1,$c_3,$t_2
1767 $ST $c_2,$BNSZ($a0)
1768
947716c1
AP
1769 mflo ($t_1,$a_2,$b_0)
1770 mfhi ($t_2,$a_2,$b_0)
da4d239d
AP
1771 $ADDU $c_3,$t_1
1772 sltu $at,$c_3,$t_1
947716c1 1773 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
da4d239d
AP
1774 $ADDU $t_2,$at
1775 $ADDU $c_1,$t_2
947716c1
AP
1776 mflo ($t_1,$a_1,$b_1)
1777 mfhi ($t_2,$a_1,$b_1)
da4d239d
AP
1778 $ADDU $c_3,$t_1
1779 sltu $at,$c_3,$t_1
947716c1 1780 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
da4d239d
AP
1781 $ADDU $t_2,$at
1782 $ADDU $c_1,$t_2
1783 sltu $c_2,$c_1,$t_2
947716c1
AP
1784 mflo ($t_1,$a_0,$b_2)
1785 mfhi ($t_2,$a_0,$b_2)
da4d239d
AP
1786 $ADDU $c_3,$t_1
1787 sltu $at,$c_3,$t_1
947716c1 1788 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
da4d239d
AP
1789 $ADDU $t_2,$at
1790 $ADDU $c_1,$t_2
1791 sltu $at,$c_1,$t_2
1792 $ADDU $c_2,$at
1793 $ST $c_3,2*$BNSZ($a0)
1794
947716c1
AP
1795 mflo ($t_1,$a_0,$b_3)
1796 mfhi ($t_2,$a_0,$b_3)
da4d239d
AP
1797 $ADDU $c_1,$t_1
1798 sltu $at,$c_1,$t_1
947716c1 1799 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
da4d239d
AP
1800 $ADDU $t_2,$at
1801 $ADDU $c_2,$t_2
1802 sltu $c_3,$c_2,$t_2
947716c1
AP
1803 mflo ($t_1,$a_1,$b_2)
1804 mfhi ($t_2,$a_1,$b_2)
da4d239d
AP
1805 $ADDU $c_1,$t_1
1806 sltu $at,$c_1,$t_1
947716c1 1807 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
da4d239d
AP
1808 $ADDU $t_2,$at
1809 $ADDU $c_2,$t_2
1810 sltu $at,$c_2,$t_2
1811 $ADDU $c_3,$at
947716c1
AP
1812 mflo ($t_1,$a_2,$b_1)
1813 mfhi ($t_2,$a_2,$b_1)
da4d239d
AP
1814 $ADDU $c_1,$t_1
1815 sltu $at,$c_1,$t_1
947716c1 1816 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
da4d239d
AP
1817 $ADDU $t_2,$at
1818 $ADDU $c_2,$t_2
1819 sltu $at,$c_2,$t_2
1820 $ADDU $c_3,$at
947716c1
AP
1821 mflo ($t_1,$a_3,$b_0)
1822 mfhi ($t_2,$a_3,$b_0)
da4d239d
AP
1823 $ADDU $c_1,$t_1
1824 sltu $at,$c_1,$t_1
947716c1 1825 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
da4d239d
AP
1826 $ADDU $t_2,$at
1827 $ADDU $c_2,$t_2
1828 sltu $at,$c_2,$t_2
1829 $ADDU $c_3,$at
1830 $ST $c_1,3*$BNSZ($a0)
1831
947716c1
AP
1832 mflo ($t_1,$a_3,$b_1)
1833 mfhi ($t_2,$a_3,$b_1)
da4d239d
AP
1834 $ADDU $c_2,$t_1
1835 sltu $at,$c_2,$t_1
947716c1 1836 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
da4d239d
AP
1837 $ADDU $t_2,$at
1838 $ADDU $c_3,$t_2
1839 sltu $c_1,$c_3,$t_2
947716c1
AP
1840 mflo ($t_1,$a_2,$b_2)
1841 mfhi ($t_2,$a_2,$b_2)
da4d239d
AP
1842 $ADDU $c_2,$t_1
1843 sltu $at,$c_2,$t_1
947716c1 1844 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
da4d239d
AP
1845 $ADDU $t_2,$at
1846 $ADDU $c_3,$t_2
1847 sltu $at,$c_3,$t_2
1848 $ADDU $c_1,$at
947716c1
AP
1849 mflo ($t_1,$a_1,$b_3)
1850 mfhi ($t_2,$a_1,$b_3)
da4d239d
AP
1851 $ADDU $c_2,$t_1
1852 sltu $at,$c_2,$t_1
947716c1 1853 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
da4d239d
AP
1854 $ADDU $t_2,$at
1855 $ADDU $c_3,$t_2
1856 sltu $at,$c_3,$t_2
1857 $ADDU $c_1,$at
1858 $ST $c_2,4*$BNSZ($a0)
1859
947716c1
AP
1860 mflo ($t_1,$a_2,$b_3)
1861 mfhi ($t_2,$a_2,$b_3)
da4d239d
AP
1862 $ADDU $c_3,$t_1
1863 sltu $at,$c_3,$t_1
947716c1 1864 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
da4d239d
AP
1865 $ADDU $t_2,$at
1866 $ADDU $c_1,$t_2
1867 sltu $c_2,$c_1,$t_2
947716c1
AP
1868 mflo ($t_1,$a_3,$b_2)
1869 mfhi ($t_2,$a_3,$b_2)
da4d239d
AP
1870 $ADDU $c_3,$t_1
1871 sltu $at,$c_3,$t_1
947716c1 1872 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
da4d239d
AP
1873 $ADDU $t_2,$at
1874 $ADDU $c_1,$t_2
1875 sltu $at,$c_1,$t_2
1876 $ADDU $c_2,$at
1877 $ST $c_3,5*$BNSZ($a0)
1878
947716c1
AP
1879 mflo ($t_1,$a_3,$b_3)
1880 mfhi ($t_2,$a_3,$b_3)
da4d239d
AP
1881 $ADDU $c_1,$t_1
1882 sltu $at,$c_1,$t_1
1883 $ADDU $t_2,$at
1884 $ADDU $c_2,$t_2
1885 $ST $c_1,6*$BNSZ($a0)
1886 $ST $c_2,7*$BNSZ($a0)
1887
1888 .set noreorder
1889___
1890$code.=<<___ if ($flavour =~ /nubi/i);
1891 $REG_L $t3,4*$SZREG($sp)
1892 $REG_L $t2,3*$SZREG($sp)
1893 $REG_L $t1,2*$SZREG($sp)
1894 $REG_L $t0,1*$SZREG($sp)
1895 $REG_L $gp,0*$SZREG($sp)
1896 $PTR_ADD $sp,6*$SZREG
1897___
1898$code.=<<___;
1899 jr $ra
1900 nop
1901.end bn_mul_comba4
1902___
1903
# Rebind $a_4..$a_7 to whatever $b_0..$b_3 currently hold.  Heredocs
# appended to $code before this point were interpolated with the old
# values and are unaffected; the code generated below refers to
# $a_0..$a_7 only and never to $b_N.
# NOTE(review): presumably this lets the squaring code keep a[4..7] in
# the registers the multiply code used for b[0..3] -- confirm against
# the register assignments earlier in the file.
1904($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1905
a7a44ba5
AP
# add_c2 - append to $code the instruction sequence that adds the
# double-word value ($hi,$lo) into the three-word accumulator
# ($c2,$c1,$c0) *twice*.
#
# $lo is added to $c0 two times and $hi is folded into $c1 two times;
# every addition is followed by an sltu that recovers the carry, which
# is then propagated into the next word.  When $warm is false the first
# carry out of $c1 is written to $c2 with sltu (overwriting its previous
# contents); when $warm is true it is added to $c2 instead.
#
# The $MULTU of ($an,$bn) is issued in the middle of the additions and
# the sequence ends with mflo/mfhi into $lo/$hi, so on exit $lo and $hi
# hold the product of $an and $bn rather than their input values.  The
# register named by $at is used as scratch throughout.
#
# NOTE: the sub is declared with an empty prototype "()" although it
# takes eight arguments; every call visible in this file uses the
# &add_c2(...) form, which does not apply prototype checking.
1906sub add_c2 () {
1907my ($hi,$lo,$c0,$c1,$c2,
1908 $warm, # !$warm denotes first call with specific sequence of
1909 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1910 $an,$bn # these two are arguments for multiplication which
1911 # result is used in *next* step [which is why it's
1912 # commented as "forward multiplication" below];
1913 )=@_;
1914$code.=<<___;
a7a44ba5
AP
1915 $ADDU $c0,$lo
1916 sltu $at,$c0,$lo
947716c1 1917 $MULTU ($an,$bn) # forward multiplication
a7a44ba5
AP
1918 $ADDU $c0,$lo
1919 $ADDU $at,$hi
1920 sltu $lo,$c0,$lo
1921 $ADDU $c1,$at
1922 $ADDU $hi,$lo
1923___
1924$code.=<<___ if (!$warm);
1925 sltu $c2,$c1,$at
1926 $ADDU $c1,$hi
a7a44ba5
AP
1927___
1928$code.=<<___ if ($warm);
1929 sltu $at,$c1,$at
1930 $ADDU $c1,$hi
1931 $ADDU $c2,$at
947716c1
AP
1932___
1933$code.=<<___;
a7a44ba5
AP
1934 sltu $hi,$c1,$hi
1935 $ADDU $c2,$hi
947716c1
AP
1936 mflo ($lo,$an,$bn)
1937 mfhi ($hi,$an,$bn)
a7a44ba5
AP
1938___
1939}
1940
da4d239d
AP
1941$code.=<<___;
1942
1943.align 5
1944.globl bn_sqr_comba8
1945.ent bn_sqr_comba8
1946bn_sqr_comba8:
1947___
1948$code.=<<___ if ($flavour =~ /nubi/i);
1949 .frame $sp,6*$SZREG,$ra
1950 .mask 0x8000f008,-$SZREG
1951 .set noreorder
1952 $PTR_SUB $sp,6*$SZREG
1953 $REG_S $ra,5*$SZREG($sp)
1954 $REG_S $t3,4*$SZREG($sp)
1955 $REG_S $t2,3*$SZREG($sp)
1956 $REG_S $t1,2*$SZREG($sp)
1957 $REG_S $t0,1*$SZREG($sp)
1958 $REG_S $gp,0*$SZREG($sp)
1959___
1960$code.=<<___;
1961 .set reorder
1962 $LD $a_0,0($a1)
1963 $LD $a_1,$BNSZ($a1)
1964 $LD $a_2,2*$BNSZ($a1)
1965 $LD $a_3,3*$BNSZ($a1)
1966
947716c1 1967 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1968 $LD $a_4,4*$BNSZ($a1)
1969 $LD $a_5,5*$BNSZ($a1)
1970 $LD $a_6,6*$BNSZ($a1)
1971 $LD $a_7,7*$BNSZ($a1)
947716c1
AP
1972 mflo ($c_1,$a_0,$a_0)
1973 mfhi ($c_2,$a_0,$a_0)
da4d239d
AP
1974 $ST $c_1,0($a0)
1975
947716c1
AP
1976 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1977 mflo ($t_1,$a_0,$a_1)
1978 mfhi ($t_2,$a_0,$a_1)
da4d239d
AP
1979 slt $c_1,$t_2,$zero
1980 $SLL $t_2,1
947716c1 1981 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
da4d239d
AP
1982 slt $a2,$t_1,$zero
1983 $ADDU $t_2,$a2
1984 $SLL $t_1,1
1985 $ADDU $c_2,$t_1
1986 sltu $at,$c_2,$t_1
1987 $ADDU $c_3,$t_2,$at
1988 $ST $c_2,$BNSZ($a0)
947716c1
AP
1989 mflo ($t_1,$a_2,$a_0)
1990 mfhi ($t_2,$a_2,$a_0)
a7a44ba5
AP
1991___
1992 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1993 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1994$code.=<<___;
da4d239d
AP
1995 $ADDU $c_3,$t_1
1996 sltu $at,$c_3,$t_1
947716c1 1997 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
da4d239d
AP
1998 $ADDU $t_2,$at
1999 $ADDU $c_1,$t_2
2000 sltu $at,$c_1,$t_2
2001 $ADDU $c_2,$at
2002 $ST $c_3,2*$BNSZ($a0)
947716c1
AP
2003 mflo ($t_1,$a_0,$a_3)
2004 mfhi ($t_2,$a_0,$a_3)
a7a44ba5
AP
2005___
2006 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2007 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2008 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2009 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2010$code.=<<___;
da4d239d 2011 $ST $c_1,3*$BNSZ($a0)
a7a44ba5
AP
2012___
2013 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2014 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2015 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2016 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2017$code.=<<___;
da4d239d
AP
2018 $ADDU $c_2,$t_1
2019 sltu $at,$c_2,$t_1
947716c1 2020 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
da4d239d
AP
2021 $ADDU $t_2,$at
2022 $ADDU $c_3,$t_2
2023 sltu $at,$c_3,$t_2
2024 $ADDU $c_1,$at
2025 $ST $c_2,4*$BNSZ($a0)
947716c1
AP
2026 mflo ($t_1,$a_0,$a_5)
2027 mfhi ($t_2,$a_0,$a_5)
a7a44ba5
AP
2028___
2029 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2030 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2031 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2032 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2033 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2034 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2035$code.=<<___;
da4d239d 2036 $ST $c_3,5*$BNSZ($a0)
a7a44ba5
AP
2037___
2038 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2039 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2040 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2041 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2042 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2043 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2044$code.=<<___;
da4d239d
AP
2045 $ADDU $c_1,$t_1
2046 sltu $at,$c_1,$t_1
947716c1 2047 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
da4d239d
AP
2048 $ADDU $t_2,$at
2049 $ADDU $c_2,$t_2
2050 sltu $at,$c_2,$t_2
2051 $ADDU $c_3,$at
2052 $ST $c_1,6*$BNSZ($a0)
947716c1
AP
2053 mflo ($t_1,$a_0,$a_7)
2054 mfhi ($t_2,$a_0,$a_7)
a7a44ba5
AP
2055___
2056 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2057 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2058 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2059 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2060 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2061 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2062 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2063 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2064$code.=<<___;
da4d239d 2065 $ST $c_2,7*$BNSZ($a0)
a7a44ba5
AP
2066___
2067 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2068 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2069 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2070 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2071 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2072 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2073$code.=<<___;
da4d239d
AP
2074 $ADDU $c_3,$t_1
2075 sltu $at,$c_3,$t_1
947716c1 2076 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
da4d239d
AP
2077 $ADDU $t_2,$at
2078 $ADDU $c_1,$t_2
2079 sltu $at,$c_1,$t_2
2080 $ADDU $c_2,$at
2081 $ST $c_3,8*$BNSZ($a0)
947716c1
AP
2082 mflo ($t_1,$a_2,$a_7)
2083 mfhi ($t_2,$a_2,$a_7)
a7a44ba5
AP
2084___
2085 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2086 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2087 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2088 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2089 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2090 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2091$code.=<<___;
da4d239d 2092 $ST $c_1,9*$BNSZ($a0)
a7a44ba5
AP
2093___
2094 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2095 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2096 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2097 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2098$code.=<<___;
da4d239d
AP
2099 $ADDU $c_2,$t_1
2100 sltu $at,$c_2,$t_1
947716c1 2101 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
da4d239d
AP
2102 $ADDU $t_2,$at
2103 $ADDU $c_3,$t_2
2104 sltu $at,$c_3,$t_2
2105 $ADDU $c_1,$at
2106 $ST $c_2,10*$BNSZ($a0)
947716c1
AP
2107 mflo ($t_1,$a_4,$a_7)
2108 mfhi ($t_2,$a_4,$a_7)
a7a44ba5
AP
2109___
2110 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2111 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2112 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2113 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2114$code.=<<___;
da4d239d 2115 $ST $c_3,11*$BNSZ($a0)
a7a44ba5
AP
2116___
2117 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2118 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2119$code.=<<___;
da4d239d
AP
2120 $ADDU $c_1,$t_1
2121 sltu $at,$c_1,$t_1
947716c1 2122 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
da4d239d
AP
2123 $ADDU $t_2,$at
2124 $ADDU $c_2,$t_2
2125 sltu $at,$c_2,$t_2
2126 $ADDU $c_3,$at
2127 $ST $c_1,12*$BNSZ($a0)
947716c1
AP
2128 mflo ($t_1,$a_6,$a_7)
2129 mfhi ($t_2,$a_6,$a_7)
a7a44ba5
AP
2130___
2131 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2132 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2133$code.=<<___;
da4d239d
AP
2134 $ST $c_2,13*$BNSZ($a0)
2135
da4d239d
AP
2136 $ADDU $c_3,$t_1
2137 sltu $at,$c_3,$t_1
2138 $ADDU $t_2,$at
2139 $ADDU $c_1,$t_2
2140 $ST $c_3,14*$BNSZ($a0)
2141 $ST $c_1,15*$BNSZ($a0)
2142
2143 .set noreorder
2144___
2145$code.=<<___ if ($flavour =~ /nubi/i);
2146 $REG_L $t3,4*$SZREG($sp)
2147 $REG_L $t2,3*$SZREG($sp)
2148 $REG_L $t1,2*$SZREG($sp)
2149 $REG_L $t0,1*$SZREG($sp)
2150 $REG_L $gp,0*$SZREG($sp)
2151 $PTR_ADD $sp,6*$SZREG
2152___
2153$code.=<<___;
2154 jr $ra
2155 nop
2156.end bn_sqr_comba8
2157
2158.align 5
2159.globl bn_sqr_comba4
2160.ent bn_sqr_comba4
2161bn_sqr_comba4:
2162___
2163$code.=<<___ if ($flavour =~ /nubi/i);
2164 .frame $sp,6*$SZREG,$ra
2165 .mask 0x8000f008,-$SZREG
2166 .set noreorder
2167 $PTR_SUB $sp,6*$SZREG
2168 $REG_S $ra,5*$SZREG($sp)
2169 $REG_S $t3,4*$SZREG($sp)
2170 $REG_S $t2,3*$SZREG($sp)
2171 $REG_S $t1,2*$SZREG($sp)
2172 $REG_S $t0,1*$SZREG($sp)
2173 $REG_S $gp,0*$SZREG($sp)
2174___
2175$code.=<<___;
2176 .set reorder
2177 $LD $a_0,0($a1)
2178 $LD $a_1,$BNSZ($a1)
947716c1 2179 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
2180 $LD $a_2,2*$BNSZ($a1)
2181 $LD $a_3,3*$BNSZ($a1)
947716c1
AP
2182 mflo ($c_1,$a_0,$a_0)
2183 mfhi ($c_2,$a_0,$a_0)
da4d239d
AP
2184 $ST $c_1,0($a0)
2185
947716c1
AP
2186 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2187 mflo ($t_1,$a_0,$a_1)
2188 mfhi ($t_2,$a_0,$a_1)
da4d239d
AP
2189 slt $c_1,$t_2,$zero
2190 $SLL $t_2,1
947716c1 2191 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
da4d239d
AP
2192 slt $a2,$t_1,$zero
2193 $ADDU $t_2,$a2
2194 $SLL $t_1,1
2195 $ADDU $c_2,$t_1
2196 sltu $at,$c_2,$t_1
2197 $ADDU $c_3,$t_2,$at
2198 $ST $c_2,$BNSZ($a0)
947716c1
AP
2199 mflo ($t_1,$a_2,$a_0)
2200 mfhi ($t_2,$a_2,$a_0)
a7a44ba5
AP
2201___
2202 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2203 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2204$code.=<<___;
da4d239d
AP
2205 $ADDU $c_3,$t_1
2206 sltu $at,$c_3,$t_1
947716c1 2207 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
da4d239d
AP
2208 $ADDU $t_2,$at
2209 $ADDU $c_1,$t_2
2210 sltu $at,$c_1,$t_2
2211 $ADDU $c_2,$at
2212 $ST $c_3,2*$BNSZ($a0)
947716c1
AP
2213 mflo ($t_1,$a_0,$a_3)
2214 mfhi ($t_2,$a_0,$a_3)
a7a44ba5
AP
2215___
2216 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2217 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2218 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2219 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2220$code.=<<___;
da4d239d 2221 $ST $c_1,3*$BNSZ($a0)
a7a44ba5
AP
2222___
2223 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2224 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2225$code.=<<___;
da4d239d
AP
2226 $ADDU $c_2,$t_1
2227 sltu $at,$c_2,$t_1
947716c1 2228 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
da4d239d
AP
2229 $ADDU $t_2,$at
2230 $ADDU $c_3,$t_2
2231 sltu $at,$c_3,$t_2
2232 $ADDU $c_1,$at
2233 $ST $c_2,4*$BNSZ($a0)
947716c1
AP
2234 mflo ($t_1,$a_2,$a_3)
2235 mfhi ($t_2,$a_2,$a_3)
a7a44ba5
AP
2236___
2237 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2238 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2239$code.=<<___;
da4d239d
AP
2240 $ST $c_3,5*$BNSZ($a0)
2241
da4d239d
AP
2242 $ADDU $c_1,$t_1
2243 sltu $at,$c_1,$t_1
2244 $ADDU $t_2,$at
2245 $ADDU $c_2,$t_2
2246 $ST $c_1,6*$BNSZ($a0)
2247 $ST $c_2,7*$BNSZ($a0)
2248
2249 .set noreorder
2250___
2251$code.=<<___ if ($flavour =~ /nubi/i);
2252 $REG_L $t3,4*$SZREG($sp)
2253 $REG_L $t2,3*$SZREG($sp)
2254 $REG_L $t1,2*$SZREG($sp)
2255 $REG_L $t0,1*$SZREG($sp)
2256 $REG_L $gp,0*$SZREG($sp)
2257 $PTR_ADD $sp,6*$SZREG
2258___
2259$code.=<<___;
2260 jr $ra
2261 nop
2262.end bn_sqr_comba4
2263___
# Emit the accumulated assembly text.  STDOUT is closed explicitly and
# the result checked so that a failure reported by close aborts the
# script with an error instead of passing silently.
2264print $code;
a21314db 2265close STDOUT or die "error closing STDOUT: $!";