#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x, not only because 5.x doesn't support the new ABIs, but also
# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all the
# 64-bit instructions (daddu, dmultu, etc.) found below would only
# cause an illegal instruction exception:-(
#
# In addition, the code depends on preprocessor flags set up by the
# MIPSpro compiler driver (either as or cc) and therefore (probably?)
# can't be compiled by the GNU assembler. The GNU C driver manages fine,
# though, as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as, which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to keep the code looking more regular, as all the
# architecture-dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
# <appro@openssl.org>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and by
# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction a 32-bit compiler can't use]; one
# has to be content with a 40-85% improvement, depending on benchmark
# and key length, more for longer keys.

$flavour = shift || "o32";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
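
# The ABI flavour is taken from the first argument; the output file is the
# first remaining argument that looks like a file name. A hypothetical
# invocation (not taken from the build system) would therefore be:
#
#	perl mips.pl n32 bn-mips.s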
if ($flavour =~ /64|n32/i) {
 $LD="ld";
 $ST="sd";
 $MULTU="dmultu";
 $DIVU="ddivu";
 $ADDU="daddu";
 $SUBU="dsubu";
 $SRL="dsrl";
 $SLL="dsll";
 $BNSZ=8;
 $PTR_ADD="daddu";
 $PTR_SUB="dsubu";
 $SZREG=8;
 $REG_S="sd";
 $REG_L="ld";
} else {
 $LD="lw";
 $ST="sw";
 $MULTU="multu";
 $DIVU="divu";
 $ADDU="addu";
 $SUBU="subu";
 $SRL="srl";
 $SLL="sll";
 $BNSZ=4;
 $PTR_ADD="addu";
 $PTR_SUB="subu";
 $SZREG=4;
 $REG_S="sw";
 $REG_L="lw";
 $code=".set mips2\n";
}

# Below is the N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI, on the other hand,
# is handled by saving/restoring ($v1,$t0..$t3).

$gp=$v1 if ($flavour =~ /nubi/i);

$minus4=$v1;

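# The generated assembly is accumulated in $code (which, for 32-bit
# flavours, already starts with the ".set mips2" override above) and is
# printed to the output file at the very end of this script.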
$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt) ddivu rd,rs,rt
# define mfrm(rd,rs,rt) dmodu rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt) divu rd,rs,rt
# define mfrm(rd,rs,rt) modu rd,rs,rt
#else
# define $DIVU(rs,rt) $DIVU $zero,rs,rt
# define mfqt(rd,rs,rt) mflo rd
# define mfrm(rd,rs,rt) mfhi rd
#endif

.rdata
.asciiz "mips3.s, Version 1.2"
.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set noat

.align 5
.globl bn_mul_add_words
.ent bn_mul_add_words
bn_mul_add_words:
 .set noreorder
 bgtz $a2,bn_mul_add_words_internal
 move $v0,$zero
 jr $ra
 move $a0,$v0
.end bn_mul_add_words

.align 5
.ent bn_mul_add_words_internal
bn_mul_add_words_internal:
___
150$code.=<<___ if ($flavour =~ /nubi/i);
151 .frame $sp,6*$SZREG,$ra
152 .mask 0x8000f008,-$SZREG
153 .set noreorder
154 $PTR_SUB $sp,6*$SZREG
155 $REG_S $ra,5*$SZREG($sp)
156 $REG_S $t3,4*$SZREG($sp)
157 $REG_S $t2,3*$SZREG($sp)
158 $REG_S $t1,2*$SZREG($sp)
159 $REG_S $t0,1*$SZREG($sp)
160 $REG_S $gp,0*$SZREG($sp)
161___
162$code.=<<___;
163 .set reorder
164 li $minus4,-4
165 and $ta0,$a2,$minus4
da4d239d
AP
166 beqz $ta0,.L_bn_mul_add_words_tail
167
168.L_bn_mul_add_words_loop:
0c2adb0a 169 $LD $t0,0($a1)
947716c1 170 $MULTU ($t0,$a3)
da4d239d
AP
171 $LD $t1,0($a0)
172 $LD $t2,$BNSZ($a1)
173 $LD $t3,$BNSZ($a0)
174 $LD $ta0,2*$BNSZ($a1)
175 $LD $ta1,2*$BNSZ($a0)
176 $ADDU $t1,$v0
177 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
178 # values", but it seems to work fine
179 # even on 64-bit registers.
947716c1
AP
180 mflo ($at,$t0,$a3)
181 mfhi ($t0,$t0,$a3)
da4d239d
AP
182 $ADDU $t1,$at
183 $ADDU $v0,$t0
947716c1 184 $MULTU ($t2,$a3)
da4d239d
AP
185 sltu $at,$t1,$at
186 $ST $t1,0($a0)
187 $ADDU $v0,$at
188
189 $LD $ta2,3*$BNSZ($a1)
190 $LD $ta3,3*$BNSZ($a0)
191 $ADDU $t3,$v0
192 sltu $v0,$t3,$v0
947716c1
AP
193 mflo ($at,$t2,$a3)
194 mfhi ($t2,$t2,$a3)
da4d239d
AP
195 $ADDU $t3,$at
196 $ADDU $v0,$t2
947716c1 197 $MULTU ($ta0,$a3)
da4d239d
AP
198 sltu $at,$t3,$at
199 $ST $t3,$BNSZ($a0)
200 $ADDU $v0,$at
201
202 subu $a2,4
203 $PTR_ADD $a0,4*$BNSZ
204 $PTR_ADD $a1,4*$BNSZ
205 $ADDU $ta1,$v0
206 sltu $v0,$ta1,$v0
947716c1
AP
207 mflo ($at,$ta0,$a3)
208 mfhi ($ta0,$ta0,$a3)
da4d239d
AP
209 $ADDU $ta1,$at
210 $ADDU $v0,$ta0
947716c1 211 $MULTU ($ta2,$a3)
da4d239d
AP
212 sltu $at,$ta1,$at
213 $ST $ta1,-2*$BNSZ($a0)
214 $ADDU $v0,$at
215
216
217 and $ta0,$a2,$minus4
218 $ADDU $ta3,$v0
219 sltu $v0,$ta3,$v0
947716c1
AP
220 mflo ($at,$ta2,$a3)
221 mfhi ($ta2,$ta2,$a3)
da4d239d
AP
222 $ADDU $ta3,$at
223 $ADDU $v0,$ta2
224 sltu $at,$ta3,$at
225 $ST $ta3,-$BNSZ($a0)
da4d239d 226 .set noreorder
0c2adb0a
AP
227 bgtz $ta0,.L_bn_mul_add_words_loop
228 $ADDU $v0,$at
da4d239d
AP
229
230 beqz $a2,.L_bn_mul_add_words_return
231 nop
232
233.L_bn_mul_add_words_tail:
234 .set reorder
235 $LD $t0,0($a1)
947716c1 236 $MULTU ($t0,$a3)
da4d239d
AP
237 $LD $t1,0($a0)
238 subu $a2,1
239 $ADDU $t1,$v0
240 sltu $v0,$t1,$v0
947716c1
AP
241 mflo ($at,$t0,$a3)
242 mfhi ($t0,$t0,$a3)
da4d239d
AP
243 $ADDU $t1,$at
244 $ADDU $v0,$t0
245 sltu $at,$t1,$at
246 $ST $t1,0($a0)
247 $ADDU $v0,$at
248 beqz $a2,.L_bn_mul_add_words_return
249
250 $LD $t0,$BNSZ($a1)
947716c1 251 $MULTU ($t0,$a3)
da4d239d
AP
252 $LD $t1,$BNSZ($a0)
253 subu $a2,1
254 $ADDU $t1,$v0
255 sltu $v0,$t1,$v0
947716c1
AP
256 mflo ($at,$t0,$a3)
257 mfhi ($t0,$t0,$a3)
da4d239d
AP
258 $ADDU $t1,$at
259 $ADDU $v0,$t0
260 sltu $at,$t1,$at
261 $ST $t1,$BNSZ($a0)
262 $ADDU $v0,$at
263 beqz $a2,.L_bn_mul_add_words_return
264
265 $LD $t0,2*$BNSZ($a1)
947716c1 266 $MULTU ($t0,$a3)
da4d239d
AP
267 $LD $t1,2*$BNSZ($a0)
268 $ADDU $t1,$v0
269 sltu $v0,$t1,$v0
947716c1
AP
270 mflo ($at,$t0,$a3)
271 mfhi ($t0,$t0,$a3)
da4d239d
AP
272 $ADDU $t1,$at
273 $ADDU $v0,$t0
274 sltu $at,$t1,$at
275 $ST $t1,2*$BNSZ($a0)
276 $ADDU $v0,$at
277
278.L_bn_mul_add_words_return:
279 .set noreorder
280___
281$code.=<<___ if ($flavour =~ /nubi/i);
282 $REG_L $t3,4*$SZREG($sp)
283 $REG_L $t2,3*$SZREG($sp)
284 $REG_L $t1,2*$SZREG($sp)
285 $REG_L $t0,1*$SZREG($sp)
286 $REG_L $gp,0*$SZREG($sp)
287 $PTR_ADD $sp,6*$SZREG
288___
289$code.=<<___;
290 jr $ra
291 move $a0,$v0
66001268 292.end bn_mul_add_words_internal
da4d239d
AP
293
294.align 5
295.globl bn_mul_words
296.ent bn_mul_words
297bn_mul_words:
298 .set noreorder
299 bgtz $a2,bn_mul_words_internal
300 move $v0,$zero
301 jr $ra
302 move $a0,$v0
303.end bn_mul_words
304
305.align 5
306.ent bn_mul_words_internal
307bn_mul_words_internal:
308___
309$code.=<<___ if ($flavour =~ /nubi/i);
310 .frame $sp,6*$SZREG,$ra
311 .mask 0x8000f008,-$SZREG
312 .set noreorder
313 $PTR_SUB $sp,6*$SZREG
314 $REG_S $ra,5*$SZREG($sp)
315 $REG_S $t3,4*$SZREG($sp)
316 $REG_S $t2,3*$SZREG($sp)
317 $REG_S $t1,2*$SZREG($sp)
318 $REG_S $t0,1*$SZREG($sp)
319 $REG_S $gp,0*$SZREG($sp)
320___
321$code.=<<___;
322 .set reorder
323 li $minus4,-4
324 and $ta0,$a2,$minus4
da4d239d
AP
325 beqz $ta0,.L_bn_mul_words_tail
326
327.L_bn_mul_words_loop:
0c2adb0a 328 $LD $t0,0($a1)
947716c1 329 $MULTU ($t0,$a3)
da4d239d
AP
330 $LD $t2,$BNSZ($a1)
331 $LD $ta0,2*$BNSZ($a1)
332 $LD $ta2,3*$BNSZ($a1)
947716c1
AP
333 mflo ($at,$t0,$a3)
334 mfhi ($t0,$t0,$a3)
da4d239d
AP
335 $ADDU $v0,$at
336 sltu $t1,$v0,$at
947716c1 337 $MULTU ($t2,$a3)
da4d239d
AP
338 $ST $v0,0($a0)
339 $ADDU $v0,$t1,$t0
340
341 subu $a2,4
342 $PTR_ADD $a0,4*$BNSZ
343 $PTR_ADD $a1,4*$BNSZ
947716c1
AP
344 mflo ($at,$t2,$a3)
345 mfhi ($t2,$t2,$a3)
da4d239d
AP
346 $ADDU $v0,$at
347 sltu $t3,$v0,$at
947716c1 348 $MULTU ($ta0,$a3)
da4d239d
AP
349 $ST $v0,-3*$BNSZ($a0)
350 $ADDU $v0,$t3,$t2
351
947716c1
AP
352 mflo ($at,$ta0,$a3)
353 mfhi ($ta0,$ta0,$a3)
da4d239d
AP
354 $ADDU $v0,$at
355 sltu $ta1,$v0,$at
947716c1 356 $MULTU ($ta2,$a3)
da4d239d
AP
357 $ST $v0,-2*$BNSZ($a0)
358 $ADDU $v0,$ta1,$ta0
359
360 and $ta0,$a2,$minus4
947716c1
AP
361 mflo ($at,$ta2,$a3)
362 mfhi ($ta2,$ta2,$a3)
da4d239d
AP
363 $ADDU $v0,$at
364 sltu $ta3,$v0,$at
365 $ST $v0,-$BNSZ($a0)
da4d239d 366 .set noreorder
0c2adb0a
AP
367 bgtz $ta0,.L_bn_mul_words_loop
368 $ADDU $v0,$ta3,$ta2
da4d239d
AP
369
370 beqz $a2,.L_bn_mul_words_return
371 nop
372
373.L_bn_mul_words_tail:
374 .set reorder
375 $LD $t0,0($a1)
947716c1 376 $MULTU ($t0,$a3)
da4d239d 377 subu $a2,1
947716c1
AP
378 mflo ($at,$t0,$a3)
379 mfhi ($t0,$t0,$a3)
da4d239d
AP
380 $ADDU $v0,$at
381 sltu $t1,$v0,$at
382 $ST $v0,0($a0)
383 $ADDU $v0,$t1,$t0
384 beqz $a2,.L_bn_mul_words_return
385
386 $LD $t0,$BNSZ($a1)
947716c1 387 $MULTU ($t0,$a3)
da4d239d 388 subu $a2,1
947716c1
AP
389 mflo ($at,$t0,$a3)
390 mfhi ($t0,$t0,$a3)
da4d239d
AP
391 $ADDU $v0,$at
392 sltu $t1,$v0,$at
393 $ST $v0,$BNSZ($a0)
394 $ADDU $v0,$t1,$t0
395 beqz $a2,.L_bn_mul_words_return
396
397 $LD $t0,2*$BNSZ($a1)
947716c1
AP
398 $MULTU ($t0,$a3)
399 mflo ($at,$t0,$a3)
400 mfhi ($t0,$t0,$a3)
da4d239d
AP
401 $ADDU $v0,$at
402 sltu $t1,$v0,$at
403 $ST $v0,2*$BNSZ($a0)
404 $ADDU $v0,$t1,$t0
405
406.L_bn_mul_words_return:
407 .set noreorder
408___
409$code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $t3,4*$SZREG($sp)
411 $REG_L $t2,3*$SZREG($sp)
412 $REG_L $t1,2*$SZREG($sp)
413 $REG_L $t0,1*$SZREG($sp)
414 $REG_L $gp,0*$SZREG($sp)
415 $PTR_ADD $sp,6*$SZREG
416___
417$code.=<<___;
418 jr $ra
419 move $a0,$v0
420.end bn_mul_words_internal
421
422.align 5
423.globl bn_sqr_words
424.ent bn_sqr_words
425bn_sqr_words:
426 .set noreorder
427 bgtz $a2,bn_sqr_words_internal
428 move $v0,$zero
429 jr $ra
430 move $a0,$v0
431.end bn_sqr_words
432
433.align 5
434.ent bn_sqr_words_internal
435bn_sqr_words_internal:
436___
437$code.=<<___ if ($flavour =~ /nubi/i);
438 .frame $sp,6*$SZREG,$ra
439 .mask 0x8000f008,-$SZREG
440 .set noreorder
441 $PTR_SUB $sp,6*$SZREG
442 $REG_S $ra,5*$SZREG($sp)
443 $REG_S $t3,4*$SZREG($sp)
444 $REG_S $t2,3*$SZREG($sp)
445 $REG_S $t1,2*$SZREG($sp)
446 $REG_S $t0,1*$SZREG($sp)
447 $REG_S $gp,0*$SZREG($sp)
448___
449$code.=<<___;
450 .set reorder
451 li $minus4,-4
452 and $ta0,$a2,$minus4
da4d239d
AP
453 beqz $ta0,.L_bn_sqr_words_tail
454
455.L_bn_sqr_words_loop:
0c2adb0a 456 $LD $t0,0($a1)
947716c1 457 $MULTU ($t0,$t0)
da4d239d
AP
458 $LD $t2,$BNSZ($a1)
459 $LD $ta0,2*$BNSZ($a1)
460 $LD $ta2,3*$BNSZ($a1)
947716c1
AP
461 mflo ($t1,$t0,$t0)
462 mfhi ($t0,$t0,$t0)
da4d239d
AP
463 $ST $t1,0($a0)
464 $ST $t0,$BNSZ($a0)
465
947716c1 466 $MULTU ($t2,$t2)
da4d239d
AP
467 subu $a2,4
468 $PTR_ADD $a0,8*$BNSZ
469 $PTR_ADD $a1,4*$BNSZ
947716c1
AP
470 mflo ($t3,$t2,$t2)
471 mfhi ($t2,$t2,$t2)
da4d239d
AP
472 $ST $t3,-6*$BNSZ($a0)
473 $ST $t2,-5*$BNSZ($a0)
474
947716c1
AP
475 $MULTU ($ta0,$ta0)
476 mflo ($ta1,$ta0,$ta0)
477 mfhi ($ta0,$ta0,$ta0)
da4d239d
AP
478 $ST $ta1,-4*$BNSZ($a0)
479 $ST $ta0,-3*$BNSZ($a0)
480
481
947716c1 482 $MULTU ($ta2,$ta2)
da4d239d 483 and $ta0,$a2,$minus4
947716c1
AP
484 mflo ($ta3,$ta2,$ta2)
485 mfhi ($ta2,$ta2,$ta2)
da4d239d 486 $ST $ta3,-2*$BNSZ($a0)
da4d239d
AP
487
488 .set noreorder
0c2adb0a
AP
489 bgtz $ta0,.L_bn_sqr_words_loop
490 $ST $ta2,-$BNSZ($a0)
da4d239d
AP
491
492 beqz $a2,.L_bn_sqr_words_return
493 nop
494
495.L_bn_sqr_words_tail:
496 .set reorder
497 $LD $t0,0($a1)
947716c1 498 $MULTU ($t0,$t0)
da4d239d 499 subu $a2,1
947716c1
AP
500 mflo ($t1,$t0,$t0)
501 mfhi ($t0,$t0,$t0)
da4d239d
AP
502 $ST $t1,0($a0)
503 $ST $t0,$BNSZ($a0)
504 beqz $a2,.L_bn_sqr_words_return
505
506 $LD $t0,$BNSZ($a1)
947716c1 507 $MULTU ($t0,$t0)
da4d239d 508 subu $a2,1
947716c1
AP
509 mflo ($t1,$t0,$t0)
510 mfhi ($t0,$t0,$t0)
da4d239d
AP
511 $ST $t1,2*$BNSZ($a0)
512 $ST $t0,3*$BNSZ($a0)
513 beqz $a2,.L_bn_sqr_words_return
514
515 $LD $t0,2*$BNSZ($a1)
947716c1
AP
516 $MULTU ($t0,$t0)
517 mflo ($t1,$t0,$t0)
518 mfhi ($t0,$t0,$t0)
da4d239d
AP
519 $ST $t1,4*$BNSZ($a0)
520 $ST $t0,5*$BNSZ($a0)
521
522.L_bn_sqr_words_return:
523 .set noreorder
524___
525$code.=<<___ if ($flavour =~ /nubi/i);
526 $REG_L $t3,4*$SZREG($sp)
527 $REG_L $t2,3*$SZREG($sp)
528 $REG_L $t1,2*$SZREG($sp)
529 $REG_L $t0,1*$SZREG($sp)
530 $REG_L $gp,0*$SZREG($sp)
531 $PTR_ADD $sp,6*$SZREG
532___
533$code.=<<___;
534 jr $ra
535 move $a0,$v0
536
537.end bn_sqr_words_internal
538
539.align 5
540.globl bn_add_words
541.ent bn_add_words
542bn_add_words:
543 .set noreorder
544 bgtz $a3,bn_add_words_internal
545 move $v0,$zero
546 jr $ra
547 move $a0,$v0
548.end bn_add_words
549
550.align 5
551.ent bn_add_words_internal
552bn_add_words_internal:
553___
554$code.=<<___ if ($flavour =~ /nubi/i);
555 .frame $sp,6*$SZREG,$ra
556 .mask 0x8000f008,-$SZREG
557 .set noreorder
558 $PTR_SUB $sp,6*$SZREG
559 $REG_S $ra,5*$SZREG($sp)
560 $REG_S $t3,4*$SZREG($sp)
561 $REG_S $t2,3*$SZREG($sp)
562 $REG_S $t1,2*$SZREG($sp)
563 $REG_S $t0,1*$SZREG($sp)
564 $REG_S $gp,0*$SZREG($sp)
565___
566$code.=<<___;
567 .set reorder
568 li $minus4,-4
569 and $at,$a3,$minus4
da4d239d
AP
570 beqz $at,.L_bn_add_words_tail
571
572.L_bn_add_words_loop:
0c2adb0a 573 $LD $t0,0($a1)
da4d239d
AP
574 $LD $ta0,0($a2)
575 subu $a3,4
576 $LD $t1,$BNSZ($a1)
577 and $at,$a3,$minus4
578 $LD $t2,2*$BNSZ($a1)
579 $PTR_ADD $a2,4*$BNSZ
580 $LD $t3,3*$BNSZ($a1)
581 $PTR_ADD $a0,4*$BNSZ
582 $LD $ta1,-3*$BNSZ($a2)
583 $PTR_ADD $a1,4*$BNSZ
584 $LD $ta2,-2*$BNSZ($a2)
585 $LD $ta3,-$BNSZ($a2)
586 $ADDU $ta0,$t0
587 sltu $t8,$ta0,$t0
588 $ADDU $t0,$ta0,$v0
589 sltu $v0,$t0,$ta0
590 $ST $t0,-4*$BNSZ($a0)
591 $ADDU $v0,$t8
592
593 $ADDU $ta1,$t1
594 sltu $t9,$ta1,$t1
595 $ADDU $t1,$ta1,$v0
596 sltu $v0,$t1,$ta1
597 $ST $t1,-3*$BNSZ($a0)
598 $ADDU $v0,$t9
599
600 $ADDU $ta2,$t2
601 sltu $t8,$ta2,$t2
602 $ADDU $t2,$ta2,$v0
603 sltu $v0,$t2,$ta2
604 $ST $t2,-2*$BNSZ($a0)
605 $ADDU $v0,$t8
609b0852 606
da4d239d
AP
607 $ADDU $ta3,$t3
608 sltu $t9,$ta3,$t3
609 $ADDU $t3,$ta3,$v0
610 sltu $v0,$t3,$ta3
611 $ST $t3,-$BNSZ($a0)
609b0852 612
da4d239d 613 .set noreorder
0c2adb0a
AP
614 bgtz $at,.L_bn_add_words_loop
615 $ADDU $v0,$t9
da4d239d
AP
616
617 beqz $a3,.L_bn_add_words_return
618 nop
619
620.L_bn_add_words_tail:
621 .set reorder
622 $LD $t0,0($a1)
623 $LD $ta0,0($a2)
624 $ADDU $ta0,$t0
625 subu $a3,1
626 sltu $t8,$ta0,$t0
627 $ADDU $t0,$ta0,$v0
628 sltu $v0,$t0,$ta0
629 $ST $t0,0($a0)
630 $ADDU $v0,$t8
631 beqz $a3,.L_bn_add_words_return
632
633 $LD $t1,$BNSZ($a1)
634 $LD $ta1,$BNSZ($a2)
635 $ADDU $ta1,$t1
636 subu $a3,1
637 sltu $t9,$ta1,$t1
638 $ADDU $t1,$ta1,$v0
639 sltu $v0,$t1,$ta1
640 $ST $t1,$BNSZ($a0)
641 $ADDU $v0,$t9
642 beqz $a3,.L_bn_add_words_return
643
644 $LD $t2,2*$BNSZ($a1)
645 $LD $ta2,2*$BNSZ($a2)
646 $ADDU $ta2,$t2
647 sltu $t8,$ta2,$t2
648 $ADDU $t2,$ta2,$v0
649 sltu $v0,$t2,$ta2
650 $ST $t2,2*$BNSZ($a0)
651 $ADDU $v0,$t8
652
653.L_bn_add_words_return:
654 .set noreorder
655___
656$code.=<<___ if ($flavour =~ /nubi/i);
657 $REG_L $t3,4*$SZREG($sp)
658 $REG_L $t2,3*$SZREG($sp)
659 $REG_L $t1,2*$SZREG($sp)
660 $REG_L $t0,1*$SZREG($sp)
661 $REG_L $gp,0*$SZREG($sp)
662 $PTR_ADD $sp,6*$SZREG
663___
664$code.=<<___;
665 jr $ra
666 move $a0,$v0
667
668.end bn_add_words_internal
669
670.align 5
671.globl bn_sub_words
672.ent bn_sub_words
673bn_sub_words:
674 .set noreorder
675 bgtz $a3,bn_sub_words_internal
676 move $v0,$zero
677 jr $ra
678 move $a0,$zero
679.end bn_sub_words
680
681.align 5
682.ent bn_sub_words_internal
683bn_sub_words_internal:
684___
685$code.=<<___ if ($flavour =~ /nubi/i);
686 .frame $sp,6*$SZREG,$ra
687 .mask 0x8000f008,-$SZREG
688 .set noreorder
689 $PTR_SUB $sp,6*$SZREG
690 $REG_S $ra,5*$SZREG($sp)
691 $REG_S $t3,4*$SZREG($sp)
692 $REG_S $t2,3*$SZREG($sp)
693 $REG_S $t1,2*$SZREG($sp)
694 $REG_S $t0,1*$SZREG($sp)
695 $REG_S $gp,0*$SZREG($sp)
696___
697$code.=<<___;
698 .set reorder
699 li $minus4,-4
700 and $at,$a3,$minus4
da4d239d
AP
701 beqz $at,.L_bn_sub_words_tail
702
703.L_bn_sub_words_loop:
0c2adb0a 704 $LD $t0,0($a1)
da4d239d
AP
705 $LD $ta0,0($a2)
706 subu $a3,4
707 $LD $t1,$BNSZ($a1)
708 and $at,$a3,$minus4
709 $LD $t2,2*$BNSZ($a1)
710 $PTR_ADD $a2,4*$BNSZ
711 $LD $t3,3*$BNSZ($a1)
712 $PTR_ADD $a0,4*$BNSZ
713 $LD $ta1,-3*$BNSZ($a2)
714 $PTR_ADD $a1,4*$BNSZ
715 $LD $ta2,-2*$BNSZ($a2)
716 $LD $ta3,-$BNSZ($a2)
717 sltu $t8,$t0,$ta0
718 $SUBU $ta0,$t0,$ta0
719 $SUBU $t0,$ta0,$v0
720 sgtu $v0,$t0,$ta0
721 $ST $t0,-4*$BNSZ($a0)
722 $ADDU $v0,$t8
723
724 sltu $t9,$t1,$ta1
725 $SUBU $ta1,$t1,$ta1
726 $SUBU $t1,$ta1,$v0
727 sgtu $v0,$t1,$ta1
728 $ST $t1,-3*$BNSZ($a0)
729 $ADDU $v0,$t9
730
731
732 sltu $t8,$t2,$ta2
733 $SUBU $ta2,$t2,$ta2
734 $SUBU $t2,$ta2,$v0
735 sgtu $v0,$t2,$ta2
736 $ST $t2,-2*$BNSZ($a0)
737 $ADDU $v0,$t8
738
739 sltu $t9,$t3,$ta3
740 $SUBU $ta3,$t3,$ta3
741 $SUBU $t3,$ta3,$v0
742 sgtu $v0,$t3,$ta3
743 $ST $t3,-$BNSZ($a0)
da4d239d
AP
744
745 .set noreorder
0c2adb0a
AP
746 bgtz $at,.L_bn_sub_words_loop
747 $ADDU $v0,$t9
da4d239d
AP
748
749 beqz $a3,.L_bn_sub_words_return
750 nop
751
752.L_bn_sub_words_tail:
753 .set reorder
754 $LD $t0,0($a1)
755 $LD $ta0,0($a2)
756 subu $a3,1
757 sltu $t8,$t0,$ta0
758 $SUBU $ta0,$t0,$ta0
759 $SUBU $t0,$ta0,$v0
760 sgtu $v0,$t0,$ta0
761 $ST $t0,0($a0)
762 $ADDU $v0,$t8
763 beqz $a3,.L_bn_sub_words_return
764
765 $LD $t1,$BNSZ($a1)
766 subu $a3,1
767 $LD $ta1,$BNSZ($a2)
768 sltu $t9,$t1,$ta1
769 $SUBU $ta1,$t1,$ta1
770 $SUBU $t1,$ta1,$v0
771 sgtu $v0,$t1,$ta1
772 $ST $t1,$BNSZ($a0)
773 $ADDU $v0,$t9
774 beqz $a3,.L_bn_sub_words_return
775
776 $LD $t2,2*$BNSZ($a1)
777 $LD $ta2,2*$BNSZ($a2)
778 sltu $t8,$t2,$ta2
779 $SUBU $ta2,$t2,$ta2
780 $SUBU $t2,$ta2,$v0
781 sgtu $v0,$t2,$ta2
782 $ST $t2,2*$BNSZ($a0)
783 $ADDU $v0,$t8
784
785.L_bn_sub_words_return:
786 .set noreorder
787___
788$code.=<<___ if ($flavour =~ /nubi/i);
789 $REG_L $t3,4*$SZREG($sp)
790 $REG_L $t2,3*$SZREG($sp)
791 $REG_L $t1,2*$SZREG($sp)
792 $REG_L $t0,1*$SZREG($sp)
793 $REG_L $gp,0*$SZREG($sp)
794 $PTR_ADD $sp,6*$SZREG
795___
796$code.=<<___;
797 jr $ra
798 move $a0,$v0
66001268 799.end bn_sub_words_internal
da4d239d
AP
800
801.align 5
802.globl bn_div_3_words
803.ent bn_div_3_words
804bn_div_3_words:
805 .set noreorder
806 move $a3,$a0 # we know that bn_div_words does not
807 # touch $a3, $ta2, $ta3 and preserves $a2
808 # so that we can save two arguments
809 # and return address in registers
810 # instead of stack:-)
609b0852 811
da4d239d
AP
812 $LD $a0,($a3)
813 move $ta2,$a1
814 bne $a0,$a2,bn_div_3_words_internal
815 $LD $a1,-$BNSZ($a3)
816 li $v0,-1
817 jr $ra
818 move $a0,$v0
819.end bn_div_3_words
820
821.align 5
822.ent bn_div_3_words_internal
823bn_div_3_words_internal:
824___
825$code.=<<___ if ($flavour =~ /nubi/i);
826 .frame $sp,6*$SZREG,$ra
827 .mask 0x8000f008,-$SZREG
828 .set noreorder
829 $PTR_SUB $sp,6*$SZREG
830 $REG_S $ra,5*$SZREG($sp)
831 $REG_S $t3,4*$SZREG($sp)
832 $REG_S $t2,3*$SZREG($sp)
833 $REG_S $t1,2*$SZREG($sp)
834 $REG_S $t0,1*$SZREG($sp)
835 $REG_S $gp,0*$SZREG($sp)
836___
837$code.=<<___;
838 .set reorder
839 move $ta3,$ra
543fd854 840 bal bn_div_words_internal
da4d239d 841 move $ra,$ta3
947716c1 842 $MULTU ($ta2,$v0)
da4d239d
AP
843 $LD $t2,-2*$BNSZ($a3)
844 move $ta0,$zero
947716c1
AP
845 mfhi ($t1,$ta2,$v0)
846 mflo ($t0,$ta2,$v0)
da4d239d
AP
847 sltu $t8,$t1,$a1
848.L_bn_div_3_words_inner_loop:
849 bnez $t8,.L_bn_div_3_words_inner_loop_done
850 sgeu $at,$t2,$t0
851 seq $t9,$t1,$a1
852 and $at,$t9
853 sltu $t3,$t0,$ta2
854 $ADDU $a1,$a2
855 $SUBU $t1,$t3
856 $SUBU $t0,$ta2
857 sltu $t8,$t1,$a1
858 sltu $ta0,$a1,$a2
859 or $t8,$ta0
860 .set noreorder
0c2adb0a 861 beqz $at,.L_bn_div_3_words_inner_loop
da4d239d 862 $SUBU $v0,1
0c2adb0a 863 $ADDU $v0,1
da4d239d
AP
864 .set reorder
865.L_bn_div_3_words_inner_loop_done:
866 .set noreorder
867___
868$code.=<<___ if ($flavour =~ /nubi/i);
869 $REG_L $t3,4*$SZREG($sp)
870 $REG_L $t2,3*$SZREG($sp)
871 $REG_L $t1,2*$SZREG($sp)
872 $REG_L $t0,1*$SZREG($sp)
873 $REG_L $gp,0*$SZREG($sp)
874 $PTR_ADD $sp,6*$SZREG
875___
876$code.=<<___;
877 jr $ra
878 move $a0,$v0
879.end bn_div_3_words_internal
880
881.align 5
882.globl bn_div_words
883.ent bn_div_words
884bn_div_words:
885 .set noreorder
886 bnez $a2,bn_div_words_internal
887 li $v0,-1 # I would rather signal div-by-zero
888 # which can be done with 'break 7'
889 jr $ra
890 move $a0,$v0
891.end bn_div_words
892
893.align 5
894.ent bn_div_words_internal
895bn_div_words_internal:
896___
897$code.=<<___ if ($flavour =~ /nubi/i);
898 .frame $sp,6*$SZREG,$ra
899 .mask 0x8000f008,-$SZREG
900 .set noreorder
901 $PTR_SUB $sp,6*$SZREG
902 $REG_S $ra,5*$SZREG($sp)
903 $REG_S $t3,4*$SZREG($sp)
904 $REG_S $t2,3*$SZREG($sp)
905 $REG_S $t1,2*$SZREG($sp)
906 $REG_S $t0,1*$SZREG($sp)
907 $REG_S $gp,0*$SZREG($sp)
908___
909$code.=<<___;
910 move $v1,$zero
911 bltz $a2,.L_bn_div_words_body
912 move $t9,$v1
913 $SLL $a2,1
914 bgtz $a2,.-4
915 addu $t9,1
916
917 .set reorder
918 negu $t1,$t9
919 li $t2,-1
920 $SLL $t2,$t1
921 and $t2,$a0
922 $SRL $at,$a1,$t1
923 .set noreorder
0c2adb0a
AP
924 beqz $t2,.+12
925 nop
da4d239d
AP
926 break 6 # signal overflow
927 .set reorder
928 $SLL $a0,$t9
929 $SLL $a1,$t9
930 or $a0,$at
931___
932$QT=$ta0;
933$HH=$ta1;
934$DH=$v1;
935$code.=<<___;
936.L_bn_div_words_body:
937 $SRL $DH,$a2,4*$BNSZ # bits
938 sgeu $at,$a0,$a2
939 .set noreorder
0c2adb0a
AP
940 beqz $at,.+12
941 nop
da4d239d
AP
942 $SUBU $a0,$a2
943 .set reorder
944
945 li $QT,-1
946 $SRL $HH,$a0,4*$BNSZ # bits
947 $SRL $QT,4*$BNSZ # q=0xffffffff
948 beq $DH,$HH,.L_bn_div_words_skip_div1
947716c1
AP
949 $DIVU ($a0,$DH)
950 mfqt ($QT,$a0,$DH)
da4d239d 951.L_bn_div_words_skip_div1:
947716c1 952 $MULTU ($a2,$QT)
da4d239d
AP
953 $SLL $t3,$a0,4*$BNSZ # bits
954 $SRL $at,$a1,4*$BNSZ # bits
955 or $t3,$at
947716c1
AP
956 mflo ($t0,$a2,$QT)
957 mfhi ($t1,$a2,$QT)
da4d239d
AP
958.L_bn_div_words_inner_loop1:
959 sltu $t2,$t3,$t0
960 seq $t8,$HH,$t1
961 sltu $at,$HH,$t1
962 and $t2,$t8
963 sltu $v0,$t0,$a2
964 or $at,$t2
965 .set noreorder
966 beqz $at,.L_bn_div_words_inner_loop1_done
967 $SUBU $t1,$v0
968 $SUBU $t0,$a2
969 b .L_bn_div_words_inner_loop1
970 $SUBU $QT,1
971 .set reorder
972.L_bn_div_words_inner_loop1_done:
973
974 $SLL $a1,4*$BNSZ # bits
975 $SUBU $a0,$t3,$t0
976 $SLL $v0,$QT,4*$BNSZ # bits
977
978 li $QT,-1
979 $SRL $HH,$a0,4*$BNSZ # bits
980 $SRL $QT,4*$BNSZ # q=0xffffffff
981 beq $DH,$HH,.L_bn_div_words_skip_div2
947716c1
AP
982 $DIVU ($a0,$DH)
983 mfqt ($QT,$a0,$DH)
da4d239d 984.L_bn_div_words_skip_div2:
947716c1 985 $MULTU ($a2,$QT)
da4d239d
AP
986 $SLL $t3,$a0,4*$BNSZ # bits
987 $SRL $at,$a1,4*$BNSZ # bits
988 or $t3,$at
947716c1
AP
989 mflo ($t0,$a2,$QT)
990 mfhi ($t1,$a2,$QT)
da4d239d
AP
991.L_bn_div_words_inner_loop2:
992 sltu $t2,$t3,$t0
993 seq $t8,$HH,$t1
994 sltu $at,$HH,$t1
995 and $t2,$t8
996 sltu $v1,$t0,$a2
997 or $at,$t2
998 .set noreorder
999 beqz $at,.L_bn_div_words_inner_loop2_done
1000 $SUBU $t1,$v1
1001 $SUBU $t0,$a2
1002 b .L_bn_div_words_inner_loop2
1003 $SUBU $QT,1
1004 .set reorder
1005.L_bn_div_words_inner_loop2_done:
1006
1007 $SUBU $a0,$t3,$t0
1008 or $v0,$QT
1009 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1010 $SRL $a2,$t9 # restore $a2
1011
1012 .set noreorder
1013 move $a1,$v1
1014___
1015$code.=<<___ if ($flavour =~ /nubi/i);
1016 $REG_L $t3,4*$SZREG($sp)
1017 $REG_L $t2,3*$SZREG($sp)
1018 $REG_L $t1,2*$SZREG($sp)
1019 $REG_L $t0,1*$SZREG($sp)
1020 $REG_L $gp,0*$SZREG($sp)
1021 $PTR_ADD $sp,6*$SZREG
1022___
1023$code.=<<___;
1024 jr $ra
1025 move $a0,$v0
1026.end bn_div_words_internal
1027___
1028undef $HH; undef $QT; undef $DH;
1029
1030($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1031($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1032
1033($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1034($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1035
1036($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1037
1038$code.=<<___;
1039
1040.align 5
1041.globl bn_mul_comba8
1042.ent bn_mul_comba8
1043bn_mul_comba8:
1044 .set noreorder
1045___
1046$code.=<<___ if ($flavour =~ /nubi/i);
1047 .frame $sp,12*$SZREG,$ra
1048 .mask 0x803ff008,-$SZREG
1049 $PTR_SUB $sp,12*$SZREG
1050 $REG_S $ra,11*$SZREG($sp)
1051 $REG_S $s5,10*$SZREG($sp)
1052 $REG_S $s4,9*$SZREG($sp)
1053 $REG_S $s3,8*$SZREG($sp)
1054 $REG_S $s2,7*$SZREG($sp)
1055 $REG_S $s1,6*$SZREG($sp)
1056 $REG_S $s0,5*$SZREG($sp)
1057 $REG_S $t3,4*$SZREG($sp)
1058 $REG_S $t2,3*$SZREG($sp)
1059 $REG_S $t1,2*$SZREG($sp)
1060 $REG_S $t0,1*$SZREG($sp)
1061 $REG_S $gp,0*$SZREG($sp)
1062___
1063$code.=<<___ if ($flavour !~ /nubi/i);
1064 .frame $sp,6*$SZREG,$ra
1065 .mask 0x003f0000,-$SZREG
1066 $PTR_SUB $sp,6*$SZREG
1067 $REG_S $s5,5*$SZREG($sp)
1068 $REG_S $s4,4*$SZREG($sp)
1069 $REG_S $s3,3*$SZREG($sp)
1070 $REG_S $s2,2*$SZREG($sp)
1071 $REG_S $s1,1*$SZREG($sp)
1072 $REG_S $s0,0*$SZREG($sp)
1073___
1074$code.=<<___;
1075
1076 .set reorder
1077 $LD $a_0,0($a1) # If compiled with -mips3 option on
1078 # R5000 box assembler barks on this
1079 # 1ine with "should not have mult/div
1080 # as last instruction in bb (R10K
1081 # bug)" warning. If anybody out there
1082 # has a clue about how to circumvent
1083 # this do send me a note.
1084 # <appro\@fy.chalmers.se>
1085
1086 $LD $b_0,0($a2)
1087 $LD $a_1,$BNSZ($a1)
1088 $LD $a_2,2*$BNSZ($a1)
947716c1 1089 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1090 $LD $a_3,3*$BNSZ($a1)
1091 $LD $b_1,$BNSZ($a2)
1092 $LD $b_2,2*$BNSZ($a2)
1093 $LD $b_3,3*$BNSZ($a2)
947716c1
AP
1094 mflo ($c_1,$a_0,$b_0)
1095 mfhi ($c_2,$a_0,$b_0)
da4d239d
AP
1096
1097 $LD $a_4,4*$BNSZ($a1)
1098 $LD $a_5,5*$BNSZ($a1)
947716c1 1099 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
da4d239d
AP
1100 $LD $a_6,6*$BNSZ($a1)
1101 $LD $a_7,7*$BNSZ($a1)
1102 $LD $b_4,4*$BNSZ($a2)
1103 $LD $b_5,5*$BNSZ($a2)
947716c1
AP
1104 mflo ($t_1,$a_0,$b_1)
1105 mfhi ($t_2,$a_0,$b_1)
da4d239d
AP
1106 $ADDU $c_2,$t_1
1107 sltu $at,$c_2,$t_1
947716c1 1108 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
da4d239d
AP
1109 $ADDU $c_3,$t_2,$at
1110 $LD $b_6,6*$BNSZ($a2)
1111 $LD $b_7,7*$BNSZ($a2)
1112 $ST $c_1,0($a0) # r[0]=c1;
947716c1
AP
1113 mflo ($t_1,$a_1,$b_0)
1114 mfhi ($t_2,$a_1,$b_0)
da4d239d
AP
1115 $ADDU $c_2,$t_1
1116 sltu $at,$c_2,$t_1
947716c1 1117 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
da4d239d
AP
1118 $ADDU $t_2,$at
1119 $ADDU $c_3,$t_2
1120 sltu $c_1,$c_3,$t_2
1121 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1122
947716c1
AP
1123 mflo ($t_1,$a_2,$b_0)
1124 mfhi ($t_2,$a_2,$b_0)
da4d239d
AP
1125 $ADDU $c_3,$t_1
1126 sltu $at,$c_3,$t_1
947716c1 1127 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
da4d239d
AP
1128 $ADDU $t_2,$at
1129 $ADDU $c_1,$t_2
947716c1
AP
1130 mflo ($t_1,$a_1,$b_1)
1131 mfhi ($t_2,$a_1,$b_1)
da4d239d
AP
1132 $ADDU $c_3,$t_1
1133 sltu $at,$c_3,$t_1
947716c1 1134 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
da4d239d
AP
1135 $ADDU $t_2,$at
1136 $ADDU $c_1,$t_2
1137 sltu $c_2,$c_1,$t_2
947716c1
AP
1138 mflo ($t_1,$a_0,$b_2)
1139 mfhi ($t_2,$a_0,$b_2)
da4d239d
AP
1140 $ADDU $c_3,$t_1
1141 sltu $at,$c_3,$t_1
947716c1 1142 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
da4d239d
AP
1143 $ADDU $t_2,$at
1144 $ADDU $c_1,$t_2
1145 sltu $at,$c_1,$t_2
1146 $ADDU $c_2,$at
1147 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1148
947716c1
AP
1149 mflo ($t_1,$a_0,$b_3)
1150 mfhi ($t_2,$a_0,$b_3)
da4d239d
AP
1151 $ADDU $c_1,$t_1
1152 sltu $at,$c_1,$t_1
947716c1 1153 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
da4d239d
AP
1154 $ADDU $t_2,$at
1155 $ADDU $c_2,$t_2
1156 sltu $c_3,$c_2,$t_2
947716c1
AP
1157 mflo ($t_1,$a_1,$b_2)
1158 mfhi ($t_2,$a_1,$b_2)
da4d239d
AP
1159 $ADDU $c_1,$t_1
1160 sltu $at,$c_1,$t_1
947716c1 1161 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
da4d239d
AP
1162 $ADDU $t_2,$at
1163 $ADDU $c_2,$t_2
1164 sltu $at,$c_2,$t_2
1165 $ADDU $c_3,$at
947716c1
AP
1166 mflo ($t_1,$a_2,$b_1)
1167 mfhi ($t_2,$a_2,$b_1)
da4d239d
AP
1168 $ADDU $c_1,$t_1
1169 sltu $at,$c_1,$t_1
947716c1 1170 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
da4d239d
AP
1171 $ADDU $t_2,$at
1172 $ADDU $c_2,$t_2
1173 sltu $at,$c_2,$t_2
1174 $ADDU $c_3,$at
947716c1
AP
1175 mflo ($t_1,$a_3,$b_0)
1176 mfhi ($t_2,$a_3,$b_0)
da4d239d
AP
1177 $ADDU $c_1,$t_1
1178 sltu $at,$c_1,$t_1
947716c1 1179 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
da4d239d
AP
1180 $ADDU $t_2,$at
1181 $ADDU $c_2,$t_2
1182 sltu $at,$c_2,$t_2
1183 $ADDU $c_3,$at
1184 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1185
947716c1
AP
1186 mflo ($t_1,$a_4,$b_0)
1187 mfhi ($t_2,$a_4,$b_0)
da4d239d
AP
1188 $ADDU $c_2,$t_1
1189 sltu $at,$c_2,$t_1
947716c1 1190 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
da4d239d
AP
1191 $ADDU $t_2,$at
1192 $ADDU $c_3,$t_2
1193 sltu $c_1,$c_3,$t_2
947716c1
AP
1194 mflo ($t_1,$a_3,$b_1)
1195 mfhi ($t_2,$a_3,$b_1)
da4d239d
AP
1196 $ADDU $c_2,$t_1
1197 sltu $at,$c_2,$t_1
947716c1 1198 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
da4d239d
AP
1199 $ADDU $t_2,$at
1200 $ADDU $c_3,$t_2
1201 sltu $at,$c_3,$t_2
1202 $ADDU $c_1,$at
947716c1
AP
1203 mflo ($t_1,$a_2,$b_2)
1204 mfhi ($t_2,$a_2,$b_2)
da4d239d
AP
1205 $ADDU $c_2,$t_1
1206 sltu $at,$c_2,$t_1
947716c1 1207 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
da4d239d
AP
1208 $ADDU $t_2,$at
1209 $ADDU $c_3,$t_2
1210 sltu $at,$c_3,$t_2
1211 $ADDU $c_1,$at
947716c1
AP
1212 mflo ($t_1,$a_1,$b_3)
1213 mfhi ($t_2,$a_1,$b_3)
da4d239d
AP
1214 $ADDU $c_2,$t_1
1215 sltu $at,$c_2,$t_1
947716c1 1216 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
da4d239d
AP
1217 $ADDU $t_2,$at
1218 $ADDU $c_3,$t_2
1219 sltu $at,$c_3,$t_2
1220 $ADDU $c_1,$at
947716c1
AP
1221 mflo ($t_1,$a_0,$b_4)
1222 mfhi ($t_2,$a_0,$b_4)
da4d239d
AP
1223 $ADDU $c_2,$t_1
1224 sltu $at,$c_2,$t_1
947716c1 1225 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
da4d239d
AP
1226 $ADDU $t_2,$at
1227 $ADDU $c_3,$t_2
1228 sltu $at,$c_3,$t_2
1229 $ADDU $c_1,$at
1230 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1231
947716c1
AP
1232 mflo ($t_1,$a_0,$b_5)
1233 mfhi ($t_2,$a_0,$b_5)
da4d239d
AP
1234 $ADDU $c_3,$t_1
1235 sltu $at,$c_3,$t_1
947716c1 1236 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
da4d239d
AP
1237 $ADDU $t_2,$at
1238 $ADDU $c_1,$t_2
1239 sltu $c_2,$c_1,$t_2
947716c1
AP
1240 mflo ($t_1,$a_1,$b_4)
1241 mfhi ($t_2,$a_1,$b_4)
da4d239d
AP
1242 $ADDU $c_3,$t_1
1243 sltu $at,$c_3,$t_1
947716c1 1244 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
da4d239d
AP
1245 $ADDU $t_2,$at
1246 $ADDU $c_1,$t_2
1247 sltu $at,$c_1,$t_2
1248 $ADDU $c_2,$at
947716c1
AP
1249 mflo ($t_1,$a_2,$b_3)
1250 mfhi ($t_2,$a_2,$b_3)
da4d239d
AP
1251 $ADDU $c_3,$t_1
1252 sltu $at,$c_3,$t_1
947716c1 1253 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
da4d239d
AP
1254 $ADDU $t_2,$at
1255 $ADDU $c_1,$t_2
1256 sltu $at,$c_1,$t_2
1257 $ADDU $c_2,$at
947716c1
AP
1258 mflo ($t_1,$a_3,$b_2)
1259 mfhi ($t_2,$a_3,$b_2)
da4d239d
AP
1260 $ADDU $c_3,$t_1
1261 sltu $at,$c_3,$t_1
947716c1 1262 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
da4d239d
AP
1263 $ADDU $t_2,$at
1264 $ADDU $c_1,$t_2
1265 sltu $at,$c_1,$t_2
1266 $ADDU $c_2,$at
947716c1
AP
1267 mflo ($t_1,$a_4,$b_1)
1268 mfhi ($t_2,$a_4,$b_1)
da4d239d
AP
1269 $ADDU $c_3,$t_1
1270 sltu $at,$c_3,$t_1
947716c1 1271 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
da4d239d
AP
1272 $ADDU $t_2,$at
1273 $ADDU $c_1,$t_2
1274 sltu $at,$c_1,$t_2
1275 $ADDU $c_2,$at
947716c1
AP
1276 mflo ($t_1,$a_5,$b_0)
1277 mfhi ($t_2,$a_5,$b_0)
da4d239d
AP
1278 $ADDU $c_3,$t_1
1279 sltu $at,$c_3,$t_1
947716c1 1280 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
da4d239d
AP
1281 $ADDU $t_2,$at
1282 $ADDU $c_1,$t_2
1283 sltu $at,$c_1,$t_2
1284 $ADDU $c_2,$at
1285 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1286
947716c1
AP
1287 mflo ($t_1,$a_6,$b_0)
1288 mfhi ($t_2,$a_6,$b_0)
da4d239d
AP
1289 $ADDU $c_1,$t_1
1290 sltu $at,$c_1,$t_1
947716c1 1291 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
da4d239d
AP
1292 $ADDU $t_2,$at
1293 $ADDU $c_2,$t_2
1294 sltu $c_3,$c_2,$t_2
947716c1
AP
1295 mflo ($t_1,$a_5,$b_1)
1296 mfhi ($t_2,$a_5,$b_1)
da4d239d
AP
1297 $ADDU $c_1,$t_1
1298 sltu $at,$c_1,$t_1
947716c1 1299 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
da4d239d
AP
1300 $ADDU $t_2,$at
1301 $ADDU $c_2,$t_2
1302 sltu $at,$c_2,$t_2
1303 $ADDU $c_3,$at
947716c1
AP
1304 mflo ($t_1,$a_4,$b_2)
1305 mfhi ($t_2,$a_4,$b_2)
da4d239d
AP
1306 $ADDU $c_1,$t_1
1307 sltu $at,$c_1,$t_1
947716c1 1308 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
da4d239d
AP
1309 $ADDU $t_2,$at
1310 $ADDU $c_2,$t_2
1311 sltu $at,$c_2,$t_2
1312 $ADDU $c_3,$at
947716c1
AP
1313 mflo ($t_1,$a_3,$b_3)
1314 mfhi ($t_2,$a_3,$b_3)
da4d239d
AP
1315 $ADDU $c_1,$t_1
1316 sltu $at,$c_1,$t_1
947716c1 1317 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
da4d239d
AP
1318 $ADDU $t_2,$at
1319 $ADDU $c_2,$t_2
1320 sltu $at,$c_2,$t_2
1321 $ADDU $c_3,$at
947716c1
AP
1322 mflo ($t_1,$a_2,$b_4)
1323 mfhi ($t_2,$a_2,$b_4)
da4d239d
AP
1324 $ADDU $c_1,$t_1
1325 sltu $at,$c_1,$t_1
947716c1 1326 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
da4d239d
AP
1327 $ADDU $t_2,$at
1328 $ADDU $c_2,$t_2
1329 sltu $at,$c_2,$t_2
1330 $ADDU $c_3,$at
947716c1
AP
1331 mflo ($t_1,$a_1,$b_5)
1332 mfhi ($t_2,$a_1,$b_5)
da4d239d
AP
1333 $ADDU $c_1,$t_1
1334 sltu $at,$c_1,$t_1
947716c1 1335 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
da4d239d
AP
1336 $ADDU $t_2,$at
1337 $ADDU $c_2,$t_2
1338 sltu $at,$c_2,$t_2
1339 $ADDU $c_3,$at
947716c1
AP
1340 mflo ($t_1,$a_0,$b_6)
1341 mfhi ($t_2,$a_0,$b_6)
da4d239d
AP
1342 $ADDU $c_1,$t_1
1343 sltu $at,$c_1,$t_1
947716c1 1344 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
da4d239d
AP
1345 $ADDU $t_2,$at
1346 $ADDU $c_2,$t_2
1347 sltu $at,$c_2,$t_2
1348 $ADDU $c_3,$at
1349 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1350
947716c1
AP
1351 mflo ($t_1,$a_0,$b_7)
1352 mfhi ($t_2,$a_0,$b_7)
da4d239d
AP
1353 $ADDU $c_2,$t_1
1354 sltu $at,$c_2,$t_1
947716c1 1355 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
da4d239d
AP
1356 $ADDU $t_2,$at
1357 $ADDU $c_3,$t_2
1358 sltu $c_1,$c_3,$t_2
947716c1
AP
1359 mflo ($t_1,$a_1,$b_6)
1360 mfhi ($t_2,$a_1,$b_6)
da4d239d
AP
1361 $ADDU $c_2,$t_1
1362 sltu $at,$c_2,$t_1
947716c1 1363 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
da4d239d
AP
1364 $ADDU $t_2,$at
1365 $ADDU $c_3,$t_2
1366 sltu $at,$c_3,$t_2
1367 $ADDU $c_1,$at
947716c1
AP
1368 mflo ($t_1,$a_2,$b_5)
1369 mfhi ($t_2,$a_2,$b_5)
da4d239d
AP
1370 $ADDU $c_2,$t_1
1371 sltu $at,$c_2,$t_1
947716c1 1372 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
da4d239d
AP
1373 $ADDU $t_2,$at
1374 $ADDU $c_3,$t_2
1375 sltu $at,$c_3,$t_2
1376 $ADDU $c_1,$at
947716c1
AP
1377 mflo ($t_1,$a_3,$b_4)
1378 mfhi ($t_2,$a_3,$b_4)
da4d239d
AP
1379 $ADDU $c_2,$t_1
1380 sltu $at,$c_2,$t_1
947716c1 1381 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
da4d239d
AP
1382 $ADDU $t_2,$at
1383 $ADDU $c_3,$t_2
1384 sltu $at,$c_3,$t_2
1385 $ADDU $c_1,$at
947716c1
AP
1386 mflo ($t_1,$a_4,$b_3)
1387 mfhi ($t_2,$a_4,$b_3)
da4d239d
AP
1388 $ADDU $c_2,$t_1
1389 sltu $at,$c_2,$t_1
947716c1 1390 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
da4d239d
AP
1391 $ADDU $t_2,$at
1392 $ADDU $c_3,$t_2
1393 sltu $at,$c_3,$t_2
1394 $ADDU $c_1,$at
947716c1
AP
1395 mflo ($t_1,$a_5,$b_2)
1396 mfhi ($t_2,$a_5,$b_2)
da4d239d
AP
1397 $ADDU $c_2,$t_1
1398 sltu $at,$c_2,$t_1
947716c1 1399 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
da4d239d
AP
1400 $ADDU $t_2,$at
1401 $ADDU $c_3,$t_2
1402 sltu $at,$c_3,$t_2
1403 $ADDU $c_1,$at
947716c1
AP
1404 mflo ($t_1,$a_6,$b_1)
1405 mfhi ($t_2,$a_6,$b_1)
da4d239d
AP
1406 $ADDU $c_2,$t_1
1407 sltu $at,$c_2,$t_1
947716c1 1408 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
da4d239d
AP
1409 $ADDU $t_2,$at
1410 $ADDU $c_3,$t_2
1411 sltu $at,$c_3,$t_2
1412 $ADDU $c_1,$at
947716c1
AP
1413 mflo ($t_1,$a_7,$b_0)
1414 mfhi ($t_2,$a_7,$b_0)
da4d239d
AP
1415 $ADDU $c_2,$t_1
1416 sltu $at,$c_2,$t_1
947716c1 1417 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
da4d239d
AP
1418 $ADDU $t_2,$at
1419 $ADDU $c_3,$t_2
1420 sltu $at,$c_3,$t_2
1421 $ADDU $c_1,$at
1422 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1423
947716c1
AP
1424 mflo ($t_1,$a_7,$b_1)
1425 mfhi ($t_2,$a_7,$b_1)
da4d239d
AP
1426 $ADDU $c_3,$t_1
1427 sltu $at,$c_3,$t_1
947716c1 1428 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
da4d239d
AP
1429 $ADDU $t_2,$at
1430 $ADDU $c_1,$t_2
1431 sltu $c_2,$c_1,$t_2
947716c1
AP
1432 mflo ($t_1,$a_6,$b_2)
1433 mfhi ($t_2,$a_6,$b_2)
da4d239d
AP
1434 $ADDU $c_3,$t_1
1435 sltu $at,$c_3,$t_1
947716c1 1436 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
da4d239d
AP
1437 $ADDU $t_2,$at
1438 $ADDU $c_1,$t_2
1439 sltu $at,$c_1,$t_2
1440 $ADDU $c_2,$at
947716c1
AP
1441 mflo ($t_1,$a_5,$b_3)
1442 mfhi ($t_2,$a_5,$b_3)
da4d239d
AP
1443 $ADDU $c_3,$t_1
1444 sltu $at,$c_3,$t_1
947716c1 1445 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
da4d239d
AP
1446 $ADDU $t_2,$at
1447 $ADDU $c_1,$t_2
1448 sltu $at,$c_1,$t_2
1449 $ADDU $c_2,$at
947716c1
AP
1450 mflo ($t_1,$a_4,$b_4)
1451 mfhi ($t_2,$a_4,$b_4)
da4d239d
AP
1452 $ADDU $c_3,$t_1
1453 sltu $at,$c_3,$t_1
947716c1 1454 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
da4d239d
AP
1455 $ADDU $t_2,$at
1456 $ADDU $c_1,$t_2
1457 sltu $at,$c_1,$t_2
1458 $ADDU $c_2,$at
947716c1
AP
1459 mflo ($t_1,$a_3,$b_5)
1460 mfhi ($t_2,$a_3,$b_5)
da4d239d
AP
1461 $ADDU $c_3,$t_1
1462 sltu $at,$c_3,$t_1
947716c1 1463 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
da4d239d
AP
1464 $ADDU $t_2,$at
1465 $ADDU $c_1,$t_2
1466 sltu $at,$c_1,$t_2
1467 $ADDU $c_2,$at
947716c1
AP
1468 mflo ($t_1,$a_2,$b_6)
1469 mfhi ($t_2,$a_2,$b_6)
da4d239d
AP
1470 $ADDU $c_3,$t_1
1471 sltu $at,$c_3,$t_1
947716c1 1472 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
da4d239d
AP
1473 $ADDU $t_2,$at
1474 $ADDU $c_1,$t_2
1475 sltu $at,$c_1,$t_2
1476 $ADDU $c_2,$at
947716c1
AP
1477 mflo ($t_1,$a_1,$b_7)
1478 mfhi ($t_2,$a_1,$b_7)
da4d239d
AP
1479 $ADDU $c_3,$t_1
1480 sltu $at,$c_3,$t_1
947716c1 1481 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
da4d239d
AP
1482 $ADDU $t_2,$at
1483 $ADDU $c_1,$t_2
1484 sltu $at,$c_1,$t_2
1485 $ADDU $c_2,$at
1486 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1487
947716c1
AP
1488 mflo ($t_1,$a_2,$b_7)
1489 mfhi ($t_2,$a_2,$b_7)
da4d239d
AP
1490 $ADDU $c_1,$t_1
1491 sltu $at,$c_1,$t_1
947716c1 1492 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
da4d239d
AP
1493 $ADDU $t_2,$at
1494 $ADDU $c_2,$t_2
1495 sltu $c_3,$c_2,$t_2
947716c1
AP
1496 mflo ($t_1,$a_3,$b_6)
1497 mfhi ($t_2,$a_3,$b_6)
da4d239d
AP
1498 $ADDU $c_1,$t_1
1499 sltu $at,$c_1,$t_1
947716c1 1500 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
da4d239d
AP
1501 $ADDU $t_2,$at
1502 $ADDU $c_2,$t_2
1503 sltu $at,$c_2,$t_2
1504 $ADDU $c_3,$at
947716c1
AP
1505 mflo ($t_1,$a_4,$b_5)
1506 mfhi ($t_2,$a_4,$b_5)
da4d239d
AP
1507 $ADDU $c_1,$t_1
1508 sltu $at,$c_1,$t_1
947716c1 1509 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
da4d239d
AP
1510 $ADDU $t_2,$at
1511 $ADDU $c_2,$t_2
1512 sltu $at,$c_2,$t_2
1513 $ADDU $c_3,$at
947716c1
AP
1514 mflo ($t_1,$a_5,$b_4)
1515 mfhi ($t_2,$a_5,$b_4)
da4d239d
AP
1516 $ADDU $c_1,$t_1
1517 sltu $at,$c_1,$t_1
947716c1 1518 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
da4d239d
AP
1519 $ADDU $t_2,$at
1520 $ADDU $c_2,$t_2
1521 sltu $at,$c_2,$t_2
1522 $ADDU $c_3,$at
947716c1
AP
1523 mflo ($t_1,$a_6,$b_3)
1524 mfhi ($t_2,$a_6,$b_3)
da4d239d
AP
1525 $ADDU $c_1,$t_1
1526 sltu $at,$c_1,$t_1
947716c1 1527 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
da4d239d
AP
1528 $ADDU $t_2,$at
1529 $ADDU $c_2,$t_2
1530 sltu $at,$c_2,$t_2
1531 $ADDU $c_3,$at
947716c1
AP
1532 mflo ($t_1,$a_7,$b_2)
1533 mfhi ($t_2,$a_7,$b_2)
da4d239d
AP
1534 $ADDU $c_1,$t_1
1535 sltu $at,$c_1,$t_1
947716c1 1536 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
da4d239d
AP
1537 $ADDU $t_2,$at
1538 $ADDU $c_2,$t_2
1539 sltu $at,$c_2,$t_2
1540 $ADDU $c_3,$at
1541 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1542
947716c1
AP
1543 mflo ($t_1,$a_7,$b_3)
1544 mfhi ($t_2,$a_7,$b_3)
da4d239d
AP
1545 $ADDU $c_2,$t_1
1546 sltu $at,$c_2,$t_1
947716c1 1547 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
da4d239d
AP
1548 $ADDU $t_2,$at
1549 $ADDU $c_3,$t_2
1550 sltu $c_1,$c_3,$t_2
947716c1
AP
1551 mflo ($t_1,$a_6,$b_4)
1552 mfhi ($t_2,$a_6,$b_4)
da4d239d
AP
1553 $ADDU $c_2,$t_1
1554 sltu $at,$c_2,$t_1
947716c1 1555 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
da4d239d
AP
1556 $ADDU $t_2,$at
1557 $ADDU $c_3,$t_2
1558 sltu $at,$c_3,$t_2
1559 $ADDU $c_1,$at
947716c1
AP
1560 mflo ($t_1,$a_5,$b_5)
1561 mfhi ($t_2,$a_5,$b_5)
da4d239d
AP
1562 $ADDU $c_2,$t_1
1563 sltu $at,$c_2,$t_1
947716c1 1564 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
da4d239d
AP
1565 $ADDU $t_2,$at
1566 $ADDU $c_3,$t_2
1567 sltu $at,$c_3,$t_2
1568 $ADDU $c_1,$at
947716c1
AP
1569 mflo ($t_1,$a_4,$b_6)
1570 mfhi ($t_2,$a_4,$b_6)
da4d239d
AP
1571 $ADDU $c_2,$t_1
1572 sltu $at,$c_2,$t_1
947716c1 1573 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
da4d239d
AP
1574 $ADDU $t_2,$at
1575 $ADDU $c_3,$t_2
1576 sltu $at,$c_3,$t_2
1577 $ADDU $c_1,$at
947716c1
AP
1578 mflo ($t_1,$a_3,$b_7)
1579 mfhi ($t_2,$a_3,$b_7)
da4d239d
AP
1580 $ADDU $c_2,$t_1
1581 sltu $at,$c_2,$t_1
947716c1 1582 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
da4d239d
AP
1583 $ADDU $t_2,$at
1584 $ADDU $c_3,$t_2
1585 sltu $at,$c_3,$t_2
1586 $ADDU $c_1,$at
1587 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1588
947716c1
AP
1589 mflo ($t_1,$a_4,$b_7)
1590 mfhi ($t_2,$a_4,$b_7)
da4d239d
AP
1591 $ADDU $c_3,$t_1
1592 sltu $at,$c_3,$t_1
947716c1 1593 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
da4d239d
AP
1594 $ADDU $t_2,$at
1595 $ADDU $c_1,$t_2
1596 sltu $c_2,$c_1,$t_2
947716c1
AP
1597 mflo ($t_1,$a_5,$b_6)
1598 mfhi ($t_2,$a_5,$b_6)
da4d239d
AP
1599 $ADDU $c_3,$t_1
1600 sltu $at,$c_3,$t_1
947716c1 1601 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
da4d239d
AP
1602 $ADDU $t_2,$at
1603 $ADDU $c_1,$t_2
1604 sltu $at,$c_1,$t_2
1605 $ADDU $c_2,$at
947716c1
AP
1606 mflo ($t_1,$a_6,$b_5)
1607 mfhi ($t_2,$a_6,$b_5)
da4d239d
AP
1608 $ADDU $c_3,$t_1
1609 sltu $at,$c_3,$t_1
947716c1 1610 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
da4d239d
AP
1611 $ADDU $t_2,$at
1612 $ADDU $c_1,$t_2
1613 sltu $at,$c_1,$t_2
1614 $ADDU $c_2,$at
947716c1
AP
1615 mflo ($t_1,$a_7,$b_4)
1616 mfhi ($t_2,$a_7,$b_4)
da4d239d
AP
1617 $ADDU $c_3,$t_1
1618 sltu $at,$c_3,$t_1
947716c1 1619 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
da4d239d
AP
1620 $ADDU $t_2,$at
1621 $ADDU $c_1,$t_2
1622 sltu $at,$c_1,$t_2
1623 $ADDU $c_2,$at
1624 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1625
947716c1
AP
1626 mflo ($t_1,$a_7,$b_5)
1627 mfhi ($t_2,$a_7,$b_5)
da4d239d
AP
1628 $ADDU $c_1,$t_1
1629 sltu $at,$c_1,$t_1
947716c1 1630 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
da4d239d
AP
1631 $ADDU $t_2,$at
1632 $ADDU $c_2,$t_2
1633 sltu $c_3,$c_2,$t_2
947716c1
AP
1634 mflo ($t_1,$a_6,$b_6)
1635 mfhi ($t_2,$a_6,$b_6)
da4d239d
AP
1636 $ADDU $c_1,$t_1
1637 sltu $at,$c_1,$t_1
947716c1 1638 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
da4d239d
AP
1639 $ADDU $t_2,$at
1640 $ADDU $c_2,$t_2
1641 sltu $at,$c_2,$t_2
1642 $ADDU $c_3,$at
947716c1
AP
1643 mflo ($t_1,$a_5,$b_7)
1644 mfhi ($t_2,$a_5,$b_7)
da4d239d
AP
1645 $ADDU $c_1,$t_1
1646 sltu $at,$c_1,$t_1
947716c1 1647 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
da4d239d
AP
1648 $ADDU $t_2,$at
1649 $ADDU $c_2,$t_2
1650 sltu $at,$c_2,$t_2
1651 $ADDU $c_3,$at
1652 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1653
947716c1
AP
1654 mflo ($t_1,$a_6,$b_7)
1655 mfhi ($t_2,$a_6,$b_7)
da4d239d
AP
1656 $ADDU $c_2,$t_1
1657 sltu $at,$c_2,$t_1
947716c1 1658 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
da4d239d
AP
1659 $ADDU $t_2,$at
1660 $ADDU $c_3,$t_2
1661 sltu $c_1,$c_3,$t_2
947716c1
AP
1662 mflo ($t_1,$a_7,$b_6)
1663 mfhi ($t_2,$a_7,$b_6)
da4d239d
AP
1664 $ADDU $c_2,$t_1
1665 sltu $at,$c_2,$t_1
947716c1 1666 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
da4d239d
AP
1667 $ADDU $t_2,$at
1668 $ADDU $c_3,$t_2
1669 sltu $at,$c_3,$t_2
1670 $ADDU $c_1,$at
1671 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1672
947716c1
AP
1673 mflo ($t_1,$a_7,$b_7)
1674 mfhi ($t_2,$a_7,$b_7)
da4d239d
AP
1675 $ADDU $c_3,$t_1
1676 sltu $at,$c_3,$t_1
1677 $ADDU $t_2,$at
1678 $ADDU $c_1,$t_2
1679 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1680 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1681
1682 .set noreorder
1683___
1684$code.=<<___ if ($flavour =~ /nubi/i);
1685 $REG_L $s5,10*$SZREG($sp)
1686 $REG_L $s4,9*$SZREG($sp)
1687 $REG_L $s3,8*$SZREG($sp)
1688 $REG_L $s2,7*$SZREG($sp)
1689 $REG_L $s1,6*$SZREG($sp)
1690 $REG_L $s0,5*$SZREG($sp)
1691 $REG_L $t3,4*$SZREG($sp)
1692 $REG_L $t2,3*$SZREG($sp)
1693 $REG_L $t1,2*$SZREG($sp)
1694 $REG_L $t0,1*$SZREG($sp)
1695 $REG_L $gp,0*$SZREG($sp)
1696 jr $ra
1697 $PTR_ADD $sp,12*$SZREG
1698___
1699$code.=<<___ if ($flavour !~ /nubi/i);
1700 $REG_L $s5,5*$SZREG($sp)
1701 $REG_L $s4,4*$SZREG($sp)
1702 $REG_L $s3,3*$SZREG($sp)
1703 $REG_L $s2,2*$SZREG($sp)
1704 $REG_L $s1,1*$SZREG($sp)
1705 $REG_L $s0,0*$SZREG($sp)
1706 jr $ra
1707 $PTR_ADD $sp,6*$SZREG
1708___
1709$code.=<<___;
1710.end bn_mul_comba8
1711
1712.align 5
1713.globl bn_mul_comba4
1714.ent bn_mul_comba4
1715bn_mul_comba4:
1716___
1717$code.=<<___ if ($flavour =~ /nubi/i);
1718 .frame $sp,6*$SZREG,$ra
1719 .mask 0x8000f008,-$SZREG
1720 .set noreorder
1721 $PTR_SUB $sp,6*$SZREG
1722 $REG_S $ra,5*$SZREG($sp)
1723 $REG_S $t3,4*$SZREG($sp)
1724 $REG_S $t2,3*$SZREG($sp)
1725 $REG_S $t1,2*$SZREG($sp)
1726 $REG_S $t0,1*$SZREG($sp)
1727 $REG_S $gp,0*$SZREG($sp)
1728___
1729$code.=<<___;
1730 .set reorder
1731 $LD $a_0,0($a1)
1732 $LD $b_0,0($a2)
1733 $LD $a_1,$BNSZ($a1)
1734 $LD $a_2,2*$BNSZ($a1)
947716c1 1735 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1736 $LD $a_3,3*$BNSZ($a1)
1737 $LD $b_1,$BNSZ($a2)
1738 $LD $b_2,2*$BNSZ($a2)
1739 $LD $b_3,3*$BNSZ($a2)
947716c1
AP
1740 mflo ($c_1,$a_0,$b_0)
1741 mfhi ($c_2,$a_0,$b_0)
da4d239d
AP
1742 $ST $c_1,0($a0)
1743
947716c1
AP
1744 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1745 mflo ($t_1,$a_0,$b_1)
1746 mfhi ($t_2,$a_0,$b_1)
da4d239d
AP
1747 $ADDU $c_2,$t_1
1748 sltu $at,$c_2,$t_1
947716c1 1749 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
da4d239d 1750 $ADDU $c_3,$t_2,$at
947716c1
AP
1751 mflo ($t_1,$a_1,$b_0)
1752 mfhi ($t_2,$a_1,$b_0)
da4d239d
AP
1753 $ADDU $c_2,$t_1
1754 sltu $at,$c_2,$t_1
947716c1 1755 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
da4d239d
AP
1756 $ADDU $t_2,$at
1757 $ADDU $c_3,$t_2
1758 sltu $c_1,$c_3,$t_2
1759 $ST $c_2,$BNSZ($a0)
1760
947716c1
AP
1761 mflo ($t_1,$a_2,$b_0)
1762 mfhi ($t_2,$a_2,$b_0)
da4d239d
AP
1763 $ADDU $c_3,$t_1
1764 sltu $at,$c_3,$t_1
947716c1 1765 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
da4d239d
AP
1766 $ADDU $t_2,$at
1767 $ADDU $c_1,$t_2
947716c1
AP
1768 mflo ($t_1,$a_1,$b_1)
1769 mfhi ($t_2,$a_1,$b_1)
da4d239d
AP
1770 $ADDU $c_3,$t_1
1771 sltu $at,$c_3,$t_1
947716c1 1772 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
da4d239d
AP
1773 $ADDU $t_2,$at
1774 $ADDU $c_1,$t_2
1775 sltu $c_2,$c_1,$t_2
947716c1
AP
1776 mflo ($t_1,$a_0,$b_2)
1777 mfhi ($t_2,$a_0,$b_2)
da4d239d
AP
1778 $ADDU $c_3,$t_1
1779 sltu $at,$c_3,$t_1
947716c1 1780 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
da4d239d
AP
1781 $ADDU $t_2,$at
1782 $ADDU $c_1,$t_2
1783 sltu $at,$c_1,$t_2
1784 $ADDU $c_2,$at
1785 $ST $c_3,2*$BNSZ($a0)
1786
947716c1
AP
1787 mflo ($t_1,$a_0,$b_3)
1788 mfhi ($t_2,$a_0,$b_3)
da4d239d
AP
1789 $ADDU $c_1,$t_1
1790 sltu $at,$c_1,$t_1
947716c1 1791 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
da4d239d
AP
1792 $ADDU $t_2,$at
1793 $ADDU $c_2,$t_2
1794 sltu $c_3,$c_2,$t_2
947716c1
AP
1795 mflo ($t_1,$a_1,$b_2)
1796 mfhi ($t_2,$a_1,$b_2)
da4d239d
AP
1797 $ADDU $c_1,$t_1
1798 sltu $at,$c_1,$t_1
947716c1 1799 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
da4d239d
AP
1800 $ADDU $t_2,$at
1801 $ADDU $c_2,$t_2
1802 sltu $at,$c_2,$t_2
1803 $ADDU $c_3,$at
947716c1
AP
1804 mflo ($t_1,$a_2,$b_1)
1805 mfhi ($t_2,$a_2,$b_1)
da4d239d
AP
1806 $ADDU $c_1,$t_1
1807 sltu $at,$c_1,$t_1
947716c1 1808 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
da4d239d
AP
1809 $ADDU $t_2,$at
1810 $ADDU $c_2,$t_2
1811 sltu $at,$c_2,$t_2
1812 $ADDU $c_3,$at
947716c1
AP
1813 mflo ($t_1,$a_3,$b_0)
1814 mfhi ($t_2,$a_3,$b_0)
da4d239d
AP
1815 $ADDU $c_1,$t_1
1816 sltu $at,$c_1,$t_1
947716c1 1817 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
da4d239d
AP
1818 $ADDU $t_2,$at
1819 $ADDU $c_2,$t_2
1820 sltu $at,$c_2,$t_2
1821 $ADDU $c_3,$at
1822 $ST $c_1,3*$BNSZ($a0)
1823
947716c1
AP
1824 mflo ($t_1,$a_3,$b_1)
1825 mfhi ($t_2,$a_3,$b_1)
da4d239d
AP
1826 $ADDU $c_2,$t_1
1827 sltu $at,$c_2,$t_1
947716c1 1828 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
da4d239d
AP
1829 $ADDU $t_2,$at
1830 $ADDU $c_3,$t_2
1831 sltu $c_1,$c_3,$t_2
947716c1
AP
1832 mflo ($t_1,$a_2,$b_2)
1833 mfhi ($t_2,$a_2,$b_2)
da4d239d
AP
1834 $ADDU $c_2,$t_1
1835 sltu $at,$c_2,$t_1
947716c1 1836 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
da4d239d
AP
1837 $ADDU $t_2,$at
1838 $ADDU $c_3,$t_2
1839 sltu $at,$c_3,$t_2
1840 $ADDU $c_1,$at
947716c1
AP
1841 mflo ($t_1,$a_1,$b_3)
1842 mfhi ($t_2,$a_1,$b_3)
da4d239d
AP
1843 $ADDU $c_2,$t_1
1844 sltu $at,$c_2,$t_1
947716c1 1845 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
da4d239d
AP
1846 $ADDU $t_2,$at
1847 $ADDU $c_3,$t_2
1848 sltu $at,$c_3,$t_2
1849 $ADDU $c_1,$at
1850 $ST $c_2,4*$BNSZ($a0)
1851
947716c1
AP
1852 mflo ($t_1,$a_2,$b_3)
1853 mfhi ($t_2,$a_2,$b_3)
da4d239d
AP
1854 $ADDU $c_3,$t_1
1855 sltu $at,$c_3,$t_1
947716c1 1856 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
da4d239d
AP
1857 $ADDU $t_2,$at
1858 $ADDU $c_1,$t_2
1859 sltu $c_2,$c_1,$t_2
947716c1
AP
1860 mflo ($t_1,$a_3,$b_2)
1861 mfhi ($t_2,$a_3,$b_2)
da4d239d
AP
1862 $ADDU $c_3,$t_1
1863 sltu $at,$c_3,$t_1
947716c1 1864 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
da4d239d
AP
1865 $ADDU $t_2,$at
1866 $ADDU $c_1,$t_2
1867 sltu $at,$c_1,$t_2
1868 $ADDU $c_2,$at
1869 $ST $c_3,5*$BNSZ($a0)
1870
947716c1
AP
1871 mflo ($t_1,$a_3,$b_3)
1872 mfhi ($t_2,$a_3,$b_3)
da4d239d
AP
1873 $ADDU $c_1,$t_1
1874 sltu $at,$c_1,$t_1
1875 $ADDU $t_2,$at
1876 $ADDU $c_2,$t_2
1877 $ST $c_1,6*$BNSZ($a0)
1878 $ST $c_2,7*$BNSZ($a0)
1879
1880 .set noreorder
1881___
1882$code.=<<___ if ($flavour =~ /nubi/i);
1883 $REG_L $t3,4*$SZREG($sp)
1884 $REG_L $t2,3*$SZREG($sp)
1885 $REG_L $t1,2*$SZREG($sp)
1886 $REG_L $t0,1*$SZREG($sp)
1887 $REG_L $gp,0*$SZREG($sp)
1888 $PTR_ADD $sp,6*$SZREG
1889___
1890$code.=<<___;
1891 jr $ra
1892 nop
1893.end bn_mul_comba4
1894___

($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,  # !$warm denotes the first call with a specific sequence of
            # $c_[XYZ], when there is no Z-carry to accumulate yet;
    $an,$bn # these two are the arguments for a multiplication whose
            # result is used in the *next* step [which is why it is
            # commented as "forward multiplication" below];
   )=@_;
$code.=<<___;
 $ADDU $c0,$lo
 sltu $at,$c0,$lo
 $MULTU ($an,$bn) # forward multiplication
 $ADDU $c0,$lo
 $ADDU $at,$hi
 sltu $lo,$c0,$lo
 $ADDU $c1,$at
 $ADDU $hi,$lo
___
$code.=<<___ if (!$warm);
 sltu $c2,$c1,$at
 $ADDU $c1,$hi
___
$code.=<<___ if ($warm);
 sltu $at,$c1,$at
 $ADDU $c1,$hi
 $ADDU $c2,$at
___
$code.=<<___;
 sltu $hi,$c1,$hi
 $ADDU $c2,$hi
 mflo ($lo,$an,$bn)
 mfhi ($hi,$an,$bn)
___
}
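
# A typical call, taken verbatim from bn_sqr_comba8 below, folds the carry
# propagation for the previous product into the "forward" multiplication
# for the next one:
#
#	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
#		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
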
1933$code.=<<___;
1934
1935.align 5
1936.globl bn_sqr_comba8
1937.ent bn_sqr_comba8
1938bn_sqr_comba8:
1939___
1940$code.=<<___ if ($flavour =~ /nubi/i);
1941 .frame $sp,6*$SZREG,$ra
1942 .mask 0x8000f008,-$SZREG
1943 .set noreorder
1944 $PTR_SUB $sp,6*$SZREG
1945 $REG_S $ra,5*$SZREG($sp)
1946 $REG_S $t3,4*$SZREG($sp)
1947 $REG_S $t2,3*$SZREG($sp)
1948 $REG_S $t1,2*$SZREG($sp)
1949 $REG_S $t0,1*$SZREG($sp)
1950 $REG_S $gp,0*$SZREG($sp)
1951___
1952$code.=<<___;
1953 .set reorder
1954 $LD $a_0,0($a1)
1955 $LD $a_1,$BNSZ($a1)
1956 $LD $a_2,2*$BNSZ($a1)
1957 $LD $a_3,3*$BNSZ($a1)
1958
947716c1 1959 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
1960 $LD $a_4,4*$BNSZ($a1)
1961 $LD $a_5,5*$BNSZ($a1)
1962 $LD $a_6,6*$BNSZ($a1)
1963 $LD $a_7,7*$BNSZ($a1)
947716c1
AP
1964 mflo ($c_1,$a_0,$a_0)
1965 mfhi ($c_2,$a_0,$a_0)
da4d239d
AP
1966 $ST $c_1,0($a0)
1967
947716c1
AP
1968 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1969 mflo ($t_1,$a_0,$a_1)
1970 mfhi ($t_2,$a_0,$a_1)
da4d239d
AP
1971 slt $c_1,$t_2,$zero
1972 $SLL $t_2,1
947716c1 1973 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
da4d239d
AP
1974 slt $a2,$t_1,$zero
1975 $ADDU $t_2,$a2
1976 $SLL $t_1,1
1977 $ADDU $c_2,$t_1
1978 sltu $at,$c_2,$t_1
1979 $ADDU $c_3,$t_2,$at
1980 $ST $c_2,$BNSZ($a0)
947716c1
AP
1981 mflo ($t_1,$a_2,$a_0)
1982 mfhi ($t_2,$a_2,$a_0)
a7a44ba5
AP
1983___
1984 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1985 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1986$code.=<<___;
da4d239d
AP
1987 $ADDU $c_3,$t_1
1988 sltu $at,$c_3,$t_1
947716c1 1989 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
da4d239d
AP
1990 $ADDU $t_2,$at
1991 $ADDU $c_1,$t_2
1992 sltu $at,$c_1,$t_2
1993 $ADDU $c_2,$at
1994 $ST $c_3,2*$BNSZ($a0)
947716c1
AP
1995 mflo ($t_1,$a_0,$a_3)
1996 mfhi ($t_2,$a_0,$a_3)
a7a44ba5
AP
1997___
1998 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1999 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2000 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2001 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2002$code.=<<___;
da4d239d 2003 $ST $c_1,3*$BNSZ($a0)
a7a44ba5
AP
2004___
2005 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2006 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2007 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2008 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2009$code.=<<___;
da4d239d
AP
2010 $ADDU $c_2,$t_1
2011 sltu $at,$c_2,$t_1
947716c1 2012 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
da4d239d
AP
2013 $ADDU $t_2,$at
2014 $ADDU $c_3,$t_2
2015 sltu $at,$c_3,$t_2
2016 $ADDU $c_1,$at
2017 $ST $c_2,4*$BNSZ($a0)
947716c1
AP
2018 mflo ($t_1,$a_0,$a_5)
2019 mfhi ($t_2,$a_0,$a_5)
a7a44ba5
AP
2020___
2021 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2022 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2023 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2024 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2025 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2026 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2027$code.=<<___;
da4d239d 2028 $ST $c_3,5*$BNSZ($a0)
a7a44ba5
AP
2029___
2030 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2031 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2032 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2033 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2034 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2035 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2036$code.=<<___;
da4d239d
AP
2037 $ADDU $c_1,$t_1
2038 sltu $at,$c_1,$t_1
947716c1 2039 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
da4d239d
AP
2040 $ADDU $t_2,$at
2041 $ADDU $c_2,$t_2
2042 sltu $at,$c_2,$t_2
2043 $ADDU $c_3,$at
2044 $ST $c_1,6*$BNSZ($a0)
947716c1
AP
2045 mflo ($t_1,$a_0,$a_7)
2046 mfhi ($t_2,$a_0,$a_7)
a7a44ba5
AP
2047___
2048 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2049 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2050 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2051 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2052 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2053 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2054 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2055 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2056$code.=<<___;
da4d239d 2057 $ST $c_2,7*$BNSZ($a0)
a7a44ba5
AP
2058___
2059 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2060 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2061 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2062 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2063 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2064 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2065$code.=<<___;
da4d239d
AP
2066 $ADDU $c_3,$t_1
2067 sltu $at,$c_3,$t_1
947716c1 2068 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
da4d239d
AP
2069 $ADDU $t_2,$at
2070 $ADDU $c_1,$t_2
2071 sltu $at,$c_1,$t_2
2072 $ADDU $c_2,$at
2073 $ST $c_3,8*$BNSZ($a0)
947716c1
AP
2074 mflo ($t_1,$a_2,$a_7)
2075 mfhi ($t_2,$a_2,$a_7)
a7a44ba5
AP
2076___
2077 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2078 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2079 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2080 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2081 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2082 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2083$code.=<<___;
da4d239d 2084 $ST $c_1,9*$BNSZ($a0)
a7a44ba5
AP
2085___
2086 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2087 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2088 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2089 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2090$code.=<<___;
da4d239d
AP
2091 $ADDU $c_2,$t_1
2092 sltu $at,$c_2,$t_1
947716c1 2093 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
da4d239d
AP
2094 $ADDU $t_2,$at
2095 $ADDU $c_3,$t_2
2096 sltu $at,$c_3,$t_2
2097 $ADDU $c_1,$at
2098 $ST $c_2,10*$BNSZ($a0)
947716c1
AP
2099 mflo ($t_1,$a_4,$a_7)
2100 mfhi ($t_2,$a_4,$a_7)
a7a44ba5
AP
2101___
2102 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2103 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2104 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2105 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2106$code.=<<___;
da4d239d 2107 $ST $c_3,11*$BNSZ($a0)
a7a44ba5
AP
2108___
2109 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2110 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2111$code.=<<___;
da4d239d
AP
2112 $ADDU $c_1,$t_1
2113 sltu $at,$c_1,$t_1
947716c1 2114 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
da4d239d
AP
2115 $ADDU $t_2,$at
2116 $ADDU $c_2,$t_2
2117 sltu $at,$c_2,$t_2
2118 $ADDU $c_3,$at
2119 $ST $c_1,12*$BNSZ($a0)
947716c1
AP
2120 mflo ($t_1,$a_6,$a_7)
2121 mfhi ($t_2,$a_6,$a_7)
a7a44ba5
AP
2122___
2123 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2124 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2125$code.=<<___;
da4d239d
AP
2126 $ST $c_2,13*$BNSZ($a0)
2127
da4d239d
AP
2128 $ADDU $c_3,$t_1
2129 sltu $at,$c_3,$t_1
2130 $ADDU $t_2,$at
2131 $ADDU $c_1,$t_2
2132 $ST $c_3,14*$BNSZ($a0)
2133 $ST $c_1,15*$BNSZ($a0)
2134
2135 .set noreorder
2136___
2137$code.=<<___ if ($flavour =~ /nubi/i);
2138 $REG_L $t3,4*$SZREG($sp)
2139 $REG_L $t2,3*$SZREG($sp)
2140 $REG_L $t1,2*$SZREG($sp)
2141 $REG_L $t0,1*$SZREG($sp)
2142 $REG_L $gp,0*$SZREG($sp)
2143 $PTR_ADD $sp,6*$SZREG
2144___
2145$code.=<<___;
2146 jr $ra
2147 nop
2148.end bn_sqr_comba8
2149
2150.align 5
2151.globl bn_sqr_comba4
2152.ent bn_sqr_comba4
2153bn_sqr_comba4:
2154___
2155$code.=<<___ if ($flavour =~ /nubi/i);
2156 .frame $sp,6*$SZREG,$ra
2157 .mask 0x8000f008,-$SZREG
2158 .set noreorder
2159 $PTR_SUB $sp,6*$SZREG
2160 $REG_S $ra,5*$SZREG($sp)
2161 $REG_S $t3,4*$SZREG($sp)
2162 $REG_S $t2,3*$SZREG($sp)
2163 $REG_S $t1,2*$SZREG($sp)
2164 $REG_S $t0,1*$SZREG($sp)
2165 $REG_S $gp,0*$SZREG($sp)
2166___
2167$code.=<<___;
2168 .set reorder
2169 $LD $a_0,0($a1)
2170 $LD $a_1,$BNSZ($a1)
947716c1 2171 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
da4d239d
AP
2172 $LD $a_2,2*$BNSZ($a1)
2173 $LD $a_3,3*$BNSZ($a1)
947716c1
AP
2174 mflo ($c_1,$a_0,$a_0)
2175 mfhi ($c_2,$a_0,$a_0)
da4d239d
AP
2176 $ST $c_1,0($a0)
2177
947716c1
AP
2178 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2179 mflo ($t_1,$a_0,$a_1)
2180 mfhi ($t_2,$a_0,$a_1)
da4d239d
AP
2181 slt $c_1,$t_2,$zero
2182 $SLL $t_2,1
947716c1 2183 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
da4d239d
AP
2184 slt $a2,$t_1,$zero
2185 $ADDU $t_2,$a2
2186 $SLL $t_1,1
2187 $ADDU $c_2,$t_1
2188 sltu $at,$c_2,$t_1
2189 $ADDU $c_3,$t_2,$at
2190 $ST $c_2,$BNSZ($a0)
947716c1
AP
2191 mflo ($t_1,$a_2,$a_0)
2192 mfhi ($t_2,$a_2,$a_0)
a7a44ba5
AP
2193___
2194 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2195 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2196$code.=<<___;
da4d239d
AP
2197 $ADDU $c_3,$t_1
2198 sltu $at,$c_3,$t_1
947716c1 2199 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
da4d239d
AP
2200 $ADDU $t_2,$at
2201 $ADDU $c_1,$t_2
2202 sltu $at,$c_1,$t_2
2203 $ADDU $c_2,$at
2204 $ST $c_3,2*$BNSZ($a0)
947716c1
AP
2205 mflo ($t_1,$a_0,$a_3)
2206 mfhi ($t_2,$a_0,$a_3)
a7a44ba5
AP
2207___
2208 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2209 $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
2210 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2211 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2212$code.=<<___;
da4d239d 2213 $ST $c_1,3*$BNSZ($a0)
a7a44ba5
AP
2214___
2215 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2216 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2217$code.=<<___;
da4d239d
AP
2218 $ADDU $c_2,$t_1
2219 sltu $at,$c_2,$t_1
947716c1 2220 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
da4d239d
AP
2221 $ADDU $t_2,$at
2222 $ADDU $c_3,$t_2
2223 sltu $at,$c_3,$t_2
2224 $ADDU $c_1,$at
2225 $ST $c_2,4*$BNSZ($a0)
947716c1
AP
2226 mflo ($t_1,$a_2,$a_3)
2227 mfhi ($t_2,$a_2,$a_3)
a7a44ba5
AP
2228___
2229 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2230 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2231$code.=<<___;
da4d239d
AP
2232 $ST $c_3,5*$BNSZ($a0)
2233
da4d239d
AP
2234 $ADDU $c_1,$t_1
2235 sltu $at,$c_1,$t_1
2236 $ADDU $t_2,$at
2237 $ADDU $c_2,$t_2
2238 $ST $c_1,6*$BNSZ($a0)
2239 $ST $c_2,7*$BNSZ($a0)
2240
2241 .set noreorder
2242___
2243$code.=<<___ if ($flavour =~ /nubi/i);
2244 $REG_L $t3,4*$SZREG($sp)
2245 $REG_L $t2,3*$SZREG($sp)
2246 $REG_L $t1,2*$SZREG($sp)
2247 $REG_L $t0,1*$SZREG($sp)
2248 $REG_L $gp,0*$SZREG($sp)
2249 $PTR_ADD $sp,6*$SZREG
2250___
$code.=<<___;
 jr $ra
 nop
.end bn_sqr_comba4
___
print $code;
close STDOUT;