#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaking this effort is basically the following. Even
# though the Power 6 CPU operates at an incredible 4.7GHz clock frequency,
# its PKI performance was observed to be less than impressive, essentially
# as fast as a 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
# Well, it's not surprising that IBM had to make some sacrifices to
# boost the clock frequency that much, but no overall improvement?
# Having observed how much difference switching to the FPU made on
# UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
# Unfortunately the resulting performance improvement is not as
# impressive, ~30%, and in absolute terms is still very far from what
# one would expect from a 4.7GHz CPU. There is a chance that I'm doing
# something wrong, but in the absence of assembly-level micro-profiling
# data, or at least a decent platform guide, I can't tell... Better
# results might also be achieved with VMX... Anyway, this module provides
# *worse* performance on other PowerPC implementations: ~15-40% slower
# on PPC970 depending on key length and ~40% slower on Power 5 for all
# key lengths. As it's obviously inappropriate as a "best all-round"
# alternative, it has to be complemented with run-time CPU family
# detection. Oh! It should also be noted that unlike on other PowerPC
# implementations, the IALU ppc-mont.pl module performs *suboptimally*
# on >=1024-bit key lengths on Power 6. It should also be noted that
# *everything* said so far applies to 64-bit builds! As far as 32-bit
# applications executed on 64-bit CPUs go, this module is likely to
# become the preferred choice, because it's easy to adapt for that
# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in a ~15% improvement
# over the original ppc64-mont.pl version, or an overall ~50% improvement
# over the ppc.pl module on Power 6. Compared to ppc-mont.pl on the same
# Power 6 CPU, this module is 5-150% faster depending on key length,
# [hereafter] more for longer keys. But compared to ppc-mont.pl on a
# 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but that's apparently the way Power 6 is...

# December 2009

# Adapted for the 32-bit build, this module delivers a 25-120% performance
# improvement (yes, more than *twice* as fast for longer keys) over 32-bit
# ppc-mont.pl on a 1.8GHz PPC970. However! This implementation still
# utilizes 64-bit integer operations, and the trouble is that most PPC
# operating systems don't preserve the upper halves of general purpose
# registers upon 32-bit signal delivery. They do preserve them upon
# context switch, but not upon signalling:-( This means that asynchronous
# signals have to be blocked upon entry to this subroutine. Signal
# masking (and of course the complementary unmasking) has quite an impact
# on performance, naturally larger for shorter keys. It's so severe
# that 512-bit key performance can be as low as 1/3 of the expected one.
# This is why on such OSes this routine is engaged only for longer key
# operations; see crypto/ppccap.c for further details. MacOS X is an
# exception to this and doesn't require signal masking, and that's where
# the above improvement coefficients were collected. For the others an
# alternative would be to break the dependence on the upper halves of
# GPRs by sticking to 32-bit integer operations...

# December 2012

# Remove the above-mentioned dependence on GPRs' upper halves in the
# 32-bit build. There is no signal masking overhead, but the integer
# instructions are *more* numerous... It's still "universally" faster
# than 32-bit ppc-mont.pl, but the improvement coefficient is not as
# impressive for longer keys...

$flavour = shift;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
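
# Typical invocation (illustrative; the available flavours and the output
# handling are defined by the perlasm framework, not here):
#
#	perl ppc64-mont.pl linux64 ppc64-mont.s
#
# The first argument selects 32- vs 64-bit mnemonics and ABI details, the
# second is passed through to ppc-xlate.pl as the assembly output file.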

$FRAME=64;	# padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#	..A3A2A1A0
#	      dcba
#	-----------
#	       A0a
#	      A0b
#	     A0c
#	    A0d
#	      A1a
#	     A1b
#	    A1c
#	   A1d
#	     A2a
#	    A2b
#	   A2c
#	  A2d
#	    A3a
#	   A3b
#	  A3c
#	 A3d
#	  ..a
#	 ..b
#
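# The split keeps the FP arithmetic exact: a[] and n[] words enter the
# FPU as 32-bit halves (A0-A3, N0-N3 below), while b[i] and the Montgomery
# factor enter as 16-bit limbs (ba-bd, na-nd). A rough sketch of the
# bound: every fmul/fmadd product is then below 2^48, and only a handful
# of such products is accumulated into any T register before it is
# converted back, so all intermediates stay around 2^50, comfortably
# inside the 53-bit mantissa, and fctid recovers the integer result
# without rounding.
#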
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";
\f
# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
#   +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
#   +16*8	+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
#   +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
#   +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
#   -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
#   -12*8	+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+
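#
# The layout above is what the alloca below materializes: room for the
# transfer zone, num+1 tmp words and 4*num nap_d doubles on top of the
# ABI red zone ($RZONE), with the grand total rounded via the -4096 mask
# so that the adjusted stack pointer lands on a 4KB boundary, keeping the
# whole scratch area within as few pages (and TLB entries) as possible.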
\f
$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`	; $num has to be "even"
	bnelr-

	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
	addi	$nap_d,$nap_d,-8
	srwi	$j,$num,`3+1`	; counter register, num/2
	addi	$j,$j,-1
	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
	li	$carry,0
	mtctr	$j
___
\f
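# The first iteration is peeled off: the integer unit computes
# tp[0]=ap[0]*bp[0] and the Montgomery factor m=(tp[0]*n0) mod 2^BN_BITS2,
# then bp[0] and m are shipped to the FPU as 16-bit limbs through the
# gpr<->fpr transfer zone.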
$code.=<<___ if ($SIZE_T==8);
	ld	$a0,0($ap)		; pull ap[0] value
	ld	$t3,0($bp)		; bp[0]
	ld	$n0,0($n0)		; pull n0[0] value

	mulld	$t7,$a0,$t3		; ap[0]*bp[0]
	; transfer bp[0] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$a0,0($ap)		; pull ap[0,1] value
	mr	$n1,$n0
	lwz	$a1,4($ap)
	li	$c1,0
	lwz	$t1,0($bp)		; bp[0,1]
	lwz	$t3,4($bp)
	lwz	$n0,0($n1)		; pull n0[0,1] value
	lwz	$n1,4($n1)

	mullw	$t4,$a0,$t1		; mulld ap[0]*bp[0]
	mulhwu	$t5,$a0,$t1
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	; transfer bp[0] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	mr	$t0,$a0			; lwz	$t0,0($ap)
	mr	$t1,$a1			; lwz	$t1,4($ap)
	lwz	$t2,8($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
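# In the 32-bit path the 64-bit products above are emulated: mullw/mulhwu
# give the full low-word product, and the two extra mullw terms supply
# the truncated cross products, which is sufficient because both tp[0]
# and m are only needed modulo 2^64 here.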
$code.=<<___;
	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
\f
.align	5
L1st:
___
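# L1st completes tp[]=ap[]*bp[0]+m*np[] for the remaining words, two words
# per iteration. Each pass drains the previous iteration's FP results from
# the transfer zone and folds them into the integer carry chain while the
# current fmul/fmadd stream is still in flight, hiding the fcfid/fctid
# round-trip latency.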
$code.=<<___ if ($SIZE_T==8);
	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t1,4($ap)
	lwz	$t2,8($ap)
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	add	$t0,$t0,$carry		; can not overflow
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	insrdi	$t0,$t1,16,32
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	srdi	$carry,$t2,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t3,16,0		; 0..63 bits
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t4,$t4,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	add	$t6,$t6,$carry
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srdi	$carry,$t6,16
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	insrdi	$t4,$t6,16,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	insrwi	$t0,$t2,16,0		; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrwi	$t4,$t6,16,0		; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	stw	$t0,12($tp)		; tp[j-1]
	stw	$t4,8($tp)
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	insrwi	$t2,$t6,16,0		; 64..95 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	insrwi	$t0,$t4,16,0		; 96..127 bits
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,20($tp)		; tp[j]
	stwu	$t0,16($tp)
___
}
$code.=<<___;
	bdnz	L1st
\f
	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]

	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,8($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0		; 0..31 bits
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	stw	$t0,12($tp)		; tp[j-1]
	stw	$t4,8($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	stw	$t2,20($tp)		; tp[j]
	stwu	$t0,16($tp)

	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,12($tp)		; tp[num-1]
	stw	$t4,8($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	subf	$nap_d,$t7,$nap_d	; rewind pointer
\f
	li	$i,8			; i=1
.align	5
Louter:
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	li	$carry,0
	mtctr	$j
___
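# Louter repeats the same pipeline for every subsequent b[i], this time
# folding the previous tp[] in: m=(ap[0]*bp[i]+tp[0])*n0 mod 2^BN_BITS2.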
$code.=<<___ if ($SIZE_T==8);
	ldx	$t3,$bp,$i	; bp[i]

	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
	; transfer bp[i] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0	; tp[0]*n0
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
	add	$t0,$bp,$i
	li	$c1,0
	lwz	$t1,0($t0)		; bp[i,i+1]
	lwz	$t3,4($t0)

	mullw	$t4,$a0,$t1		; ap[0]*bp[i]
	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
	mulhwu	$t5,$a0,$t1
	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	addc	$t4,$t4,$t0		; ap[0]*bp[i]+tp[0]
	adde	$t5,$t5,$t2
	; transfer bp[i] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___;
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)

	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
\f
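	; Linner: tp[j] += ap[j]*bp[i] + m*np[j], same 16-bit limb pipeline
	; as L1st, except a[] and n[] are replayed from nap_d in double
	; format and the previous tp[] words are folded into the carry chain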
.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	add	$t0,$t0,$carry		; can not overflow
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t1,16,32
	ld	$t1,8($tp)		; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	add	$t7,$t7,$carry
	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	addze	$carry,$carry
	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]
___
} else {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	insrwi	$t0,$t2,16,0		; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t2,12($tp)		; tp[j]
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0

	fctid	$T0a,$T0a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	insrwi	$t4,$t6,16,0		; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)
	fctid	$T2a,$T2a
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b

	insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwz	$t6,20($tp)
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
___
}
$code.=<<___;
	bdnz	Linner
\f
	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)		; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry

	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]

	add	$carry,$carry,$ovf	; consume upmost overflow
	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0		; 0..31 bits
	lwz	$t2,12($tp)		; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16

	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0		; 64..95 bits
	lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)		; tp[num-1]
	stw	$t4,0($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___
\f
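# Final reduction: Lsub computes tp-np with borrow propagation, the
# borrow and the upmost overflow word turn $ovf into an all-ones or
# all-zeroes select mask, and Lcopy then picks tp or tp-np with and/andc
# in constant time, zapping tp and nap_d along the way so that no
# intermediate data is left behind on the stack.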
$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
	ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___
\f
$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1	; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname

.asciz	"Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;