#! /usr/bin/env perl
# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this Perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case Perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
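# For example, with a 32-bit flavour a macro line such as
#	$LD	r5,`0*$BNSZ`(r4)
# below comes out as "lwz r5,0(r4)", while a 64-bit flavour yields
# "ld r5,0(r4)" (an illustrative pairing added here, not part of the
# original comments).
#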
# As you might know, there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
# AIX performance
#
# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
#
# The following is the performance of 32-bit compiler
# generated code:
#
# OpenSSL 0.9.6c 21 dec 2001
# built on: Tue Jun 11 11:06:51 EDT 2002
# options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s  verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0    1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7     387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3     117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5      33.4
#dsa  512 bits   0.0087s   0.0106s    114.3      94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0      32.0
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6    2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3     674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7     201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9      55.7
#dsa  512 bits   0.0052s   0.0062s    191.6     162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0      55.5
#
# Number of operations increases by almost 75%
#
# Here are performance numbers for 64-bit compiler
# generated code:
#
# OpenSSL 0.9.6g [engine] 9 Aug 2002
# built on: Fri Apr 18 16:59:20 EDT 2003
# options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s  verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1    3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5    1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4     353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5      98.1
#dsa  512 bits   0.0026s   0.0032s    382.5     313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8     100.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4    6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1    2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5     622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7     171.0
#dsa  512 bits   0.0016s   0.0020s    610.7     507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5     173.2
#
# Again, performance increases by about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
# OpenSSL 0.9.7c 30 Sep 2003
#
# Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1   11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6    3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1     982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1     280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1     841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6     269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9      78.6
#
# Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2   16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4    5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1    1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8     449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2    1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0     442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5     132.8
#
# Performance increase of ~60%
# Based on submission from Suresh N. Chari of IBM

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
##
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines
#
#		bn_sqr_comba4
#		bn_sqr_comba8
#		bn_mul_comba4
#		bn_mul_comba8
#		bn_sub_words
#		bn_add_words
#		bn_div_words
#		bn_sqr_words
#		bn_mul_words
#		bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"
	.text

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
# Here's the assembly
#
#
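# For reference, a rough C-level sketch of the two comba macros named in
# the comments below (my paraphrase of the generic OpenSSL C code, not a
# verbatim copy), with (c3,c2,c1) acting as a three-word accumulator:
#
#	sqr_add_c(a,i,c1,c2,c3):    (c3,c2,c1) += a[i]*a[i];
#	sqr_add_c2(a,i,j,c1,c2,c3): (c3,c2,c1) += 2*a[i]*a[j];
#
# The doubling in sqr_add_c2 is why each cross product below is either
# doubled up front (addc/adde on (r7,r8)) or accumulated twice.
#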
	xor	r0,r0,r0		# set r0 = 0. Used in the addze
					# instructions below

	#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5		#in first iteration. No need
					#to add since c1=c2=c3=0.
					# Note c3(r11) is NOT set to 0
					# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9= r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
	addze	r11,r8			# r8 added to r11 which is 0
	addze	r9,r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;
	#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5] = c3
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba4,.-.bn_sqr_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10,r11 are the equivalents of c1,c2,c3.
#
# A possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor	r0,r0,r0		#set r0 = 0. Used in addze
					#instructions below.

	#sqr_add_c(a,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5		#1st iteration:	no carries.
	$UMULH	r10,r5,r5
	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
	#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10		#add the two-register number
	adde	r11,r8,r0		# (r8,r7) to the three-register
	addze	r9,r0			# number (r9,r11,r10). NOTE: r0=0

	addc	r10,r7,r10		#add the two-register number
	adde	r11,r8,r11		# (r8,r7) to the three-register
	addze	r9,r9			# number (r9,r11,r10).

	$ST	r10,`1*$BNSZ`(r3)	# r[1]=c2

	#sqr_add_c(a,1,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
	#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1;
	#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2;
	#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3;
	#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1;
	#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2;
	#sqr_add_c(a,4,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`8*$BNSZ`(r3)	#r[8]=c3;
	#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD	r5,`4*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`9*$BNSZ`(r3)	#r[9]=c1;
	#sqr_add_c(a,5,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`10*$BNSZ`(r3)	#r[10]=c2;
	#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD	r5,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
	#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

	#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
	#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9,`15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	.bn_sqr_comba8,.-.bn_sqr_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
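# For reference, the mul_add_c macro annotated below has (roughly) this
# C-level meaning, with (c3,c2,c1) as a three-word accumulator (a sketch
# of the generic OpenSSL C macro, not a verbatim copy):
#
#	mul_add_c(a,b,c1,c2,c3):  (c3,c2,c1) += a*b;
#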
	xor	r0,r0,r0		#r0=0. Used in addze below.
	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba4,.-.bn_mul_comba4

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
# r3 contains r
# r4 contains a
# r5 contains b
# r6, r7 are the 2 BN_ULONGs being multiplied.
# r8, r9 are the results of the 32x32 giving 64 bit multiply.
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0		#r0=0. Used in addze below.

	#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
	#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
	addze	r10,r0
	#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
	#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
	#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

	#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
	#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
	#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
	#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
	#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
	#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
	#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
	#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
	#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
	#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
	#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
	#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
	#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
	#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
	#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_mul_comba8,.-.bn_mul_comba8

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note: No loop unrolling done since this is not a performance-critical
# loop.

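# For reference, a minimal C sketch of what this routine computes
# (assuming the usual OpenSSL contract; a paraphrase, not taken
# verbatim from the C bignum code):
#
#	BN_ULONG borrow = 0;
#	for (int i = 0; i < n; i++) {
#		BN_ULONG t = a[i] - b[i] - borrow;
#		borrow = (a[i] < b[i]) || (borrow && a[i] == b[i]);
#		r[i] = t;
#	}
#	return borrow;	# 1 iff a < b as n-word numbers
#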
	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# if r6 > 0 then result !=0
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8 -1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_sub_words,.-.bn_sub_words

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
# r3 = r
# r4 = a
# r5 = b
# r6 = n
#
# Note: No loop unrolling done since this is not a performance-critical
# loop.

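# For reference, a minimal C sketch of the intended behaviour (my
# paraphrase of the usual OpenSSL contract, not verbatim C source):
#
#	BN_ULONG carry = 0;
#	for (int i = 0; i < n; i++) {
#		r[i] = a[i] + b[i] + carry;
#		carry = (r[i] < a[i]) || (carry && r[i] == a[i]);
#	}
#	return carry;
#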
	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0		#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_add_words,.-.bn_add_words

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
# This is a cleaned-up version of code generated by
# the AIX compiler. The only optimization is to use
# the PPC instruction to count leading zeros instead
# of a call to num_bits_word. Since this was compiled
# only at level -O2 we can probably squeeze it further.
#
# r3 = h
# r4 = l
# r5 = d

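# In C terms the contract is roughly (a sketch, assuming the usual
# OpenSSL semantics, with BN_BITS2 being the word width):
#
#	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
#		returns ((h << BN_BITS2) | l) / d
#
# The conditional trap below fires if h has significant bits at or
# above BN_num_bits_word(d), i.e. if the quotient would not fit in
# one word.
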
	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d ;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2-i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
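# The outer loop runs exactly twice, producing one BN_BITS4-bit quotient
# digit per pass (schoolbook division with half-word digits). Roughly,
# per pass (a C-level sketch added here, not verbatim source):
#
#	q = (high half of h == dh) ? BN_MASK2l : h / dh;  # estimate
#	while (the estimate is too large) q--;            # innerloop below
#	h = ((h - q*d) << BN_BITS4) | (next half-word of l);
#	result = (result << BN_BITS4) | q;                # staged via r0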
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7= (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# compare (tl <= r7)
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -=dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -=dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th+=t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11=l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h+=d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break;
	$SHLI	r0,r8,`$BITS/2`		#ret = q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_div_words,.-.bn_div_words

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
# Optimized version of bn_sqr_words
#
# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
# r3 = r
# r4 = a
# r5 = n
#
# r6 = a[i].
# r7,r8 = product.
#
# No unrolling done here. Not performance critical.

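# For reference, a C-level sketch of the loop below (a paraphrase of the
# usual OpenSSL contract, not verbatim source): each a[i] contributes a
# full double-word square,
#
#	for (int i = 0; i < n; i++) {
#		r[2*i]   = low word of  a[i]*a[i];
#		r[2*i+1] = high word of a[i]*a[i];
#	}
#
# which is what the $UMULL/$UMULH pair plus the two $STU stores do.
#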
	addic.	r5,r5,0			#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
	#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	.bn_sqr_words,.-.bn_sqr_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
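#
# For reference, a C-level sketch of the contract (my paraphrase, with
# hi:lo denoting a double-word product; not verbatim source):
#
#	BN_ULONG carry = 0;
#	for (int i = 0; i < num; i++) {
#		hi:lo = ap[i] * w + carry;
#		rp[i] = lo; carry = hi;
#	}
#	return carry;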
	xor	r0,r0,r0
	xor	r12,r12,r12		# used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10			#carry is NOT ignored.
					#will be taken care of
					#in second spin below
					#using adde.
	$ST	r9,`0*$BNSZ`(r3)
	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
	#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12			#this spin we collect carry into
					#r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
	#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER


	#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_words,.-.bn_mul_words

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
# r3 = rp
# r4 = ap
# r5 = num
# r6 = w
#
# empirical evidence suggests that the unrolled version performs best!
#
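# For reference, a C-level sketch of the contract (my paraphrase; hi:lo
# denotes a double-word product, not verbatim source):
#
#	BN_ULONG carry = 0;
#	for (int i = 0; i < num; i++) {
#		hi:lo = ap[i] * w + rp[i] + carry;
#		rp[i] = lo; carry = hi;
#	}
#	return carry;
#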
	xor	r0,r0,r0		#r0 = 0
	xor	r12,r12,r12		#r12 = 0. used for carry
	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go to Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
	#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12		#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
					#the above instruction addze
					#is NOT needed. Carry will NOT
					#be ignored. It's not affected
					#by multiply and will be collected
					#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

	#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10		#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

	#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

	#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
	#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
	#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	.bn_mul_add_words,.-.bn_mul_add_words
	.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT or die "error closing STDOUT: $!";