#! /usr/bin/env perl
# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#	MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e.
#
#	The following is the performance of 32-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6c 21 dec 2001
#	built on: Tue Jun 11 11:06:51 EDT 2002
#	options:bn(64,32) ...
#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#	Number of operations increases by almost 75%
#
#	Here are performance numbers for 64-bit compiler
#	generated code:
#
#	OpenSSL 0.9.6g [engine] 9 Aug 2002
#	built on: Fri Apr 18 16:59:20 EDT 2003
#	options:bn(64,64) ...
#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#	Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#	Again, performance increases by about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#        Performance increase of ~60%
#        Based on submission from Suresh N. Chari of IBM

# Command-line convention shared by the perlasm generators:
# the LAST argument is the output file iff it carries a file extension,
# and the FIRST argument is the flavour iff it does not look like a path.
# Either may legitimately be absent, in which case it stays undef.
$output  = @ARGV && $ARGV[-1] =~ m|\.\w+$| ? pop   : undef;
$flavour = @ARGV && $ARGV[0]  !~ m|\.|     ? shift : undef;

# Derive the word size from the flavour name and select the matching
# instruction mnemonics: 32-bit flavours use the "word" forms (lwz, stw,
# mullw, ...), 64-bit flavours the "doubleword" forms (ld, std, mulld, ...).
# Any flavour naming neither width is rejected outright.
my $bits = $flavour =~ /32/ ? 32
         : $flavour =~ /64/ ? 64
         : die "nonsense $flavour";

$BITS = $bits;
$BNSZ = $BITS/8;                                # bytes per BN_ULONG
$ISA  = $bits == 32 ? "\"ppc\"" : "\"ppc64\"";

# Mnemonic table, in fixed positional order:
#   load, load-with-update, store, store-with-update,
#   unsigned multiply low, unsigned multiply high, unsigned divide,
#   unsigned compare immediate, unsigned compare, count leading zeros,
#   shift left, unsigned shift right, unsigned shift right immediate,
#   shift left immediate, clear upper bits, insert right,
#   rotate left immediate, conditional trap
($LD,   $LDU,  $ST,   $STU,
 $UMULL,$UMULH,$UDIV,
 $UCMPI,$UCMP, $CNTLZ,
 $SHL,  $SHR,  $SHRI, $SHLI,
 $CLRU, $INSR, $ROTL, $TR) =
    $bits == 32
    ? qw(lwz lwzu stw stwu mullw mulhwu divwu cmplwi cmplw cntlzw
         slw srw srwi slwi clrlwi insrwi rotlwi tw)
    : qw(ld  ldu  std  stdu mulld mulhdu divdu cmpldi cmpld cntlzd
         sld srd srdi sldi clrldi insrdi rotldi td);

# Locate the ppc-xlate.pl translator: first next to this script, then in
# the in-tree ../../perlasm directory.  $xlate keeps the last candidate
# tried, so the existence re-check below dies if neither was found.
($dir) = $0 =~ m{(.*[/\\])[^/\\]+$};
for my $candidate ("${dir}ppc-xlate.pl", "${dir}../../perlasm/ppc-xlate.pl") {
	$xlate = $candidate;
	last if -f $xlate;
}
-f $xlate or die "can't locate ppc-xlate.pl";

# Pipe our generated perlasm through ppc-xlate.pl, which emits real
# assembler for the requested flavour into $output.
# NOTE(review): this is a deliberate 2-arg piped open, so $xlate,
# $flavour and $output pass through the shell — standard for the
# perlasm drivers, but the arguments come from the build system only.
my $pipeline = qq{| $^X $xlate $flavour "$output"};
open STDOUT, $pipeline
	or die "can't call $xlate: $!";

31439046 | 168 | $data=<<EOF; |
dd558806 AP |
169 | #-------------------------------------------------------------------- |
170 | # | |
171 | # | |
172 | # | |
173 | # | |
174 | # File: ppc32.s | |
175 | # | |
176 | # Created by: Suresh Chari | |
177 | # IBM Thomas J. Watson Research Library | |
178 | # Hawthorne, NY | |
179 | # | |
180 | # | |
181 | # Description: Optimized assembly routines for OpenSSL crypto | |
182 | # on the 32 bitPowerPC platform. | |
183 | # | |
184 | # | |
185 | # Version History | |
186 | # | |
187 | # 2. Fixed bn_add,bn_sub and bn_div_words, added comments, | |
188 | # cleaned up code. Also made a single version which can | |
189 | # be used for both the AIX and Linux compilers. See NOTE | |
190 | # below. | |
191 | # 12/05/03 Suresh Chari | |
192 | # (with lots of help from) Andy Polyakov | |
609b0852 | 193 | ## |
dd558806 AP |
194 | # 1. Initial version 10/20/02 Suresh Chari |
195 | # | |
196 | # | |
197 | # The following file works for the xlc,cc | |
198 | # and gcc compilers. | |
199 | # | |
200 | # NOTE: To get the file to link correctly with the gcc compiler | |
201 | # you have to change the names of the routines and remove | |
202 | # the first .(dot) character. This should automatically | |
203 | # be done in the build process. | |
204 | # | |
205 | # Hand optimized assembly code for the following routines | |
609b0852 | 206 | # |
dd558806 AP |
207 | # bn_sqr_comba4 |
208 | # bn_sqr_comba8 | |
209 | # bn_mul_comba4 | |
210 | # bn_mul_comba8 | |
211 | # bn_sub_words | |
212 | # bn_add_words | |
213 | # bn_div_words | |
214 | # bn_sqr_words | |
215 | # bn_mul_words | |
216 | # bn_mul_add_words | |
217 | # | |
218 | # NOTE: It is possible to optimize this code more for | |
219 | # specific PowerPC or Power architectures. On the Northstar | |
220 | # architecture the optimizations in this file do | |
221 | # NOT provide much improvement. | |
222 | # | |
223 | # If you have comments or suggestions to improve code send | |
224 | # me a note at schari\@us.ibm.com | |
225 | # | |
226 | #-------------------------------------------------------------------------- | |
227 | # | |
228 | # Defines to be used in the assembly code. | |
609b0852 | 229 | # |
31439046 AP |
230 | #.set r0,0 # we use it as storage for value of 0 |
231 | #.set SP,1 # preserved | |
609b0852 | 232 | #.set RTOC,2 # preserved |
31439046 AP |
233 | #.set r3,3 # 1st argument/return value |
234 | #.set r4,4 # 2nd argument/volatile register | |
235 | #.set r5,5 # 3rd argument/volatile register | |
236 | #.set r6,6 # ... | |
237 | #.set r7,7 | |
238 | #.set r8,8 | |
239 | #.set r9,9 | |
240 | #.set r10,10 | |
241 | #.set r11,11 | |
242 | #.set r12,12 | |
243 | #.set r13,13 # not used, nor any other "below" it... | |
dd558806 AP |
244 | |
245 | # Declare function names to be global | |
246 | # NOTE: For gcc these names MUST be changed to remove | |
247 | # the first . i.e. for example change ".bn_sqr_comba4" | |
248 | # to "bn_sqr_comba4". This should be automatically done | |
249 | # in the build. | |
609b0852 | 250 | |
dd558806 AP |
251 | .globl .bn_sqr_comba4 |
252 | .globl .bn_sqr_comba8 | |
253 | .globl .bn_mul_comba4 | |
254 | .globl .bn_mul_comba8 | |
255 | .globl .bn_sub_words | |
256 | .globl .bn_add_words | |
257 | .globl .bn_div_words | |
258 | .globl .bn_sqr_words | |
259 | .globl .bn_mul_words | |
260 | .globl .bn_mul_add_words | |
609b0852 | 261 | |
dd558806 | 262 | # .text section |
609b0852 | 263 | |
492279f6 | 264 | .machine "any" |
2864df8f | 265 | .text |
dd558806 AP |
266 | |
267 | # | |
268 | # NOTE: The following label name should be changed to | |
269 | # "bn_sqr_comba4" i.e. remove the first dot | |
270 | # for the gcc compiler. This should be automatically | |
271 | # done in the build | |
272 | # | |
273 | ||
274 | .align 4 | |
275 | .bn_sqr_comba4: | |
276 | # | |
277 | # Optimized version of bn_sqr_comba4. | |
278 | # | |
279 | # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | |
280 | # r3 contains r | |
281 | # r4 contains a | |
282 | # | |
609b0852 DB |
283 | # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: |
284 | # | |
dd558806 AP |
285 | # r5,r6 are the two BN_ULONGs being multiplied. |
286 | # r7,r8 are the results of the 32x32 giving 64 bit multiply. | |
287 | # r9,r10, r11 are the equivalents of c1,c2, c3. | |
288 | # Here's the assembly | |
289 | # | |
290 | # | |
291 | xor r0,r0,r0 # set r0 = 0. Used in the addze | |
292 | # instructions below | |
609b0852 | 293 | |
dd558806 | 294 | #sqr_add_c(a,0,c1,c2,c3) |
609b0852 DB |
295 | $LD r5,`0*$BNSZ`(r4) |
296 | $UMULL r9,r5,r5 | |
dd558806 AP |
297 | $UMULH r10,r5,r5 #in first iteration. No need |
298 | #to add since c1=c2=c3=0. | |
299 | # Note c3(r11) is NOT set to 0 | |
300 | # but will be. | |
301 | ||
302 | $ST r9,`0*$BNSZ`(r3) # r[0]=c1; | |
303 | # sqr_add_c2(a,1,0,c2,c3,c1); | |
609b0852 | 304 | $LD r6,`1*$BNSZ`(r4) |
dd558806 AP |
305 | $UMULL r7,r5,r6 |
306 | $UMULH r8,r5,r6 | |
609b0852 | 307 | |
dd558806 AP |
308 | addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) |
309 | adde r8,r8,r8 | |
310 | addze r9,r0 # catch carry if any. | |
609b0852 DB |
311 | # r9= r0(=0) and carry |
312 | ||
dd558806 | 313 | addc r10,r7,r10 # now add to temp result. |
609b0852 | 314 | addze r11,r8 # r8 added to r11 which is 0 |
dd558806 | 315 | addze r9,r9 |
609b0852 DB |
316 | |
317 | $ST r10,`1*$BNSZ`(r3) #r[1]=c2; | |
dd558806 AP |
318 | #sqr_add_c(a,1,c3,c1,c2) |
319 | $UMULL r7,r6,r6 | |
320 | $UMULH r8,r6,r6 | |
321 | addc r11,r7,r11 | |
322 | adde r9,r8,r9 | |
323 | addze r10,r0 | |
324 | #sqr_add_c2(a,2,0,c3,c1,c2) | |
325 | $LD r6,`2*$BNSZ`(r4) | |
326 | $UMULL r7,r5,r6 | |
327 | $UMULH r8,r5,r6 | |
609b0852 | 328 | |
dd558806 AP |
329 | addc r7,r7,r7 |
330 | adde r8,r8,r8 | |
331 | addze r10,r10 | |
609b0852 | 332 | |
dd558806 AP |
333 | addc r11,r7,r11 |
334 | adde r9,r8,r9 | |
335 | addze r10,r10 | |
609b0852 | 336 | $ST r11,`2*$BNSZ`(r3) #r[2]=c3 |
dd558806 | 337 | #sqr_add_c2(a,3,0,c1,c2,c3); |
609b0852 | 338 | $LD r6,`3*$BNSZ`(r4) |
dd558806 AP |
339 | $UMULL r7,r5,r6 |
340 | $UMULH r8,r5,r6 | |
341 | addc r7,r7,r7 | |
342 | adde r8,r8,r8 | |
343 | addze r11,r0 | |
609b0852 | 344 | |
dd558806 AP |
345 | addc r9,r7,r9 |
346 | adde r10,r8,r10 | |
347 | addze r11,r11 | |
348 | #sqr_add_c2(a,2,1,c1,c2,c3); | |
349 | $LD r5,`1*$BNSZ`(r4) | |
350 | $LD r6,`2*$BNSZ`(r4) | |
351 | $UMULL r7,r5,r6 | |
352 | $UMULH r8,r5,r6 | |
609b0852 | 353 | |
dd558806 AP |
354 | addc r7,r7,r7 |
355 | adde r8,r8,r8 | |
356 | addze r11,r11 | |
357 | addc r9,r7,r9 | |
358 | adde r10,r8,r10 | |
359 | addze r11,r11 | |
360 | $ST r9,`3*$BNSZ`(r3) #r[3]=c1 | |
361 | #sqr_add_c(a,2,c2,c3,c1); | |
362 | $UMULL r7,r6,r6 | |
363 | $UMULH r8,r6,r6 | |
364 | addc r10,r7,r10 | |
365 | adde r11,r8,r11 | |
366 | addze r9,r0 | |
367 | #sqr_add_c2(a,3,1,c2,c3,c1); | |
609b0852 | 368 | $LD r6,`3*$BNSZ`(r4) |
dd558806 AP |
369 | $UMULL r7,r5,r6 |
370 | $UMULH r8,r5,r6 | |
371 | addc r7,r7,r7 | |
372 | adde r8,r8,r8 | |
373 | addze r9,r9 | |
609b0852 | 374 | |
dd558806 AP |
375 | addc r10,r7,r10 |
376 | adde r11,r8,r11 | |
377 | addze r9,r9 | |
378 | $ST r10,`4*$BNSZ`(r3) #r[4]=c2 | |
379 | #sqr_add_c2(a,3,2,c3,c1,c2); | |
609b0852 | 380 | $LD r5,`2*$BNSZ`(r4) |
dd558806 AP |
381 | $UMULL r7,r5,r6 |
382 | $UMULH r8,r5,r6 | |
383 | addc r7,r7,r7 | |
384 | adde r8,r8,r8 | |
385 | addze r10,r0 | |
609b0852 | 386 | |
dd558806 AP |
387 | addc r11,r7,r11 |
388 | adde r9,r8,r9 | |
389 | addze r10,r10 | |
390 | $ST r11,`5*$BNSZ`(r3) #r[5] = c3 | |
391 | #sqr_add_c(a,3,c1,c2,c3); | |
609b0852 | 392 | $UMULL r7,r6,r6 |
dd558806 AP |
393 | $UMULH r8,r6,r6 |
394 | addc r9,r7,r9 | |
395 | adde r10,r8,r10 | |
396 | ||
397 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | |
398 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | |
31439046 | 399 | blr |
67150340 AP |
400 | .long 0 |
401 | .byte 0,12,0x14,0,0,0,2,0 | |
402 | .long 0 | |
d6019e16 | 403 | .size .bn_sqr_comba4,.-.bn_sqr_comba4 |
dd558806 AP |
404 | |
405 | # | |
406 | # NOTE: The following label name should be changed to | |
407 | # "bn_sqr_comba8" i.e. remove the first dot | |
408 | # for the gcc compiler. This should be automatically | |
409 | # done in the build | |
410 | # | |
609b0852 | 411 | |
dd558806 AP |
412 | .align 4 |
413 | .bn_sqr_comba8: | |
414 | # | |
415 | # This is an optimized version of the bn_sqr_comba8 routine. | |
416 | # Tightly uses the adde instruction | |
417 | # | |
418 | # | |
419 | # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | |
420 | # r3 contains r | |
421 | # r4 contains a | |
422 | # | |
609b0852 DB |
423 | # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: |
424 | # | |
dd558806 AP |
425 | # r5,r6 are the two BN_ULONGs being multiplied. |
426 | # r7,r8 are the results of the 32x32 giving 64 bit multiply. | |
427 | # r9,r10, r11 are the equivalents of c1,c2, c3. | |
428 | # | |
429 | # Possible optimization of loading all 8 longs of a into registers | |
60250017 | 430 | # doesn't provide any speedup |
609b0852 | 431 | # |
dd558806 AP |
432 | |
433 | xor r0,r0,r0 #set r0 = 0.Used in addze | |
434 | #instructions below. | |
435 | ||
436 | #sqr_add_c(a,0,c1,c2,c3); | |
437 | $LD r5,`0*$BNSZ`(r4) | |
438 | $UMULL r9,r5,r5 #1st iteration: no carries. | |
439 | $UMULH r10,r5,r5 | |
440 | $ST r9,`0*$BNSZ`(r3) # r[0]=c1; | |
441 | #sqr_add_c2(a,1,0,c2,c3,c1); | |
442 | $LD r6,`1*$BNSZ`(r4) | |
443 | $UMULL r7,r5,r6 | |
609b0852 DB |
444 | $UMULH r8,r5,r6 |
445 | ||
dd558806 AP |
446 | addc r10,r7,r10 #add the two register number |
447 | adde r11,r8,r0 # (r8,r7) to the three register | |
448 | addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 | |
609b0852 | 449 | |
dd558806 AP |
450 | addc r10,r7,r10 #add the two register number |
451 | adde r11,r8,r11 # (r8,r7) to the three register | |
452 | addze r9,r9 # number (r9,r11,r10). | |
609b0852 | 453 | |
dd558806 | 454 | $ST r10,`1*$BNSZ`(r3) # r[1]=c2 |
609b0852 | 455 | |
dd558806 AP |
456 | #sqr_add_c(a,1,c3,c1,c2); |
457 | $UMULL r7,r6,r6 | |
458 | $UMULH r8,r6,r6 | |
459 | addc r11,r7,r11 | |
460 | adde r9,r8,r9 | |
461 | addze r10,r0 | |
462 | #sqr_add_c2(a,2,0,c3,c1,c2); | |
463 | $LD r6,`2*$BNSZ`(r4) | |
464 | $UMULL r7,r5,r6 | |
465 | $UMULH r8,r5,r6 | |
609b0852 | 466 | |
dd558806 AP |
467 | addc r11,r7,r11 |
468 | adde r9,r8,r9 | |
469 | addze r10,r10 | |
609b0852 | 470 | |
dd558806 AP |
471 | addc r11,r7,r11 |
472 | adde r9,r8,r9 | |
473 | addze r10,r10 | |
609b0852 | 474 | |
dd558806 AP |
475 | $ST r11,`2*$BNSZ`(r3) #r[2]=c3 |
476 | #sqr_add_c2(a,3,0,c1,c2,c3); | |
477 | $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. | |
478 | $UMULL r7,r5,r6 | |
479 | $UMULH r8,r5,r6 | |
609b0852 | 480 | |
dd558806 AP |
481 | addc r9,r7,r9 |
482 | adde r10,r8,r10 | |
483 | addze r11,r0 | |
609b0852 | 484 | |
dd558806 AP |
485 | addc r9,r7,r9 |
486 | adde r10,r8,r10 | |
487 | addze r11,r11 | |
488 | #sqr_add_c2(a,2,1,c1,c2,c3); | |
489 | $LD r5,`1*$BNSZ`(r4) | |
490 | $LD r6,`2*$BNSZ`(r4) | |
491 | $UMULL r7,r5,r6 | |
492 | $UMULH r8,r5,r6 | |
609b0852 | 493 | |
dd558806 AP |
494 | addc r9,r7,r9 |
495 | adde r10,r8,r10 | |
496 | addze r11,r11 | |
609b0852 | 497 | |
dd558806 AP |
498 | addc r9,r7,r9 |
499 | adde r10,r8,r10 | |
500 | addze r11,r11 | |
609b0852 | 501 | |
dd558806 AP |
502 | $ST r9,`3*$BNSZ`(r3) #r[3]=c1; |
503 | #sqr_add_c(a,2,c2,c3,c1); | |
504 | $UMULL r7,r6,r6 | |
505 | $UMULH r8,r6,r6 | |
609b0852 | 506 | |
dd558806 AP |
507 | addc r10,r7,r10 |
508 | adde r11,r8,r11 | |
509 | addze r9,r0 | |
510 | #sqr_add_c2(a,3,1,c2,c3,c1); | |
511 | $LD r6,`3*$BNSZ`(r4) | |
512 | $UMULL r7,r5,r6 | |
513 | $UMULH r8,r5,r6 | |
609b0852 | 514 | |
dd558806 AP |
515 | addc r10,r7,r10 |
516 | adde r11,r8,r11 | |
517 | addze r9,r9 | |
609b0852 | 518 | |
dd558806 AP |
519 | addc r10,r7,r10 |
520 | adde r11,r8,r11 | |
521 | addze r9,r9 | |
522 | #sqr_add_c2(a,4,0,c2,c3,c1); | |
523 | $LD r5,`0*$BNSZ`(r4) | |
524 | $LD r6,`4*$BNSZ`(r4) | |
525 | $UMULL r7,r5,r6 | |
526 | $UMULH r8,r5,r6 | |
609b0852 | 527 | |
dd558806 AP |
528 | addc r10,r7,r10 |
529 | adde r11,r8,r11 | |
530 | addze r9,r9 | |
609b0852 | 531 | |
dd558806 AP |
532 | addc r10,r7,r10 |
533 | adde r11,r8,r11 | |
534 | addze r9,r9 | |
535 | $ST r10,`4*$BNSZ`(r3) #r[4]=c2; | |
536 | #sqr_add_c2(a,5,0,c3,c1,c2); | |
537 | $LD r6,`5*$BNSZ`(r4) | |
538 | $UMULL r7,r5,r6 | |
539 | $UMULH r8,r5,r6 | |
609b0852 | 540 | |
dd558806 AP |
541 | addc r11,r7,r11 |
542 | adde r9,r8,r9 | |
543 | addze r10,r0 | |
609b0852 | 544 | |
dd558806 AP |
545 | addc r11,r7,r11 |
546 | adde r9,r8,r9 | |
547 | addze r10,r10 | |
548 | #sqr_add_c2(a,4,1,c3,c1,c2); | |
549 | $LD r5,`1*$BNSZ`(r4) | |
550 | $LD r6,`4*$BNSZ`(r4) | |
551 | $UMULL r7,r5,r6 | |
552 | $UMULH r8,r5,r6 | |
609b0852 | 553 | |
dd558806 AP |
554 | addc r11,r7,r11 |
555 | adde r9,r8,r9 | |
556 | addze r10,r10 | |
609b0852 | 557 | |
dd558806 AP |
558 | addc r11,r7,r11 |
559 | adde r9,r8,r9 | |
560 | addze r10,r10 | |
561 | #sqr_add_c2(a,3,2,c3,c1,c2); | |
562 | $LD r5,`2*$BNSZ`(r4) | |
563 | $LD r6,`3*$BNSZ`(r4) | |
564 | $UMULL r7,r5,r6 | |
565 | $UMULH r8,r5,r6 | |
609b0852 | 566 | |
dd558806 AP |
567 | addc r11,r7,r11 |
568 | adde r9,r8,r9 | |
569 | addze r10,r10 | |
609b0852 | 570 | |
dd558806 AP |
571 | addc r11,r7,r11 |
572 | adde r9,r8,r9 | |
573 | addze r10,r10 | |
574 | $ST r11,`5*$BNSZ`(r3) #r[5]=c3; | |
575 | #sqr_add_c(a,3,c1,c2,c3); | |
576 | $UMULL r7,r6,r6 | |
577 | $UMULH r8,r6,r6 | |
578 | addc r9,r7,r9 | |
579 | adde r10,r8,r10 | |
580 | addze r11,r0 | |
581 | #sqr_add_c2(a,4,2,c1,c2,c3); | |
582 | $LD r6,`4*$BNSZ`(r4) | |
583 | $UMULL r7,r5,r6 | |
584 | $UMULH r8,r5,r6 | |
609b0852 | 585 | |
dd558806 AP |
586 | addc r9,r7,r9 |
587 | adde r10,r8,r10 | |
588 | addze r11,r11 | |
609b0852 | 589 | |
dd558806 AP |
590 | addc r9,r7,r9 |
591 | adde r10,r8,r10 | |
592 | addze r11,r11 | |
593 | #sqr_add_c2(a,5,1,c1,c2,c3); | |
594 | $LD r5,`1*$BNSZ`(r4) | |
595 | $LD r6,`5*$BNSZ`(r4) | |
596 | $UMULL r7,r5,r6 | |
597 | $UMULH r8,r5,r6 | |
609b0852 | 598 | |
dd558806 AP |
599 | addc r9,r7,r9 |
600 | adde r10,r8,r10 | |
601 | addze r11,r11 | |
609b0852 | 602 | |
dd558806 AP |
603 | addc r9,r7,r9 |
604 | adde r10,r8,r10 | |
605 | addze r11,r11 | |
606 | #sqr_add_c2(a,6,0,c1,c2,c3); | |
607 | $LD r5,`0*$BNSZ`(r4) | |
608 | $LD r6,`6*$BNSZ`(r4) | |
609 | $UMULL r7,r5,r6 | |
610 | $UMULH r8,r5,r6 | |
611 | addc r9,r7,r9 | |
612 | adde r10,r8,r10 | |
613 | addze r11,r11 | |
614 | addc r9,r7,r9 | |
615 | adde r10,r8,r10 | |
616 | addze r11,r11 | |
617 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1; | |
618 | #sqr_add_c2(a,7,0,c2,c3,c1); | |
619 | $LD r6,`7*$BNSZ`(r4) | |
620 | $UMULL r7,r5,r6 | |
621 | $UMULH r8,r5,r6 | |
609b0852 | 622 | |
dd558806 AP |
623 | addc r10,r7,r10 |
624 | adde r11,r8,r11 | |
625 | addze r9,r0 | |
626 | addc r10,r7,r10 | |
627 | adde r11,r8,r11 | |
628 | addze r9,r9 | |
629 | #sqr_add_c2(a,6,1,c2,c3,c1); | |
630 | $LD r5,`1*$BNSZ`(r4) | |
631 | $LD r6,`6*$BNSZ`(r4) | |
632 | $UMULL r7,r5,r6 | |
633 | $UMULH r8,r5,r6 | |
609b0852 | 634 | |
dd558806 AP |
635 | addc r10,r7,r10 |
636 | adde r11,r8,r11 | |
637 | addze r9,r9 | |
638 | addc r10,r7,r10 | |
639 | adde r11,r8,r11 | |
640 | addze r9,r9 | |
641 | #sqr_add_c2(a,5,2,c2,c3,c1); | |
642 | $LD r5,`2*$BNSZ`(r4) | |
643 | $LD r6,`5*$BNSZ`(r4) | |
644 | $UMULL r7,r5,r6 | |
645 | $UMULH r8,r5,r6 | |
646 | addc r10,r7,r10 | |
647 | adde r11,r8,r11 | |
648 | addze r9,r9 | |
649 | addc r10,r7,r10 | |
650 | adde r11,r8,r11 | |
651 | addze r9,r9 | |
652 | #sqr_add_c2(a,4,3,c2,c3,c1); | |
653 | $LD r5,`3*$BNSZ`(r4) | |
654 | $LD r6,`4*$BNSZ`(r4) | |
655 | $UMULL r7,r5,r6 | |
656 | $UMULH r8,r5,r6 | |
609b0852 | 657 | |
dd558806 AP |
658 | addc r10,r7,r10 |
659 | adde r11,r8,r11 | |
660 | addze r9,r9 | |
661 | addc r10,r7,r10 | |
662 | adde r11,r8,r11 | |
663 | addze r9,r9 | |
664 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2; | |
665 | #sqr_add_c(a,4,c3,c1,c2); | |
666 | $UMULL r7,r6,r6 | |
667 | $UMULH r8,r6,r6 | |
668 | addc r11,r7,r11 | |
669 | adde r9,r8,r9 | |
670 | addze r10,r0 | |
671 | #sqr_add_c2(a,5,3,c3,c1,c2); | |
672 | $LD r6,`5*$BNSZ`(r4) | |
673 | $UMULL r7,r5,r6 | |
674 | $UMULH r8,r5,r6 | |
675 | addc r11,r7,r11 | |
676 | adde r9,r8,r9 | |
677 | addze r10,r10 | |
678 | addc r11,r7,r11 | |
679 | adde r9,r8,r9 | |
680 | addze r10,r10 | |
681 | #sqr_add_c2(a,6,2,c3,c1,c2); | |
682 | $LD r5,`2*$BNSZ`(r4) | |
683 | $LD r6,`6*$BNSZ`(r4) | |
684 | $UMULL r7,r5,r6 | |
685 | $UMULH r8,r5,r6 | |
686 | addc r11,r7,r11 | |
687 | adde r9,r8,r9 | |
688 | addze r10,r10 | |
609b0852 | 689 | |
dd558806 AP |
690 | addc r11,r7,r11 |
691 | adde r9,r8,r9 | |
692 | addze r10,r10 | |
693 | #sqr_add_c2(a,7,1,c3,c1,c2); | |
694 | $LD r5,`1*$BNSZ`(r4) | |
695 | $LD r6,`7*$BNSZ`(r4) | |
696 | $UMULL r7,r5,r6 | |
697 | $UMULH r8,r5,r6 | |
698 | addc r11,r7,r11 | |
699 | adde r9,r8,r9 | |
700 | addze r10,r10 | |
701 | addc r11,r7,r11 | |
702 | adde r9,r8,r9 | |
703 | addze r10,r10 | |
704 | $ST r11,`8*$BNSZ`(r3) #r[8]=c3; | |
705 | #sqr_add_c2(a,7,2,c1,c2,c3); | |
706 | $LD r5,`2*$BNSZ`(r4) | |
707 | $UMULL r7,r5,r6 | |
708 | $UMULH r8,r5,r6 | |
609b0852 | 709 | |
dd558806 AP |
710 | addc r9,r7,r9 |
711 | adde r10,r8,r10 | |
712 | addze r11,r0 | |
713 | addc r9,r7,r9 | |
714 | adde r10,r8,r10 | |
715 | addze r11,r11 | |
716 | #sqr_add_c2(a,6,3,c1,c2,c3); | |
717 | $LD r5,`3*$BNSZ`(r4) | |
718 | $LD r6,`6*$BNSZ`(r4) | |
719 | $UMULL r7,r5,r6 | |
720 | $UMULH r8,r5,r6 | |
721 | addc r9,r7,r9 | |
722 | adde r10,r8,r10 | |
723 | addze r11,r11 | |
724 | addc r9,r7,r9 | |
725 | adde r10,r8,r10 | |
726 | addze r11,r11 | |
727 | #sqr_add_c2(a,5,4,c1,c2,c3); | |
728 | $LD r5,`4*$BNSZ`(r4) | |
729 | $LD r6,`5*$BNSZ`(r4) | |
730 | $UMULL r7,r5,r6 | |
731 | $UMULH r8,r5,r6 | |
732 | addc r9,r7,r9 | |
733 | adde r10,r8,r10 | |
734 | addze r11,r11 | |
735 | addc r9,r7,r9 | |
736 | adde r10,r8,r10 | |
737 | addze r11,r11 | |
738 | $ST r9,`9*$BNSZ`(r3) #r[9]=c1; | |
739 | #sqr_add_c(a,5,c2,c3,c1); | |
740 | $UMULL r7,r6,r6 | |
741 | $UMULH r8,r6,r6 | |
742 | addc r10,r7,r10 | |
743 | adde r11,r8,r11 | |
744 | addze r9,r0 | |
745 | #sqr_add_c2(a,6,4,c2,c3,c1); | |
746 | $LD r6,`6*$BNSZ`(r4) | |
747 | $UMULL r7,r5,r6 | |
748 | $UMULH r8,r5,r6 | |
749 | addc r10,r7,r10 | |
750 | adde r11,r8,r11 | |
751 | addze r9,r9 | |
752 | addc r10,r7,r10 | |
753 | adde r11,r8,r11 | |
754 | addze r9,r9 | |
755 | #sqr_add_c2(a,7,3,c2,c3,c1); | |
756 | $LD r5,`3*$BNSZ`(r4) | |
757 | $LD r6,`7*$BNSZ`(r4) | |
758 | $UMULL r7,r5,r6 | |
759 | $UMULH r8,r5,r6 | |
760 | addc r10,r7,r10 | |
761 | adde r11,r8,r11 | |
762 | addze r9,r9 | |
763 | addc r10,r7,r10 | |
764 | adde r11,r8,r11 | |
765 | addze r9,r9 | |
766 | $ST r10,`10*$BNSZ`(r3) #r[10]=c2; | |
767 | #sqr_add_c2(a,7,4,c3,c1,c2); | |
768 | $LD r5,`4*$BNSZ`(r4) | |
769 | $UMULL r7,r5,r6 | |
770 | $UMULH r8,r5,r6 | |
771 | addc r11,r7,r11 | |
772 | adde r9,r8,r9 | |
773 | addze r10,r0 | |
774 | addc r11,r7,r11 | |
775 | adde r9,r8,r9 | |
776 | addze r10,r10 | |
777 | #sqr_add_c2(a,6,5,c3,c1,c2); | |
778 | $LD r5,`5*$BNSZ`(r4) | |
779 | $LD r6,`6*$BNSZ`(r4) | |
780 | $UMULL r7,r5,r6 | |
781 | $UMULH r8,r5,r6 | |
782 | addc r11,r7,r11 | |
783 | adde r9,r8,r9 | |
784 | addze r10,r10 | |
785 | addc r11,r7,r11 | |
786 | adde r9,r8,r9 | |
787 | addze r10,r10 | |
788 | $ST r11,`11*$BNSZ`(r3) #r[11]=c3; | |
789 | #sqr_add_c(a,6,c1,c2,c3); | |
790 | $UMULL r7,r6,r6 | |
791 | $UMULH r8,r6,r6 | |
792 | addc r9,r7,r9 | |
793 | adde r10,r8,r10 | |
794 | addze r11,r0 | |
795 | #sqr_add_c2(a,7,5,c1,c2,c3) | |
796 | $LD r6,`7*$BNSZ`(r4) | |
797 | $UMULL r7,r5,r6 | |
798 | $UMULH r8,r5,r6 | |
799 | addc r9,r7,r9 | |
800 | adde r10,r8,r10 | |
801 | addze r11,r11 | |
802 | addc r9,r7,r9 | |
803 | adde r10,r8,r10 | |
804 | addze r11,r11 | |
805 | $ST r9,`12*$BNSZ`(r3) #r[12]=c1; | |
609b0852 | 806 | |
dd558806 AP |
807 | #sqr_add_c2(a,7,6,c2,c3,c1) |
808 | $LD r5,`6*$BNSZ`(r4) | |
809 | $UMULL r7,r5,r6 | |
810 | $UMULH r8,r5,r6 | |
811 | addc r10,r7,r10 | |
812 | adde r11,r8,r11 | |
813 | addze r9,r0 | |
814 | addc r10,r7,r10 | |
815 | adde r11,r8,r11 | |
816 | addze r9,r9 | |
817 | $ST r10,`13*$BNSZ`(r3) #r[13]=c2; | |
818 | #sqr_add_c(a,7,c3,c1,c2); | |
819 | $UMULL r7,r6,r6 | |
820 | $UMULH r8,r6,r6 | |
821 | addc r11,r7,r11 | |
822 | adde r9,r8,r9 | |
823 | $ST r11,`14*$BNSZ`(r3) #r[14]=c3; | |
824 | $ST r9, `15*$BNSZ`(r3) #r[15]=c1; | |
825 | ||
826 | ||
31439046 | 827 | blr |
67150340 AP |
828 | .long 0 |
829 | .byte 0,12,0x14,0,0,0,2,0 | |
830 | .long 0 | |
d6019e16 | 831 | .size .bn_sqr_comba8,.-.bn_sqr_comba8 |
dd558806 AP |
832 | |
833 | # | |
834 | # NOTE: The following label name should be changed to | |
835 | # "bn_mul_comba4" i.e. remove the first dot | |
836 | # for the gcc compiler. This should be automatically | |
837 | # done in the build | |
838 | # | |
839 | ||
840 | .align 4 | |
841 | .bn_mul_comba4: | |
842 | # | |
843 | # This is an optimized version of the bn_mul_comba4 routine. | |
844 | # | |
845 | # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
846 | # r3 contains r | |
847 | # r4 contains a | |
848 | # r5 contains b | |
849 | # r6, r7 are the 2 BN_ULONGs being multiplied. | |
850 | # r8, r9 are the results of the 32x32 giving 64 multiply. | |
851 | # r10, r11, r12 are the equivalents of c1, c2, and c3. | |
852 | # | |
853 | xor r0,r0,r0 #r0=0. Used in addze below. | |
854 | #mul_add_c(a[0],b[0],c1,c2,c3); | |
609b0852 DB |
855 | $LD r6,`0*$BNSZ`(r4) |
856 | $LD r7,`0*$BNSZ`(r5) | |
857 | $UMULL r10,r6,r7 | |
858 | $UMULH r11,r6,r7 | |
dd558806 AP |
859 | $ST r10,`0*$BNSZ`(r3) #r[0]=c1 |
860 | #mul_add_c(a[0],b[1],c2,c3,c1); | |
609b0852 | 861 | $LD r7,`1*$BNSZ`(r5) |
dd558806 AP |
862 | $UMULL r8,r6,r7 |
863 | $UMULH r9,r6,r7 | |
864 | addc r11,r8,r11 | |
865 | adde r12,r9,r0 | |
866 | addze r10,r0 | |
867 | #mul_add_c(a[1],b[0],c2,c3,c1); | |
609b0852 DB |
868 | $LD r6, `1*$BNSZ`(r4) |
869 | $LD r7, `0*$BNSZ`(r5) | |
dd558806 AP |
870 | $UMULL r8,r6,r7 |
871 | $UMULH r9,r6,r7 | |
872 | addc r11,r8,r11 | |
873 | adde r12,r9,r12 | |
874 | addze r10,r10 | |
875 | $ST r11,`1*$BNSZ`(r3) #r[1]=c2 | |
876 | #mul_add_c(a[2],b[0],c3,c1,c2); | |
609b0852 | 877 | $LD r6,`2*$BNSZ`(r4) |
dd558806 AP |
878 | $UMULL r8,r6,r7 |
879 | $UMULH r9,r6,r7 | |
880 | addc r12,r8,r12 | |
881 | adde r10,r9,r10 | |
882 | addze r11,r0 | |
883 | #mul_add_c(a[1],b[1],c3,c1,c2); | |
609b0852 DB |
884 | $LD r6,`1*$BNSZ`(r4) |
885 | $LD r7,`1*$BNSZ`(r5) | |
dd558806 AP |
886 | $UMULL r8,r6,r7 |
887 | $UMULH r9,r6,r7 | |
888 | addc r12,r8,r12 | |
889 | adde r10,r9,r10 | |
890 | addze r11,r11 | |
891 | #mul_add_c(a[0],b[2],c3,c1,c2); | |
609b0852 DB |
892 | $LD r6,`0*$BNSZ`(r4) |
893 | $LD r7,`2*$BNSZ`(r5) | |
dd558806 AP |
894 | $UMULL r8,r6,r7 |
895 | $UMULH r9,r6,r7 | |
896 | addc r12,r8,r12 | |
897 | adde r10,r9,r10 | |
898 | addze r11,r11 | |
899 | $ST r12,`2*$BNSZ`(r3) #r[2]=c3 | |
900 | #mul_add_c(a[0],b[3],c1,c2,c3); | |
609b0852 | 901 | $LD r7,`3*$BNSZ`(r5) |
dd558806 AP |
902 | $UMULL r8,r6,r7 |
903 | $UMULH r9,r6,r7 | |
904 | addc r10,r8,r10 | |
905 | adde r11,r9,r11 | |
906 | addze r12,r0 | |
907 | #mul_add_c(a[1],b[2],c1,c2,c3); | |
908 | $LD r6,`1*$BNSZ`(r4) | |
909 | $LD r7,`2*$BNSZ`(r5) | |
910 | $UMULL r8,r6,r7 | |
911 | $UMULH r9,r6,r7 | |
912 | addc r10,r8,r10 | |
913 | adde r11,r9,r11 | |
914 | addze r12,r12 | |
915 | #mul_add_c(a[2],b[1],c1,c2,c3); | |
916 | $LD r6,`2*$BNSZ`(r4) | |
917 | $LD r7,`1*$BNSZ`(r5) | |
918 | $UMULL r8,r6,r7 | |
919 | $UMULH r9,r6,r7 | |
920 | addc r10,r8,r10 | |
921 | adde r11,r9,r11 | |
922 | addze r12,r12 | |
923 | #mul_add_c(a[3],b[0],c1,c2,c3); | |
924 | $LD r6,`3*$BNSZ`(r4) | |
925 | $LD r7,`0*$BNSZ`(r5) | |
926 | $UMULL r8,r6,r7 | |
927 | $UMULH r9,r6,r7 | |
928 | addc r10,r8,r10 | |
929 | adde r11,r9,r11 | |
930 | addze r12,r12 | |
931 | $ST r10,`3*$BNSZ`(r3) #r[3]=c1 | |
932 | #mul_add_c(a[3],b[1],c2,c3,c1); | |
609b0852 | 933 | $LD r7,`1*$BNSZ`(r5) |
dd558806 AP |
934 | $UMULL r8,r6,r7 |
935 | $UMULH r9,r6,r7 | |
936 | addc r11,r8,r11 | |
937 | adde r12,r9,r12 | |
938 | addze r10,r0 | |
939 | #mul_add_c(a[2],b[2],c2,c3,c1); | |
940 | $LD r6,`2*$BNSZ`(r4) | |
941 | $LD r7,`2*$BNSZ`(r5) | |
942 | $UMULL r8,r6,r7 | |
943 | $UMULH r9,r6,r7 | |
944 | addc r11,r8,r11 | |
945 | adde r12,r9,r12 | |
946 | addze r10,r10 | |
947 | #mul_add_c(a[1],b[3],c2,c3,c1); | |
948 | $LD r6,`1*$BNSZ`(r4) | |
949 | $LD r7,`3*$BNSZ`(r5) | |
950 | $UMULL r8,r6,r7 | |
951 | $UMULH r9,r6,r7 | |
952 | addc r11,r8,r11 | |
953 | adde r12,r9,r12 | |
954 | addze r10,r10 | |
955 | $ST r11,`4*$BNSZ`(r3) #r[4]=c2 | |
956 | #mul_add_c(a[2],b[3],c3,c1,c2); | |
609b0852 | 957 | $LD r6,`2*$BNSZ`(r4) |
dd558806 AP |
958 | $UMULL r8,r6,r7 |
959 | $UMULH r9,r6,r7 | |
960 | addc r12,r8,r12 | |
961 | adde r10,r9,r10 | |
962 | addze r11,r0 | |
963 | #mul_add_c(a[3],b[2],c3,c1,c2); | |
964 | $LD r6,`3*$BNSZ`(r4) | |
09f40a3c | 965 | $LD r7,`2*$BNSZ`(r5) |
dd558806 AP |
966 | $UMULL r8,r6,r7 |
967 | $UMULH r9,r6,r7 | |
968 | addc r12,r8,r12 | |
969 | adde r10,r9,r10 | |
970 | addze r11,r11 | |
971 | $ST r12,`5*$BNSZ`(r3) #r[5]=c3 | |
972 | #mul_add_c(a[3],b[3],c1,c2,c3); | |
609b0852 | 973 | $LD r7,`3*$BNSZ`(r5) |
dd558806 AP |
974 | $UMULL r8,r6,r7 |
975 | $UMULH r9,r6,r7 | |
976 | addc r10,r8,r10 | |
977 | adde r11,r9,r11 | |
978 | ||
979 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | |
980 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | |
31439046 | 981 | blr |
67150340 AP |
982 | .long 0 |
983 | .byte 0,12,0x14,0,0,0,3,0 | |
984 | .long 0 | |
d6019e16 | 985 | .size .bn_mul_comba4,.-.bn_mul_comba4 |
dd558806 AP |
986 | |
987 | # | |
988 | # NOTE: The following label name should be changed to | |
989 | # "bn_mul_comba8" i.e. remove the first dot | |
990 | # for the gcc compiler. This should be automatically | |
991 | # done in the build | |
992 | # | |
609b0852 | 993 | |
dd558806 AP |
994 | .align 4 |
995 | .bn_mul_comba8: | |
996 | # | |
997 | # Optimized version of the bn_mul_comba8 routine. | |
998 | # | |
999 | # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
1000 | # r3 contains r | |
1001 | # r4 contains a | |
1002 | # r5 contains b | |
1003 | # r6, r7 are the 2 BN_ULONGs being multiplied. | |
1004 | # r8, r9 are the results of the 32x32 giving 64 multiply. | |
1005 | # r10, r11, r12 are the equivalents of c1, c2, and c3. | |
1006 | # | |
1007 | xor r0,r0,r0 #r0=0. Used in addze below. | |
609b0852 | 1008 | |
dd558806 AP |
1009 | #mul_add_c(a[0],b[0],c1,c2,c3); |
1010 | $LD r6,`0*$BNSZ`(r4) #a[0] | |
1011 | $LD r7,`0*$BNSZ`(r5) #b[0] | |
1012 | $UMULL r10,r6,r7 | |
1013 | $UMULH r11,r6,r7 | |
1014 | $ST r10,`0*$BNSZ`(r3) #r[0]=c1; | |
1015 | #mul_add_c(a[0],b[1],c2,c3,c1); | |
1016 | $LD r7,`1*$BNSZ`(r5) | |
1017 | $UMULL r8,r6,r7 | |
1018 | $UMULH r9,r6,r7 | |
1019 | addc r11,r11,r8 | |
60250017 | 1020 | addze r12,r9 # since we didn't set r12 to zero before. |
dd558806 AP |
1021 | addze r10,r0 |
1022 | #mul_add_c(a[1],b[0],c2,c3,c1); | |
1023 | $LD r6,`1*$BNSZ`(r4) | |
1024 | $LD r7,`0*$BNSZ`(r5) | |
1025 | $UMULL r8,r6,r7 | |
1026 | $UMULH r9,r6,r7 | |
1027 | addc r11,r11,r8 | |
1028 | adde r12,r12,r9 | |
1029 | addze r10,r10 | |
1030 | $ST r11,`1*$BNSZ`(r3) #r[1]=c2; | |
1031 | #mul_add_c(a[2],b[0],c3,c1,c2); | |
1032 | $LD r6,`2*$BNSZ`(r4) | |
1033 | $UMULL r8,r6,r7 | |
1034 | $UMULH r9,r6,r7 | |
1035 | addc r12,r12,r8 | |
1036 | adde r10,r10,r9 | |
1037 | addze r11,r0 | |
1038 | #mul_add_c(a[1],b[1],c3,c1,c2); | |
1039 | $LD r6,`1*$BNSZ`(r4) | |
1040 | $LD r7,`1*$BNSZ`(r5) | |
1041 | $UMULL r8,r6,r7 | |
1042 | $UMULH r9,r6,r7 | |
1043 | addc r12,r12,r8 | |
1044 | adde r10,r10,r9 | |
1045 | addze r11,r11 | |
1046 | #mul_add_c(a[0],b[2],c3,c1,c2); | |
1047 | $LD r6,`0*$BNSZ`(r4) | |
1048 | $LD r7,`2*$BNSZ`(r5) | |
1049 | $UMULL r8,r6,r7 | |
1050 | $UMULH r9,r6,r7 | |
1051 | addc r12,r12,r8 | |
1052 | adde r10,r10,r9 | |
1053 | addze r11,r11 | |
1054 | $ST r12,`2*$BNSZ`(r3) #r[2]=c3; | |
1055 | #mul_add_c(a[0],b[3],c1,c2,c3); | |
1056 | $LD r7,`3*$BNSZ`(r5) | |
1057 | $UMULL r8,r6,r7 | |
1058 | $UMULH r9,r6,r7 | |
1059 | addc r10,r10,r8 | |
1060 | adde r11,r11,r9 | |
1061 | addze r12,r0 | |
1062 | #mul_add_c(a[1],b[2],c1,c2,c3); | |
1063 | $LD r6,`1*$BNSZ`(r4) | |
1064 | $LD r7,`2*$BNSZ`(r5) | |
1065 | $UMULL r8,r6,r7 | |
1066 | $UMULH r9,r6,r7 | |
1067 | addc r10,r10,r8 | |
1068 | adde r11,r11,r9 | |
1069 | addze r12,r12 | |
609b0852 | 1070 | |
dd558806 AP |
1071 | #mul_add_c(a[2],b[1],c1,c2,c3); |
1072 | $LD r6,`2*$BNSZ`(r4) | |
1073 | $LD r7,`1*$BNSZ`(r5) | |
1074 | $UMULL r8,r6,r7 | |
1075 | $UMULH r9,r6,r7 | |
1076 | addc r10,r10,r8 | |
1077 | adde r11,r11,r9 | |
1078 | addze r12,r12 | |
1079 | #mul_add_c(a[3],b[0],c1,c2,c3); | |
1080 | $LD r6,`3*$BNSZ`(r4) | |
1081 | $LD r7,`0*$BNSZ`(r5) | |
1082 | $UMULL r8,r6,r7 | |
1083 | $UMULH r9,r6,r7 | |
1084 | addc r10,r10,r8 | |
1085 | adde r11,r11,r9 | |
1086 | addze r12,r12 | |
1087 | $ST r10,`3*$BNSZ`(r3) #r[3]=c1; | |
1088 | #mul_add_c(a[4],b[0],c2,c3,c1); | |
1089 | $LD r6,`4*$BNSZ`(r4) | |
1090 | $UMULL r8,r6,r7 | |
1091 | $UMULH r9,r6,r7 | |
1092 | addc r11,r11,r8 | |
1093 | adde r12,r12,r9 | |
1094 | addze r10,r0 | |
1095 | #mul_add_c(a[3],b[1],c2,c3,c1); | |
1096 | $LD r6,`3*$BNSZ`(r4) | |
1097 | $LD r7,`1*$BNSZ`(r5) | |
1098 | $UMULL r8,r6,r7 | |
1099 | $UMULH r9,r6,r7 | |
1100 | addc r11,r11,r8 | |
1101 | adde r12,r12,r9 | |
1102 | addze r10,r10 | |
1103 | #mul_add_c(a[2],b[2],c2,c3,c1); | |
1104 | $LD r6,`2*$BNSZ`(r4) | |
1105 | $LD r7,`2*$BNSZ`(r5) | |
1106 | $UMULL r8,r6,r7 | |
1107 | $UMULH r9,r6,r7 | |
1108 | addc r11,r11,r8 | |
1109 | adde r12,r12,r9 | |
1110 | addze r10,r10 | |
1111 | #mul_add_c(a[1],b[3],c2,c3,c1); | |
1112 | $LD r6,`1*$BNSZ`(r4) | |
1113 | $LD r7,`3*$BNSZ`(r5) | |
1114 | $UMULL r8,r6,r7 | |
1115 | $UMULH r9,r6,r7 | |
1116 | addc r11,r11,r8 | |
1117 | adde r12,r12,r9 | |
1118 | addze r10,r10 | |
1119 | #mul_add_c(a[0],b[4],c2,c3,c1); | |
1120 | $LD r6,`0*$BNSZ`(r4) | |
1121 | $LD r7,`4*$BNSZ`(r5) | |
1122 | $UMULL r8,r6,r7 | |
1123 | $UMULH r9,r6,r7 | |
1124 | addc r11,r11,r8 | |
1125 | adde r12,r12,r9 | |
1126 | addze r10,r10 | |
1127 | $ST r11,`4*$BNSZ`(r3) #r[4]=c2; | |
1128 | #mul_add_c(a[0],b[5],c3,c1,c2); | |
1129 | $LD r7,`5*$BNSZ`(r5) | |
1130 | $UMULL r8,r6,r7 | |
1131 | $UMULH r9,r6,r7 | |
1132 | addc r12,r12,r8 | |
1133 | adde r10,r10,r9 | |
1134 | addze r11,r0 | |
1135 | #mul_add_c(a[1],b[4],c3,c1,c2); | |
609b0852 | 1136 | $LD r6,`1*$BNSZ`(r4) |
dd558806 AP |
1137 | $LD r7,`4*$BNSZ`(r5) |
1138 | $UMULL r8,r6,r7 | |
1139 | $UMULH r9,r6,r7 | |
1140 | addc r12,r12,r8 | |
1141 | adde r10,r10,r9 | |
1142 | addze r11,r11 | |
1143 | #mul_add_c(a[2],b[3],c3,c1,c2); | |
609b0852 | 1144 | $LD r6,`2*$BNSZ`(r4) |
dd558806 AP |
1145 | $LD r7,`3*$BNSZ`(r5) |
1146 | $UMULL r8,r6,r7 | |
1147 | $UMULH r9,r6,r7 | |
1148 | addc r12,r12,r8 | |
1149 | adde r10,r10,r9 | |
1150 | addze r11,r11 | |
1151 | #mul_add_c(a[3],b[2],c3,c1,c2); | |
609b0852 | 1152 | $LD r6,`3*$BNSZ`(r4) |
dd558806 AP |
1153 | $LD r7,`2*$BNSZ`(r5) |
1154 | $UMULL r8,r6,r7 | |
1155 | $UMULH r9,r6,r7 | |
1156 | addc r12,r12,r8 | |
1157 | adde r10,r10,r9 | |
1158 | addze r11,r11 | |
1159 | #mul_add_c(a[4],b[1],c3,c1,c2); | |
609b0852 | 1160 | $LD r6,`4*$BNSZ`(r4) |
dd558806 AP |
1161 | $LD r7,`1*$BNSZ`(r5) |
1162 | $UMULL r8,r6,r7 | |
1163 | $UMULH r9,r6,r7 | |
1164 | addc r12,r12,r8 | |
1165 | adde r10,r10,r9 | |
1166 | addze r11,r11 | |
1167 | #mul_add_c(a[5],b[0],c3,c1,c2); | |
609b0852 | 1168 | $LD r6,`5*$BNSZ`(r4) |
dd558806 AP |
1169 | $LD r7,`0*$BNSZ`(r5) |
1170 | $UMULL r8,r6,r7 | |
1171 | $UMULH r9,r6,r7 | |
1172 | addc r12,r12,r8 | |
1173 | adde r10,r10,r9 | |
1174 | addze r11,r11 | |
1175 | $ST r12,`5*$BNSZ`(r3) #r[5]=c3; | |
1176 | #mul_add_c(a[6],b[0],c1,c2,c3); | |
1177 | $LD r6,`6*$BNSZ`(r4) | |
1178 | $UMULL r8,r6,r7 | |
1179 | $UMULH r9,r6,r7 | |
1180 | addc r10,r10,r8 | |
1181 | adde r11,r11,r9 | |
1182 | addze r12,r0 | |
1183 | #mul_add_c(a[5],b[1],c1,c2,c3); | |
1184 | $LD r6,`5*$BNSZ`(r4) | |
1185 | $LD r7,`1*$BNSZ`(r5) | |
1186 | $UMULL r8,r6,r7 | |
1187 | $UMULH r9,r6,r7 | |
1188 | addc r10,r10,r8 | |
1189 | adde r11,r11,r9 | |
1190 | addze r12,r12 | |
1191 | #mul_add_c(a[4],b[2],c1,c2,c3); | |
1192 | $LD r6,`4*$BNSZ`(r4) | |
1193 | $LD r7,`2*$BNSZ`(r5) | |
1194 | $UMULL r8,r6,r7 | |
1195 | $UMULH r9,r6,r7 | |
1196 | addc r10,r10,r8 | |
1197 | adde r11,r11,r9 | |
1198 | addze r12,r12 | |
1199 | #mul_add_c(a[3],b[3],c1,c2,c3); | |
1200 | $LD r6,`3*$BNSZ`(r4) | |
1201 | $LD r7,`3*$BNSZ`(r5) | |
1202 | $UMULL r8,r6,r7 | |
1203 | $UMULH r9,r6,r7 | |
1204 | addc r10,r10,r8 | |
1205 | adde r11,r11,r9 | |
1206 | addze r12,r12 | |
1207 | #mul_add_c(a[2],b[4],c1,c2,c3); | |
1208 | $LD r6,`2*$BNSZ`(r4) | |
1209 | $LD r7,`4*$BNSZ`(r5) | |
1210 | $UMULL r8,r6,r7 | |
1211 | $UMULH r9,r6,r7 | |
1212 | addc r10,r10,r8 | |
1213 | adde r11,r11,r9 | |
1214 | addze r12,r12 | |
1215 | #mul_add_c(a[1],b[5],c1,c2,c3); | |
1216 | $LD r6,`1*$BNSZ`(r4) | |
1217 | $LD r7,`5*$BNSZ`(r5) | |
1218 | $UMULL r8,r6,r7 | |
1219 | $UMULH r9,r6,r7 | |
1220 | addc r10,r10,r8 | |
1221 | adde r11,r11,r9 | |
1222 | addze r12,r12 | |
1223 | #mul_add_c(a[0],b[6],c1,c2,c3); | |
1224 | $LD r6,`0*$BNSZ`(r4) | |
1225 | $LD r7,`6*$BNSZ`(r5) | |
1226 | $UMULL r8,r6,r7 | |
1227 | $UMULH r9,r6,r7 | |
1228 | addc r10,r10,r8 | |
1229 | adde r11,r11,r9 | |
1230 | addze r12,r12 | |
1231 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1; | |
1232 | #mul_add_c(a[0],b[7],c2,c3,c1); | |
1233 | $LD r7,`7*$BNSZ`(r5) | |
1234 | $UMULL r8,r6,r7 | |
1235 | $UMULH r9,r6,r7 | |
1236 | addc r11,r11,r8 | |
1237 | adde r12,r12,r9 | |
1238 | addze r10,r0 | |
1239 | #mul_add_c(a[1],b[6],c2,c3,c1); | |
1240 | $LD r6,`1*$BNSZ`(r4) | |
1241 | $LD r7,`6*$BNSZ`(r5) | |
1242 | $UMULL r8,r6,r7 | |
1243 | $UMULH r9,r6,r7 | |
1244 | addc r11,r11,r8 | |
1245 | adde r12,r12,r9 | |
1246 | addze r10,r10 | |
1247 | #mul_add_c(a[2],b[5],c2,c3,c1); | |
1248 | $LD r6,`2*$BNSZ`(r4) | |
1249 | $LD r7,`5*$BNSZ`(r5) | |
1250 | $UMULL r8,r6,r7 | |
1251 | $UMULH r9,r6,r7 | |
1252 | addc r11,r11,r8 | |
1253 | adde r12,r12,r9 | |
1254 | addze r10,r10 | |
1255 | #mul_add_c(a[3],b[4],c2,c3,c1); | |
1256 | $LD r6,`3*$BNSZ`(r4) | |
1257 | $LD r7,`4*$BNSZ`(r5) | |
1258 | $UMULL r8,r6,r7 | |
1259 | $UMULH r9,r6,r7 | |
1260 | addc r11,r11,r8 | |
1261 | adde r12,r12,r9 | |
1262 | addze r10,r10 | |
1263 | #mul_add_c(a[4],b[3],c2,c3,c1); | |
1264 | $LD r6,`4*$BNSZ`(r4) | |
1265 | $LD r7,`3*$BNSZ`(r5) | |
1266 | $UMULL r8,r6,r7 | |
1267 | $UMULH r9,r6,r7 | |
1268 | addc r11,r11,r8 | |
1269 | adde r12,r12,r9 | |
1270 | addze r10,r10 | |
1271 | #mul_add_c(a[5],b[2],c2,c3,c1); | |
1272 | $LD r6,`5*$BNSZ`(r4) | |
1273 | $LD r7,`2*$BNSZ`(r5) | |
1274 | $UMULL r8,r6,r7 | |
1275 | $UMULH r9,r6,r7 | |
1276 | addc r11,r11,r8 | |
1277 | adde r12,r12,r9 | |
1278 | addze r10,r10 | |
1279 | #mul_add_c(a[6],b[1],c2,c3,c1); | |
1280 | $LD r6,`6*$BNSZ`(r4) | |
1281 | $LD r7,`1*$BNSZ`(r5) | |
1282 | $UMULL r8,r6,r7 | |
1283 | $UMULH r9,r6,r7 | |
1284 | addc r11,r11,r8 | |
1285 | adde r12,r12,r9 | |
1286 | addze r10,r10 | |
1287 | #mul_add_c(a[7],b[0],c2,c3,c1); | |
1288 | $LD r6,`7*$BNSZ`(r4) | |
1289 | $LD r7,`0*$BNSZ`(r5) | |
1290 | $UMULL r8,r6,r7 | |
1291 | $UMULH r9,r6,r7 | |
1292 | addc r11,r11,r8 | |
1293 | adde r12,r12,r9 | |
1294 | addze r10,r10 | |
1295 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2; | |
1296 | #mul_add_c(a[7],b[1],c3,c1,c2); | |
1297 | $LD r7,`1*$BNSZ`(r5) | |
1298 | $UMULL r8,r6,r7 | |
1299 | $UMULH r9,r6,r7 | |
1300 | addc r12,r12,r8 | |
1301 | adde r10,r10,r9 | |
1302 | addze r11,r0 | |
1303 | #mul_add_c(a[6],b[2],c3,c1,c2); | |
1304 | $LD r6,`6*$BNSZ`(r4) | |
1305 | $LD r7,`2*$BNSZ`(r5) | |
1306 | $UMULL r8,r6,r7 | |
1307 | $UMULH r9,r6,r7 | |
1308 | addc r12,r12,r8 | |
1309 | adde r10,r10,r9 | |
1310 | addze r11,r11 | |
1311 | #mul_add_c(a[5],b[3],c3,c1,c2); | |
1312 | $LD r6,`5*$BNSZ`(r4) | |
1313 | $LD r7,`3*$BNSZ`(r5) | |
1314 | $UMULL r8,r6,r7 | |
1315 | $UMULH r9,r6,r7 | |
1316 | addc r12,r12,r8 | |
1317 | adde r10,r10,r9 | |
1318 | addze r11,r11 | |
1319 | #mul_add_c(a[4],b[4],c3,c1,c2); | |
1320 | $LD r6,`4*$BNSZ`(r4) | |
1321 | $LD r7,`4*$BNSZ`(r5) | |
1322 | $UMULL r8,r6,r7 | |
1323 | $UMULH r9,r6,r7 | |
1324 | addc r12,r12,r8 | |
1325 | adde r10,r10,r9 | |
1326 | addze r11,r11 | |
1327 | #mul_add_c(a[3],b[5],c3,c1,c2); | |
1328 | $LD r6,`3*$BNSZ`(r4) | |
1329 | $LD r7,`5*$BNSZ`(r5) | |
1330 | $UMULL r8,r6,r7 | |
1331 | $UMULH r9,r6,r7 | |
1332 | addc r12,r12,r8 | |
1333 | adde r10,r10,r9 | |
1334 | addze r11,r11 | |
1335 | #mul_add_c(a[2],b[6],c3,c1,c2); | |
1336 | $LD r6,`2*$BNSZ`(r4) | |
1337 | $LD r7,`6*$BNSZ`(r5) | |
1338 | $UMULL r8,r6,r7 | |
1339 | $UMULH r9,r6,r7 | |
1340 | addc r12,r12,r8 | |
1341 | adde r10,r10,r9 | |
1342 | addze r11,r11 | |
1343 | #mul_add_c(a[1],b[7],c3,c1,c2); | |
1344 | $LD r6,`1*$BNSZ`(r4) | |
1345 | $LD r7,`7*$BNSZ`(r5) | |
1346 | $UMULL r8,r6,r7 | |
1347 | $UMULH r9,r6,r7 | |
1348 | addc r12,r12,r8 | |
1349 | adde r10,r10,r9 | |
1350 | addze r11,r11 | |
1351 | $ST r12,`8*$BNSZ`(r3) #r[8]=c3; | |
1352 | #mul_add_c(a[2],b[7],c1,c2,c3); | |
1353 | $LD r6,`2*$BNSZ`(r4) | |
1354 | $UMULL r8,r6,r7 | |
1355 | $UMULH r9,r6,r7 | |
1356 | addc r10,r10,r8 | |
1357 | adde r11,r11,r9 | |
1358 | addze r12,r0 | |
1359 | #mul_add_c(a[3],b[6],c1,c2,c3); | |
1360 | $LD r6,`3*$BNSZ`(r4) | |
1361 | $LD r7,`6*$BNSZ`(r5) | |
1362 | $UMULL r8,r6,r7 | |
1363 | $UMULH r9,r6,r7 | |
1364 | addc r10,r10,r8 | |
1365 | adde r11,r11,r9 | |
1366 | addze r12,r12 | |
1367 | #mul_add_c(a[4],b[5],c1,c2,c3); | |
1368 | $LD r6,`4*$BNSZ`(r4) | |
1369 | $LD r7,`5*$BNSZ`(r5) | |
1370 | $UMULL r8,r6,r7 | |
1371 | $UMULH r9,r6,r7 | |
1372 | addc r10,r10,r8 | |
1373 | adde r11,r11,r9 | |
1374 | addze r12,r12 | |
1375 | #mul_add_c(a[5],b[4],c1,c2,c3); | |
1376 | $LD r6,`5*$BNSZ`(r4) | |
1377 | $LD r7,`4*$BNSZ`(r5) | |
1378 | $UMULL r8,r6,r7 | |
1379 | $UMULH r9,r6,r7 | |
1380 | addc r10,r10,r8 | |
1381 | adde r11,r11,r9 | |
1382 | addze r12,r12 | |
1383 | #mul_add_c(a[6],b[3],c1,c2,c3); | |
1384 | $LD r6,`6*$BNSZ`(r4) | |
1385 | $LD r7,`3*$BNSZ`(r5) | |
1386 | $UMULL r8,r6,r7 | |
1387 | $UMULH r9,r6,r7 | |
1388 | addc r10,r10,r8 | |
1389 | adde r11,r11,r9 | |
1390 | addze r12,r12 | |
1391 | #mul_add_c(a[7],b[2],c1,c2,c3); | |
1392 | $LD r6,`7*$BNSZ`(r4) | |
1393 | $LD r7,`2*$BNSZ`(r5) | |
1394 | $UMULL r8,r6,r7 | |
1395 | $UMULH r9,r6,r7 | |
1396 | addc r10,r10,r8 | |
1397 | adde r11,r11,r9 | |
1398 | addze r12,r12 | |
1399 | $ST r10,`9*$BNSZ`(r3) #r[9]=c1; | |
1400 | #mul_add_c(a[7],b[3],c2,c3,c1); | |
1401 | $LD r7,`3*$BNSZ`(r5) | |
1402 | $UMULL r8,r6,r7 | |
1403 | $UMULH r9,r6,r7 | |
1404 | addc r11,r11,r8 | |
1405 | adde r12,r12,r9 | |
1406 | addze r10,r0 | |
1407 | #mul_add_c(a[6],b[4],c2,c3,c1); | |
1408 | $LD r6,`6*$BNSZ`(r4) | |
1409 | $LD r7,`4*$BNSZ`(r5) | |
1410 | $UMULL r8,r6,r7 | |
1411 | $UMULH r9,r6,r7 | |
1412 | addc r11,r11,r8 | |
1413 | adde r12,r12,r9 | |
1414 | addze r10,r10 | |
1415 | #mul_add_c(a[5],b[5],c2,c3,c1); | |
1416 | $LD r6,`5*$BNSZ`(r4) | |
1417 | $LD r7,`5*$BNSZ`(r5) | |
1418 | $UMULL r8,r6,r7 | |
1419 | $UMULH r9,r6,r7 | |
1420 | addc r11,r11,r8 | |
1421 | adde r12,r12,r9 | |
1422 | addze r10,r10 | |
1423 | #mul_add_c(a[4],b[6],c2,c3,c1); | |
1424 | $LD r6,`4*$BNSZ`(r4) | |
1425 | $LD r7,`6*$BNSZ`(r5) | |
1426 | $UMULL r8,r6,r7 | |
1427 | $UMULH r9,r6,r7 | |
1428 | addc r11,r11,r8 | |
1429 | adde r12,r12,r9 | |
1430 | addze r10,r10 | |
1431 | #mul_add_c(a[3],b[7],c2,c3,c1); | |
1432 | $LD r6,`3*$BNSZ`(r4) | |
1433 | $LD r7,`7*$BNSZ`(r5) | |
1434 | $UMULL r8,r6,r7 | |
1435 | $UMULH r9,r6,r7 | |
1436 | addc r11,r11,r8 | |
1437 | adde r12,r12,r9 | |
1438 | addze r10,r10 | |
1439 | $ST r11,`10*$BNSZ`(r3) #r[10]=c2; | |
1440 | #mul_add_c(a[4],b[7],c3,c1,c2); | |
1441 | $LD r6,`4*$BNSZ`(r4) | |
1442 | $UMULL r8,r6,r7 | |
1443 | $UMULH r9,r6,r7 | |
1444 | addc r12,r12,r8 | |
1445 | adde r10,r10,r9 | |
1446 | addze r11,r0 | |
1447 | #mul_add_c(a[5],b[6],c3,c1,c2); | |
1448 | $LD r6,`5*$BNSZ`(r4) | |
1449 | $LD r7,`6*$BNSZ`(r5) | |
1450 | $UMULL r8,r6,r7 | |
1451 | $UMULH r9,r6,r7 | |
1452 | addc r12,r12,r8 | |
1453 | adde r10,r10,r9 | |
1454 | addze r11,r11 | |
1455 | #mul_add_c(a[6],b[5],c3,c1,c2); | |
1456 | $LD r6,`6*$BNSZ`(r4) | |
1457 | $LD r7,`5*$BNSZ`(r5) | |
1458 | $UMULL r8,r6,r7 | |
1459 | $UMULH r9,r6,r7 | |
1460 | addc r12,r12,r8 | |
1461 | adde r10,r10,r9 | |
1462 | addze r11,r11 | |
1463 | #mul_add_c(a[7],b[4],c3,c1,c2); | |
1464 | $LD r6,`7*$BNSZ`(r4) | |
1465 | $LD r7,`4*$BNSZ`(r5) | |
1466 | $UMULL r8,r6,r7 | |
1467 | $UMULH r9,r6,r7 | |
1468 | addc r12,r12,r8 | |
1469 | adde r10,r10,r9 | |
1470 | addze r11,r11 | |
1471 | $ST r12,`11*$BNSZ`(r3) #r[11]=c3; | |
1472 | #mul_add_c(a[7],b[5],c1,c2,c3); | |
1473 | $LD r7,`5*$BNSZ`(r5) | |
1474 | $UMULL r8,r6,r7 | |
1475 | $UMULH r9,r6,r7 | |
1476 | addc r10,r10,r8 | |
1477 | adde r11,r11,r9 | |
1478 | addze r12,r0 | |
1479 | #mul_add_c(a[6],b[6],c1,c2,c3); | |
1480 | $LD r6,`6*$BNSZ`(r4) | |
1481 | $LD r7,`6*$BNSZ`(r5) | |
1482 | $UMULL r8,r6,r7 | |
1483 | $UMULH r9,r6,r7 | |
1484 | addc r10,r10,r8 | |
1485 | adde r11,r11,r9 | |
1486 | addze r12,r12 | |
1487 | #mul_add_c(a[5],b[7],c1,c2,c3); | |
1488 | $LD r6,`5*$BNSZ`(r4) | |
1489 | $LD r7,`7*$BNSZ`(r5) | |
1490 | $UMULL r8,r6,r7 | |
1491 | $UMULH r9,r6,r7 | |
1492 | addc r10,r10,r8 | |
1493 | adde r11,r11,r9 | |
1494 | addze r12,r12 | |
1495 | $ST r10,`12*$BNSZ`(r3) #r[12]=c1; | |
1496 | #mul_add_c(a[6],b[7],c2,c3,c1); | |
1497 | $LD r6,`6*$BNSZ`(r4) | |
1498 | $UMULL r8,r6,r7 | |
1499 | $UMULH r9,r6,r7 | |
1500 | addc r11,r11,r8 | |
1501 | adde r12,r12,r9 | |
1502 | addze r10,r0 | |
1503 | #mul_add_c(a[7],b[6],c2,c3,c1); | |
1504 | $LD r6,`7*$BNSZ`(r4) | |
1505 | $LD r7,`6*$BNSZ`(r5) | |
1506 | $UMULL r8,r6,r7 | |
1507 | $UMULH r9,r6,r7 | |
1508 | addc r11,r11,r8 | |
1509 | adde r12,r12,r9 | |
1510 | addze r10,r10 | |
1511 | $ST r11,`13*$BNSZ`(r3) #r[13]=c2; | |
1512 | #mul_add_c(a[7],b[7],c3,c1,c2); | |
1513 | $LD r7,`7*$BNSZ`(r5) | |
1514 | $UMULL r8,r6,r7 | |
1515 | $UMULH r9,r6,r7 | |
1516 | addc r12,r12,r8 | |
1517 | adde r10,r10,r9 | |
1518 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | |
1519 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | |
31439046 | 1520 | blr |
67150340 AP |
1521 | .long 0 |
1522 | .byte 0,12,0x14,0,0,0,3,0 | |
1523 | .long 0 | |
d6019e16 | 1524 | .size .bn_mul_comba8,.-.bn_mul_comba8 |
dd558806 AP |
1525 | |
1526 | # | |
1527 | # NOTE: The following label name should be changed to | |
1528 | # "bn_sub_words" i.e. remove the first dot | |
1529 | # for the gcc compiler. This should be automatically | |
1530 | # done in the build | |
1531 | # | |
1532 | # | |
1533 | .align 4 | |
1534 | .bn_sub_words: | |
1535 | # | |
1536 | # Handcoded version of bn_sub_words | |
1537 | # | |
1538 | #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
1539 | # | |
1540 | # r3 = r | |
1541 | # r4 = a | |
1542 | # r5 = b | |
1543 | # r6 = n | |
1544 | # | |
1545 | # Note: No loop unrolling done since this is not a performance | |
1546 | # critical loop. | |
1547 | ||
1548 | xor r0,r0,r0 #set r0 = 0 | |
1549 | # | |
1550 | # check for r6 = 0 AND set carry bit. | |
1551 | # | |
1552 | subfc. r7,r0,r6 # If r6 is 0 then result is 0. | |
1553 | # if r6 > 0 then result !=0 | |
1554 | # In either case carry bit is set. | |
31439046 | 1555 | beq Lppcasm_sub_adios |
dd558806 AP |
1556 | addi r4,r4,-$BNSZ |
1557 | addi r3,r3,-$BNSZ | |
1558 | addi r5,r5,-$BNSZ | |
1559 | mtctr r6 | |
609b0852 | 1560 | Lppcasm_sub_mainloop: |
dd558806 AP |
1561 | $LDU r7,$BNSZ(r4) |
1562 | $LDU r8,$BNSZ(r5) | |
1563 | subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) | |
1564 | # if carry = 1 this is r7-r8. Else it | |
1565 | # is r7-r8 -1 as we need. | |
1566 | $STU r6,$BNSZ(r3) | |
20b88bb1 | 1567 | bdnz Lppcasm_sub_mainloop |
609b0852 | 1568 | Lppcasm_sub_adios: |
dd558806 AP |
1569 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 |
1570 | andi. r3,r3,1 # keep only last bit. | |
31439046 | 1571 | blr |
67150340 AP |
1572 | .long 0 |
1573 | .byte 0,12,0x14,0,0,0,4,0 | |
1574 | .long 0 | |
d6019e16 | 1575 | .size .bn_sub_words,.-.bn_sub_words |
dd558806 AP |
1576 | |
1577 | # | |
1578 | # NOTE: The following label name should be changed to | |
1579 | # "bn_add_words" i.e. remove the first dot | |
1580 | # for the gcc compiler. This should be automatically | |
1581 | # done in the build | |
1582 | # | |
1583 | ||
1584 | .align 4 | |
1585 | .bn_add_words: | |
1586 | # | |
1587 | # Handcoded version of bn_add_words | |
1588 | # | |
1589 | #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
1590 | # | |
1591 | # r3 = r | |
1592 | # r4 = a | |
1593 | # r5 = b | |
1594 | # r6 = n | |
1595 | # | |
1596 | # Note: No loop unrolling done since this is not a performance | |
1597 | # critical loop. | |
1598 | ||
1599 | xor r0,r0,r0 | |
1600 | # | |
1601 | # check for r6 = 0. Is this needed? | |
1602 | # | |
1603 | addic. r6,r6,0 #test r6 and clear carry bit. | |
31439046 | 1604 | beq Lppcasm_add_adios |
dd558806 AP |
1605 | addi r4,r4,-$BNSZ |
1606 | addi r3,r3,-$BNSZ | |
1607 | addi r5,r5,-$BNSZ | |
1608 | mtctr r6 | |
609b0852 | 1609 | Lppcasm_add_mainloop: |
dd558806 AP |
1610 | $LDU r7,$BNSZ(r4) |
1611 | $LDU r8,$BNSZ(r5) | |
1612 | adde r8,r7,r8 | |
1613 | $STU r8,$BNSZ(r3) | |
20b88bb1 | 1614 | bdnz Lppcasm_add_mainloop |
609b0852 | 1615 | Lppcasm_add_adios: |
dd558806 | 1616 | addze r3,r0 #return carry bit. |
31439046 | 1617 | blr |
67150340 AP |
1618 | .long 0 |
1619 | .byte 0,12,0x14,0,0,0,4,0 | |
1620 | .long 0 | |
d6019e16 | 1621 | .size .bn_add_words,.-.bn_add_words |
dd558806 AP |
1622 | |
1623 | # | |
1624 | # NOTE: The following label name should be changed to | |
1625 | # "bn_div_words" i.e. remove the first dot | |
1626 | # for the gcc compiler. This should be automatically | |
1627 | # done in the build | |
1628 | # | |
1629 | ||
1630 | .align 4 | |
1631 | .bn_div_words: | |
1632 | # | |
1633 | # This is a cleaned up version of code generated by | |
1634 | # the AIX compiler. The only optimization is to use | |
1635 | # the PPC instruction to count leading zeros instead | |
1636 | # of call to num_bits_word. Since this was compiled | |
1637 | # only at level -O2 we can possibly squeeze it more? | |
609b0852 | 1638 | # |
dd558806 AP |
1639 | # r3 = h |
1640 | # r4 = l | |
1641 | # r5 = d | |
609b0852 | 1642 | |
dd558806 | 1643 | $UCMPI 0,r5,0 # compare r5 and 0 |
31439046 | 1644 | bne Lppcasm_div1 # proceed if d!=0 |
dd558806 | 1645 | li r3,-1 # d=0 return -1 |
31439046 | 1646 | blr |
dd558806 AP |
1647 | Lppcasm_div1: |
1648 | xor r0,r0,r0 #r0=0 | |
aaa5dc61 AP |
1649 | li r8,$BITS |
1650 | $CNTLZ. r7,r5 #r7 = num leading 0s in d. | |
31439046 | 1651 | beq Lppcasm_div2 #proceed if no leading zeros |
aaa5dc61 AP |
1652 | subf r8,r7,r8 #r8 = BN_num_bits_word(d) |
1653 | $SHR. r9,r3,r8 #are there any bits above r8'th? | |
31efffbd | 1654 | $TR 16,r9,r0 #if there're, signal to dump core... |
dd558806 AP |
1655 | Lppcasm_div2: |
1656 | $UCMP 0,r3,r5 #h>=d? | |
31439046 | 1657 | blt Lppcasm_div3 #goto Lppcasm_div3 if not |
609b0852 | 1658 | subf r3,r5,r3 #h-=d ; |
dd558806 AP |
1659 | Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i |
1660 | cmpi 0,0,r7,0 # is (i == 0)? | |
31439046 | 1661 | beq Lppcasm_div4 |
dd558806 AP |
1662 | $SHL r3,r3,r7 # h = (h<< i) |
1663 | $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) | |
1664 | $SHL r5,r5,r7 # d<<=i | |
1665 | or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) | |
1666 | $SHL r4,r4,r7 # l <<=i | |
1667 | Lppcasm_div4: | |
1668 | $SHRI r9,r5,`$BITS/2` # r9 = dh | |
1669 | # dl will be computed when needed | |
1670 | # as it saves registers. | |
1671 | li r6,2 #r6=2 | |
1672 | mtctr r6 #counter will be in count. | |
609b0852 | 1673 | Lppcasm_divouterloop: |
dd558806 AP |
1674 | $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) |
1675 | $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 | |
1676 | # compute here for innerloop. | |
1677 | $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh | |
31439046 | 1678 | bne Lppcasm_div5 # goto Lppcasm_div5 if not |
dd558806 AP |
1679 | |
1680 | li r8,-1 | |
609b0852 | 1681 | $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l |
dd558806 AP |
1682 | b Lppcasm_div6 |
1683 | Lppcasm_div5: | |
1684 | $UDIV r8,r3,r9 #q = h/dh | |
1685 | Lppcasm_div6: | |
1686 | $UMULL r12,r9,r8 #th = q*dh | |
1687 | $CLRU r10,r5,`$BITS/2` #r10=dl | |
1688 | $UMULL r6,r8,r10 #tl = q*dl | |
609b0852 | 1689 | |
dd558806 AP |
1690 | Lppcasm_divinnerloop: |
1691 | subf r10,r12,r3 #t = h -th | |
1692 | $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... | |
1693 | addic. r7,r7,0 #test if r7 == 0. used below. | |
1694 | # now want to compute | |
1695 | # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) | |
1696 | # the following 2 instructions do that | |
1697 | $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) | |
1698 | or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) | |
31439046 AP |
1699 | $UCMP cr1,r6,r7 # compare (tl <= r7) |
1700 | bne Lppcasm_divinnerexit | |
1701 | ble cr1,Lppcasm_divinnerexit | |
dd558806 AP |
1702 | addi r8,r8,-1 #q-- |
1703 | subf r12,r9,r12 #th -=dh | |
1704 | $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. | |
1705 | subf r6,r10,r6 #tl -=dl | |
1706 | b Lppcasm_divinnerloop | |
1707 | Lppcasm_divinnerexit: | |
1708 | $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) | |
1709 | $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; | |
31439046 | 1710 | $UCMP cr1,r4,r11 # compare l and tl |
dd558806 | 1711 | add r12,r12,r10 # th+=t |
31439046 | 1712 | bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 |
dd558806 AP |
1713 | addi r12,r12,1 # th++ |
1714 | Lppcasm_div7: | |
1715 | subf r11,r11,r4 #r11=l-tl | |
31439046 AP |
1716 | $UCMP cr1,r3,r12 #compare h and th |
1717 | bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 | |
dd558806 AP |
1718 | addi r8,r8,-1 # q-- |
1719 | add r3,r5,r3 # h+=d | |
1720 | Lppcasm_div8: | |
1721 | subf r12,r12,r3 #r12 = h-th | |
1722 | $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 | |
1723 | # want to compute | |
1724 | # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 | |
1725 | # the following 2 instructions will do this. | |
1726 | $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. | |
1727 | $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 | |
31439046 | 1728 | bdz Lppcasm_div9 #if (count==0) break ; |
dd558806 AP |
1729 | $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 |
1730 | b Lppcasm_divouterloop | |
1731 | Lppcasm_div9: | |
1732 | or r3,r8,r0 | |
31439046 | 1733 | blr |
67150340 AP |
1734 | .long 0 |
1735 | .byte 0,12,0x14,0,0,0,3,0 | |
1736 | .long 0 | |
d6019e16 | 1737 | .size .bn_div_words,.-.bn_div_words |
dd558806 AP |
1738 | |
1739 | # | |
1740 | # NOTE: The following label name should be changed to | |
1741 | # "bn_sqr_words" i.e. remove the first dot | |
1742 | # for the gcc compiler. This should be automatically | |
1743 | # done in the build | |
1744 | # | |
1745 | .align 4 | |
1746 | .bn_sqr_words: | |
1747 | # | |
1748 | # Optimized version of bn_sqr_words | |
1749 | # | |
1750 | # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | |
1751 | # | |
1752 | # r3 = r | |
1753 | # r4 = a | |
1754 | # r5 = n | |
1755 | # | |
1756 | # r6 = a[i]. | |
1757 | # r7,r8 = product. | |
1758 | # | |
1759 | # No unrolling done here. Not performance critical. | |
1760 | ||
1761 | addic. r5,r5,0 #test r5. | |
31439046 | 1762 | beq Lppcasm_sqr_adios |
dd558806 AP |
1763 | addi r4,r4,-$BNSZ |
1764 | addi r3,r3,-$BNSZ | |
1765 | mtctr r5 | |
609b0852 | 1766 | Lppcasm_sqr_mainloop: |
dd558806 AP |
1767 | #sqr(r[0],r[1],a[0]); |
1768 | $LDU r6,$BNSZ(r4) | |
1769 | $UMULL r7,r6,r6 | |
1770 | $UMULH r8,r6,r6 | |
1771 | $STU r7,$BNSZ(r3) | |
1772 | $STU r8,$BNSZ(r3) | |
20b88bb1 | 1773 | bdnz Lppcasm_sqr_mainloop |
609b0852 | 1774 | Lppcasm_sqr_adios: |
31439046 | 1775 | blr |
67150340 AP |
1776 | .long 0 |
1777 | .byte 0,12,0x14,0,0,0,3,0 | |
1778 | .long 0 | |
d6019e16 | 1779 | .size .bn_sqr_words,.-.bn_sqr_words |
dd558806 AP |
1780 | |
1781 | # | |
1782 | # NOTE: The following label name should be changed to | |
1783 | # "bn_mul_words" i.e. remove the first dot | |
1784 | # for the gcc compiler. This should be automatically | |
1785 | # done in the build | |
1786 | # | |
1787 | ||
609b0852 | 1788 | .align 4 |
dd558806 AP |
1789 | .bn_mul_words: |
1790 | # | |
1791 | # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
1792 | # | |
1793 | # r3 = rp | |
1794 | # r4 = ap | |
1795 | # r5 = num | |
1796 | # r6 = w | |
1797 | xor r0,r0,r0 | |
1798 | xor r12,r12,r12 # used for carry | |
1799 | rlwinm. r7,r5,30,2,31 # num >> 2 | |
31439046 | 1800 | beq Lppcasm_mw_REM |
dd558806 | 1801 | mtctr r7 |
609b0852 | 1802 | Lppcasm_mw_LOOP: |
dd558806 AP |
1803 | #mul(rp[0],ap[0],w,c1); |
1804 | $LD r8,`0*$BNSZ`(r4) | |
1805 | $UMULL r9,r6,r8 | |
1806 | $UMULH r10,r6,r8 | |
1807 | addc r9,r9,r12 | |
1808 | #addze r10,r10 #carry is NOT ignored. | |
1809 | #will be taken care of | |
1810 | #in second spin below | |
1811 | #using adde. | |
1812 | $ST r9,`0*$BNSZ`(r3) | |
1813 | #mul(rp[1],ap[1],w,c1); | |
609b0852 | 1814 | $LD r8,`1*$BNSZ`(r4) |
dd558806 AP |
1815 | $UMULL r11,r6,r8 |
1816 | $UMULH r12,r6,r8 | |
1817 | adde r11,r11,r10 | |
1818 | #addze r12,r12 | |
1819 | $ST r11,`1*$BNSZ`(r3) | |
1820 | #mul(rp[2],ap[2],w,c1); | |
1821 | $LD r8,`2*$BNSZ`(r4) | |
1822 | $UMULL r9,r6,r8 | |
1823 | $UMULH r10,r6,r8 | |
1824 | adde r9,r9,r12 | |
1825 | #addze r10,r10 | |
1826 | $ST r9,`2*$BNSZ`(r3) | |
1827 | #mul_add(rp[3],ap[3],w,c1); | |
1828 | $LD r8,`3*$BNSZ`(r4) | |
1829 | $UMULL r11,r6,r8 | |
1830 | $UMULH r12,r6,r8 | |
1831 | adde r11,r11,r10 | |
1832 | addze r12,r12 #this spin we collect carry into | |
1833 | #r12 | |
1834 | $ST r11,`3*$BNSZ`(r3) | |
609b0852 | 1835 | |
dd558806 AP |
1836 | addi r3,r3,`4*$BNSZ` |
1837 | addi r4,r4,`4*$BNSZ` | |
20b88bb1 | 1838 | bdnz Lppcasm_mw_LOOP |
dd558806 AP |
1839 | |
1840 | Lppcasm_mw_REM: | |
1841 | andi. r5,r5,0x3 | |
31439046 | 1842 | beq Lppcasm_mw_OVER |
dd558806 AP |
1843 | #mul(rp[0],ap[0],w,c1); |
1844 | $LD r8,`0*$BNSZ`(r4) | |
1845 | $UMULL r9,r6,r8 | |
1846 | $UMULH r10,r6,r8 | |
1847 | addc r9,r9,r12 | |
1848 | addze r10,r10 | |
1849 | $ST r9,`0*$BNSZ`(r3) | |
1850 | addi r12,r10,0 | |
609b0852 | 1851 | |
dd558806 AP |
1852 | addi r5,r5,-1 |
1853 | cmpli 0,0,r5,0 | |
31439046 | 1854 | beq Lppcasm_mw_OVER |
dd558806 | 1855 | |
609b0852 | 1856 | |
dd558806 | 1857 | #mul(rp[1],ap[1],w,c1); |
609b0852 | 1858 | $LD r8,`1*$BNSZ`(r4) |
dd558806 AP |
1859 | $UMULL r9,r6,r8 |
1860 | $UMULH r10,r6,r8 | |
1861 | addc r9,r9,r12 | |
1862 | addze r10,r10 | |
1863 | $ST r9,`1*$BNSZ`(r3) | |
1864 | addi r12,r10,0 | |
609b0852 | 1865 | |
dd558806 AP |
1866 | addi r5,r5,-1 |
1867 | cmpli 0,0,r5,0 | |
31439046 | 1868 | beq Lppcasm_mw_OVER |
609b0852 | 1869 | |
dd558806 AP |
1870 | #mul_add(rp[2],ap[2],w,c1); |
1871 | $LD r8,`2*$BNSZ`(r4) | |
1872 | $UMULL r9,r6,r8 | |
1873 | $UMULH r10,r6,r8 | |
1874 | addc r9,r9,r12 | |
1875 | addze r10,r10 | |
1876 | $ST r9,`2*$BNSZ`(r3) | |
1877 | addi r12,r10,0 | |
609b0852 DB |
1878 | |
1879 | Lppcasm_mw_OVER: | |
dd558806 | 1880 | addi r3,r12,0 |
31439046 | 1881 | blr |
67150340 AP |
1882 | .long 0 |
1883 | .byte 0,12,0x14,0,0,0,4,0 | |
1884 | .long 0 | |
fca8f5de | 1885 | .size .bn_mul_words,.-.bn_mul_words |
dd558806 AP |
1886 | |
1887 | # | |
1888 | # NOTE: The following label name should be changed to | |
1889 | # "bn_mul_add_words" i.e. remove the first dot | |
1890 | # for the gcc compiler. This should be automatically | |
1891 | # done in the build | |
1892 | # | |
1893 | ||
1894 | .align 4 | |
1895 | .bn_mul_add_words: | |
1896 | # | |
1897 | # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
1898 | # | |
1899 | # r3 = rp | |
1900 | # r4 = ap | |
1901 | # r5 = num | |
1902 | # r6 = w | |
1903 | # | |
1904 | # empirical evidence suggests that unrolled version performs best!! | |
1905 | # | |
1906 | xor r0,r0,r0 #r0 = 0 | |
609b0852 | 1907 | xor r12,r12,r12 #r12 = 0 . used for carry |
dd558806 | 1908 | rlwinm. r7,r5,30,2,31 # num >> 2 |
31439046 | 1909 | beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover |
dd558806 | 1910 | mtctr r7 |
609b0852 | 1911 | Lppcasm_maw_mainloop: |
dd558806 AP |
1912 | #mul_add(rp[0],ap[0],w,c1); |
1913 | $LD r8,`0*$BNSZ`(r4) | |
1914 | $LD r11,`0*$BNSZ`(r3) | |
1915 | $UMULL r9,r6,r8 | |
1916 | $UMULH r10,r6,r8 | |
1917 | addc r9,r9,r12 #r12 is carry. | |
1918 | addze r10,r10 | |
1919 | addc r9,r9,r11 | |
1920 | #addze r10,r10 | |
1921 | #the above instruction addze | |
1922 | #is NOT needed. Carry will NOT | |
1923 | #be ignored. It's not affected | |
1924 | #by multiply and will be collected | |
1925 | #in the next spin | |
1926 | $ST r9,`0*$BNSZ`(r3) | |
609b0852 | 1927 | |
dd558806 | 1928 | #mul_add(rp[1],ap[1],w,c1); |
609b0852 | 1929 | $LD r8,`1*$BNSZ`(r4) |
dd558806 AP |
1930 | $LD r9,`1*$BNSZ`(r3) |
1931 | $UMULL r11,r6,r8 | |
1932 | $UMULH r12,r6,r8 | |
1933 | adde r11,r11,r10 #r10 is carry. | |
1934 | addze r12,r12 | |
1935 | addc r11,r11,r9 | |
1936 | #addze r12,r12 | |
1937 | $ST r11,`1*$BNSZ`(r3) | |
609b0852 | 1938 | |
dd558806 AP |
1939 | #mul_add(rp[2],ap[2],w,c1); |
1940 | $LD r8,`2*$BNSZ`(r4) | |
1941 | $UMULL r9,r6,r8 | |
1942 | $LD r11,`2*$BNSZ`(r3) | |
1943 | $UMULH r10,r6,r8 | |
1944 | adde r9,r9,r12 | |
1945 | addze r10,r10 | |
1946 | addc r9,r9,r11 | |
1947 | #addze r10,r10 | |
1948 | $ST r9,`2*$BNSZ`(r3) | |
609b0852 | 1949 | |
dd558806 AP |
1950 | #mul_add(rp[3],ap[3],w,c1); |
1951 | $LD r8,`3*$BNSZ`(r4) | |
1952 | $UMULL r11,r6,r8 | |
1953 | $LD r9,`3*$BNSZ`(r3) | |
1954 | $UMULH r12,r6,r8 | |
1955 | adde r11,r11,r10 | |
1956 | addze r12,r12 | |
1957 | addc r11,r11,r9 | |
1958 | addze r12,r12 | |
1959 | $ST r11,`3*$BNSZ`(r3) | |
1960 | addi r3,r3,`4*$BNSZ` | |
1961 | addi r4,r4,`4*$BNSZ` | |
20b88bb1 | 1962 | bdnz Lppcasm_maw_mainloop |
609b0852 | 1963 | |
dd558806 AP |
1964 | Lppcasm_maw_leftover: |
1965 | andi. r5,r5,0x3 | |
31439046 | 1966 | beq Lppcasm_maw_adios |
dd558806 AP |
1967 | addi r3,r3,-$BNSZ |
1968 | addi r4,r4,-$BNSZ | |
1969 | #mul_add(rp[0],ap[0],w,c1); | |
1970 | mtctr r5 | |
1971 | $LDU r8,$BNSZ(r4) | |
1972 | $UMULL r9,r6,r8 | |
1973 | $UMULH r10,r6,r8 | |
1974 | $LDU r11,$BNSZ(r3) | |
1975 | addc r9,r9,r11 | |
1976 | addze r10,r10 | |
1977 | addc r9,r9,r12 | |
1978 | addze r12,r10 | |
1979 | $ST r9,0(r3) | |
609b0852 | 1980 | |
31439046 | 1981 | bdz Lppcasm_maw_adios |
dd558806 | 1982 | #mul_add(rp[1],ap[1],w,c1); |
609b0852 | 1983 | $LDU r8,$BNSZ(r4) |
dd558806 AP |
1984 | $UMULL r9,r6,r8 |
1985 | $UMULH r10,r6,r8 | |
1986 | $LDU r11,$BNSZ(r3) | |
1987 | addc r9,r9,r11 | |
1988 | addze r10,r10 | |
1989 | addc r9,r9,r12 | |
1990 | addze r12,r10 | |
1991 | $ST r9,0(r3) | |
609b0852 | 1992 | |
31439046 | 1993 | bdz Lppcasm_maw_adios |
dd558806 AP |
1994 | #mul_add(rp[2],ap[2],w,c1); |
1995 | $LDU r8,$BNSZ(r4) | |
1996 | $UMULL r9,r6,r8 | |
1997 | $UMULH r10,r6,r8 | |
1998 | $LDU r11,$BNSZ(r3) | |
1999 | addc r9,r9,r11 | |
2000 | addze r10,r10 | |
2001 | addc r9,r9,r12 | |
2002 | addze r12,r10 | |
2003 | $ST r9,0(r3) | |
609b0852 DB |
2004 | |
2005 | Lppcasm_maw_adios: | |
dd558806 | 2006 | addi r3,r12,0 |
31439046 | 2007 | blr |
67150340 AP |
2008 | .long 0 |
2009 | .byte 0,12,0x14,0,0,0,4,0 | |
2010 | .long 0 | |
d6019e16 | 2011 | .size .bn_mul_add_words,.-.bn_mul_add_words |
dd558806 AP |
2012 | .align 4 |
2013 | EOF | |
31439046 AP |
2014 | $data =~ s/\`([^\`]*)\`/eval $1/gem; |
2015 | print $data; | |
a21314db | 2016 | close STDOUT or die "error closing STDOUT: $!"; |