]>
Commit | Line | Data |
---|---|---|
dd558806 AP |
1 | #!/usr/bin/env perl |
2 | # | |
3 | # Implemented as a Perl wrapper as we want to support several different | |
4 | # architectures with single file. We pick up the target based on the | |
5 | # file name we are asked to generate. | |
6 | # | |
7 | # It should be noted though that this perl code is nothing like | |
8 | # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much | |
9 | # as pre-processor to cover for platform differences in name decoration, | |
10 | # linker tables, 32-/64-bit instruction sets... | |
11 | # | |
12 | # As you might know there're several PowerPC ABI in use. Most notably | |
13 | # Linux and AIX use different 32-bit ABIs. Good news are that these ABIs | |
14 | # are similar enough to implement leaf(!) functions, which would be ABI | |
15 | # neutral. And that's what you find here: ABI neutral leaf functions. | |
16 | # In case you wonder what that is... | |
17 | # | |
18 | # AIX performance | |
19 | # | |
20 | # MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e. | |
21 | # | |
22 | # The following is the performance of 32-bit compiler | |
23 | # generated code: | |
24 | # | |
25 | # OpenSSL 0.9.6c 21 dec 2001 | |
26 | # built on: Tue Jun 11 11:06:51 EDT 2002 | |
27 | # options:bn(64,32) ... | |
28 | #compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3 | |
29 | # sign verify sign/s verify/s | |
30 | #rsa 512 bits 0.0098s 0.0009s 102.0 1170.6 | |
31 | #rsa 1024 bits 0.0507s 0.0026s 19.7 387.5 | |
32 | #rsa 2048 bits 0.3036s 0.0085s 3.3 117.1 | |
33 | #rsa 4096 bits 2.0040s 0.0299s 0.5 33.4 | |
34 | #dsa 512 bits 0.0087s 0.0106s 114.3 94.5 | |
35 | #dsa 1024 bits 0.0256s 0.0313s 39.0 32.0 | |
36 | # | |
37 | # Same bechmark with this assembler code: | |
38 | # | |
39 | #rsa 512 bits 0.0056s 0.0005s 178.6 2049.2 | |
40 | #rsa 1024 bits 0.0283s 0.0015s 35.3 674.1 | |
41 | #rsa 2048 bits 0.1744s 0.0050s 5.7 201.2 | |
42 | #rsa 4096 bits 1.1644s 0.0179s 0.9 55.7 | |
43 | #dsa 512 bits 0.0052s 0.0062s 191.6 162.0 | |
44 | #dsa 1024 bits 0.0149s 0.0180s 67.0 55.5 | |
45 | # | |
46 | # Number of operations increases by at almost 75% | |
47 | # | |
48 | # Here are performance numbers for 64-bit compiler | |
49 | # generated code: | |
50 | # | |
51 | # OpenSSL 0.9.6g [engine] 9 Aug 2002 | |
52 | # built on: Fri Apr 18 16:59:20 EDT 2003 | |
53 | # options:bn(64,64) ... | |
54 | # compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3 | |
55 | # sign verify sign/s verify/s | |
56 | #rsa 512 bits 0.0028s 0.0003s 357.1 3844.4 | |
57 | #rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7 | |
58 | #rsa 2048 bits 0.0963s 0.0028s 10.4 353.0 | |
59 | #rsa 4096 bits 0.6538s 0.0102s 1.5 98.1 | |
60 | #dsa 512 bits 0.0026s 0.0032s 382.5 313.7 | |
61 | #dsa 1024 bits 0.0081s 0.0099s 122.8 100.6 | |
62 | # | |
63 | # Same benchmark with this assembler code: | |
64 | # | |
65 | #rsa 512 bits 0.0020s 0.0002s 510.4 6273.7 | |
66 | #rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3 | |
67 | #rsa 2048 bits 0.0540s 0.0016s 18.5 622.5 | |
68 | #rsa 4096 bits 0.3700s 0.0058s 2.7 171.0 | |
69 | #dsa 512 bits 0.0016s 0.0020s 610.7 507.1 | |
70 | #dsa 1024 bits 0.0047s 0.0058s 212.5 173.2 | |
71 | # | |
72 | # Again, performance increases by at about 75% | |
73 | # | |
74 | # Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code) | |
75 | # OpenSSL 0.9.7c 30 Sep 2003 | |
76 | # | |
77 | # Original code. | |
78 | # | |
79 | #rsa 512 bits 0.0011s 0.0001s 906.1 11012.5 | |
80 | #rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1 | |
81 | #rsa 2048 bits 0.0370s 0.0010s 27.1 982.4 | |
82 | #rsa 4096 bits 0.2426s 0.0036s 4.1 280.4 | |
83 | #dsa 512 bits 0.0010s 0.0012s 1038.1 841.5 | |
84 | #dsa 1024 bits 0.0030s 0.0037s 329.6 269.7 | |
85 | #dsa 2048 bits 0.0101s 0.0127s 98.9 78.6 | |
86 | # | |
87 | # Same benchmark with this assembler code: | |
88 | # | |
89 | #rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9 | |
90 | #rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6 | |
91 | #rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5 | |
92 | #rsa 4096 bits 0.1469s 0.0022s 6.8 449.6 | |
93 | #dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2 | |
94 | #dsa 1024 bits 0.0018s 0.0023s 545.0 442.2 | |
95 | #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 | |
96 | # | |
97 | # Performance increase of ~60% | |
98 | # | |
99 | # If you have comments or suggestions to improve code send | |
100 | # me a note at schari@us.ibm.com | |
101 | # | |
102 | ||
addd641f | 103 | $flavour = shift; |
dd558806 | 104 | |
addd641f | 105 | if ($flavour =~ /32/) { |
dd558806 AP |
106 | $BITS= 32; |
107 | $BNSZ= $BITS/8; | |
108 | $ISA= "\"ppc\""; | |
109 | ||
110 | $LD= "lwz"; # load | |
111 | $LDU= "lwzu"; # load and update | |
112 | $ST= "stw"; # store | |
113 | $STU= "stwu"; # store and update | |
114 | $UMULL= "mullw"; # unsigned multiply low | |
115 | $UMULH= "mulhwu"; # unsigned multiply high | |
116 | $UDIV= "divwu"; # unsigned divide | |
117 | $UCMPI= "cmplwi"; # unsigned compare with immediate | |
118 | $UCMP= "cmplw"; # unsigned compare | |
aaa5dc61 | 119 | $CNTLZ= "cntlzw"; # count leading zeros |
dd558806 AP |
120 | $SHL= "slw"; # shift left |
121 | $SHR= "srw"; # unsigned shift right | |
122 | $SHRI= "srwi"; # unsigned shift right by immediate | |
123 | $SHLI= "slwi"; # shift left by immediate | |
124 | $CLRU= "clrlwi"; # clear upper bits | |
125 | $INSR= "insrwi"; # insert right | |
126 | $ROTL= "rotlwi"; # rotate left by immediate | |
31efffbd | 127 | $TR= "tw"; # conditional trap |
addd641f | 128 | } elsif ($flavour =~ /64/) { |
dd558806 AP |
129 | $BITS= 64; |
130 | $BNSZ= $BITS/8; | |
131 | $ISA= "\"ppc64\""; | |
132 | ||
133 | # same as above, but 64-bit mnemonics... | |
134 | $LD= "ld"; # load | |
135 | $LDU= "ldu"; # load and update | |
136 | $ST= "std"; # store | |
137 | $STU= "stdu"; # store and update | |
138 | $UMULL= "mulld"; # unsigned multiply low | |
139 | $UMULH= "mulhdu"; # unsigned multiply high | |
140 | $UDIV= "divdu"; # unsigned divide | |
141 | $UCMPI= "cmpldi"; # unsigned compare with immediate | |
142 | $UCMP= "cmpld"; # unsigned compare | |
aaa5dc61 | 143 | $CNTLZ= "cntlzd"; # count leading zeros |
dd558806 AP |
144 | $SHL= "sld"; # shift left |
145 | $SHR= "srd"; # unsigned shift right | |
146 | $SHRI= "srdi"; # unsigned shift right by immediate | |
147 | $SHLI= "sldi"; # shift left by immediate | |
148 | $CLRU= "clrldi"; # clear upper bits | |
149 | $INSR= "insrdi"; # insert right | |
150 | $ROTL= "rotldi"; # rotate left by immediate | |
31efffbd | 151 | $TR= "td"; # conditional trap |
addd641f | 152 | } else { die "nonsense $flavour"; } |
dd558806 | 153 | |
31439046 AP |
154 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
155 | ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or | |
156 | ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or | |
157 | die "can't locate ppc-xlate.pl"; | |
dd558806 | 158 | |
addd641f | 159 | open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; |
dd558806 | 160 | |
31439046 | 161 | $data=<<EOF; |
dd558806 AP |
162 | #-------------------------------------------------------------------- |
163 | # | |
164 | # | |
165 | # | |
166 | # | |
167 | # File: ppc32.s | |
168 | # | |
169 | # Created by: Suresh Chari | |
170 | # IBM Thomas J. Watson Research Library | |
171 | # Hawthorne, NY | |
172 | # | |
173 | # | |
174 | # Description: Optimized assembly routines for OpenSSL crypto | |
175 | # on the 32 bitPowerPC platform. | |
176 | # | |
177 | # | |
178 | # Version History | |
179 | # | |
180 | # 2. Fixed bn_add,bn_sub and bn_div_words, added comments, | |
181 | # cleaned up code. Also made a single version which can | |
182 | # be used for both the AIX and Linux compilers. See NOTE | |
183 | # below. | |
184 | # 12/05/03 Suresh Chari | |
185 | # (with lots of help from) Andy Polyakov | |
186 | ## | |
187 | # 1. Initial version 10/20/02 Suresh Chari | |
188 | # | |
189 | # | |
190 | # The following file works for the xlc,cc | |
191 | # and gcc compilers. | |
192 | # | |
193 | # NOTE: To get the file to link correctly with the gcc compiler | |
194 | # you have to change the names of the routines and remove | |
195 | # the first .(dot) character. This should automatically | |
196 | # be done in the build process. | |
197 | # | |
198 | # Hand optimized assembly code for the following routines | |
199 | # | |
200 | # bn_sqr_comba4 | |
201 | # bn_sqr_comba8 | |
202 | # bn_mul_comba4 | |
203 | # bn_mul_comba8 | |
204 | # bn_sub_words | |
205 | # bn_add_words | |
206 | # bn_div_words | |
207 | # bn_sqr_words | |
208 | # bn_mul_words | |
209 | # bn_mul_add_words | |
210 | # | |
211 | # NOTE: It is possible to optimize this code more for | |
212 | # specific PowerPC or Power architectures. On the Northstar | |
213 | # architecture the optimizations in this file do | |
214 | # NOT provide much improvement. | |
215 | # | |
216 | # If you have comments or suggestions to improve code send | |
217 | # me a note at schari\@us.ibm.com | |
218 | # | |
219 | #-------------------------------------------------------------------------- | |
220 | # | |
221 | # Defines to be used in the assembly code. | |
222 | # | |
31439046 AP |
223 | #.set r0,0 # we use it as storage for value of 0 |
224 | #.set SP,1 # preserved | |
225 | #.set RTOC,2 # preserved | |
226 | #.set r3,3 # 1st argument/return value | |
227 | #.set r4,4 # 2nd argument/volatile register | |
228 | #.set r5,5 # 3rd argument/volatile register | |
229 | #.set r6,6 # ... | |
230 | #.set r7,7 | |
231 | #.set r8,8 | |
232 | #.set r9,9 | |
233 | #.set r10,10 | |
234 | #.set r11,11 | |
235 | #.set r12,12 | |
236 | #.set r13,13 # not used, nor any other "below" it... | |
dd558806 AP |
237 | |
238 | # Declare function names to be global | |
239 | # NOTE: For gcc these names MUST be changed to remove | |
240 | # the first . i.e. for example change ".bn_sqr_comba4" | |
241 | # to "bn_sqr_comba4". This should be automatically done | |
242 | # in the build. | |
243 | ||
244 | .globl .bn_sqr_comba4 | |
245 | .globl .bn_sqr_comba8 | |
246 | .globl .bn_mul_comba4 | |
247 | .globl .bn_mul_comba8 | |
248 | .globl .bn_sub_words | |
249 | .globl .bn_add_words | |
250 | .globl .bn_div_words | |
251 | .globl .bn_sqr_words | |
252 | .globl .bn_mul_words | |
253 | .globl .bn_mul_add_words | |
254 | ||
255 | # .text section | |
256 | ||
492279f6 | 257 | .machine "any" |
dd558806 AP |
258 | |
259 | # | |
260 | # NOTE: The following label name should be changed to | |
261 | # "bn_sqr_comba4" i.e. remove the first dot | |
262 | # for the gcc compiler. This should be automatically | |
263 | # done in the build | |
264 | # | |
265 | ||
266 | .align 4 | |
267 | .bn_sqr_comba4: | |
268 | # | |
269 | # Optimized version of bn_sqr_comba4. | |
270 | # | |
271 | # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) | |
272 | # r3 contains r | |
273 | # r4 contains a | |
274 | # | |
275 | # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: | |
276 | # | |
277 | # r5,r6 are the two BN_ULONGs being multiplied. | |
278 | # r7,r8 are the results of the 32x32 giving 64 bit multiply. | |
279 | # r9,r10, r11 are the equivalents of c1,c2, c3. | |
280 | # Here's the assembly | |
281 | # | |
282 | # | |
283 | xor r0,r0,r0 # set r0 = 0. Used in the addze | |
284 | # instructions below | |
285 | ||
286 | #sqr_add_c(a,0,c1,c2,c3) | |
287 | $LD r5,`0*$BNSZ`(r4) | |
288 | $UMULL r9,r5,r5 | |
289 | $UMULH r10,r5,r5 #in first iteration. No need | |
290 | #to add since c1=c2=c3=0. | |
291 | # Note c3(r11) is NOT set to 0 | |
292 | # but will be. | |
293 | ||
294 | $ST r9,`0*$BNSZ`(r3) # r[0]=c1; | |
295 | # sqr_add_c2(a,1,0,c2,c3,c1); | |
296 | $LD r6,`1*$BNSZ`(r4) | |
297 | $UMULL r7,r5,r6 | |
298 | $UMULH r8,r5,r6 | |
299 | ||
300 | addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) | |
301 | adde r8,r8,r8 | |
302 | addze r9,r0 # catch carry if any. | |
303 | # r9= r0(=0) and carry | |
304 | ||
305 | addc r10,r7,r10 # now add to temp result. | |
306 | addze r11,r8 # r8 added to r11 which is 0 | |
307 | addze r9,r9 | |
308 | ||
309 | $ST r10,`1*$BNSZ`(r3) #r[1]=c2; | |
310 | #sqr_add_c(a,1,c3,c1,c2) | |
311 | $UMULL r7,r6,r6 | |
312 | $UMULH r8,r6,r6 | |
313 | addc r11,r7,r11 | |
314 | adde r9,r8,r9 | |
315 | addze r10,r0 | |
316 | #sqr_add_c2(a,2,0,c3,c1,c2) | |
317 | $LD r6,`2*$BNSZ`(r4) | |
318 | $UMULL r7,r5,r6 | |
319 | $UMULH r8,r5,r6 | |
320 | ||
321 | addc r7,r7,r7 | |
322 | adde r8,r8,r8 | |
323 | addze r10,r10 | |
324 | ||
325 | addc r11,r7,r11 | |
326 | adde r9,r8,r9 | |
327 | addze r10,r10 | |
328 | $ST r11,`2*$BNSZ`(r3) #r[2]=c3 | |
329 | #sqr_add_c2(a,3,0,c1,c2,c3); | |
330 | $LD r6,`3*$BNSZ`(r4) | |
331 | $UMULL r7,r5,r6 | |
332 | $UMULH r8,r5,r6 | |
333 | addc r7,r7,r7 | |
334 | adde r8,r8,r8 | |
335 | addze r11,r0 | |
336 | ||
337 | addc r9,r7,r9 | |
338 | adde r10,r8,r10 | |
339 | addze r11,r11 | |
340 | #sqr_add_c2(a,2,1,c1,c2,c3); | |
341 | $LD r5,`1*$BNSZ`(r4) | |
342 | $LD r6,`2*$BNSZ`(r4) | |
343 | $UMULL r7,r5,r6 | |
344 | $UMULH r8,r5,r6 | |
345 | ||
346 | addc r7,r7,r7 | |
347 | adde r8,r8,r8 | |
348 | addze r11,r11 | |
349 | addc r9,r7,r9 | |
350 | adde r10,r8,r10 | |
351 | addze r11,r11 | |
352 | $ST r9,`3*$BNSZ`(r3) #r[3]=c1 | |
353 | #sqr_add_c(a,2,c2,c3,c1); | |
354 | $UMULL r7,r6,r6 | |
355 | $UMULH r8,r6,r6 | |
356 | addc r10,r7,r10 | |
357 | adde r11,r8,r11 | |
358 | addze r9,r0 | |
359 | #sqr_add_c2(a,3,1,c2,c3,c1); | |
360 | $LD r6,`3*$BNSZ`(r4) | |
361 | $UMULL r7,r5,r6 | |
362 | $UMULH r8,r5,r6 | |
363 | addc r7,r7,r7 | |
364 | adde r8,r8,r8 | |
365 | addze r9,r9 | |
366 | ||
367 | addc r10,r7,r10 | |
368 | adde r11,r8,r11 | |
369 | addze r9,r9 | |
370 | $ST r10,`4*$BNSZ`(r3) #r[4]=c2 | |
371 | #sqr_add_c2(a,3,2,c3,c1,c2); | |
372 | $LD r5,`2*$BNSZ`(r4) | |
373 | $UMULL r7,r5,r6 | |
374 | $UMULH r8,r5,r6 | |
375 | addc r7,r7,r7 | |
376 | adde r8,r8,r8 | |
377 | addze r10,r0 | |
378 | ||
379 | addc r11,r7,r11 | |
380 | adde r9,r8,r9 | |
381 | addze r10,r10 | |
382 | $ST r11,`5*$BNSZ`(r3) #r[5] = c3 | |
383 | #sqr_add_c(a,3,c1,c2,c3); | |
384 | $UMULL r7,r6,r6 | |
385 | $UMULH r8,r6,r6 | |
386 | addc r9,r7,r9 | |
387 | adde r10,r8,r10 | |
388 | ||
389 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1 | |
390 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2 | |
31439046 | 391 | blr |
dd558806 AP |
392 | .long 0x00000000 |
393 | ||
394 | # | |
395 | # NOTE: The following label name should be changed to | |
396 | # "bn_sqr_comba8" i.e. remove the first dot | |
397 | # for the gcc compiler. This should be automatically | |
398 | # done in the build | |
399 | # | |
400 | ||
401 | .align 4 | |
402 | .bn_sqr_comba8: | |
403 | # | |
404 | # This is an optimized version of the bn_sqr_comba8 routine. | |
405 | # Tightly uses the adde instruction | |
406 | # | |
407 | # | |
408 | # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) | |
409 | # r3 contains r | |
410 | # r4 contains a | |
411 | # | |
412 | # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: | |
413 | # | |
414 | # r5,r6 are the two BN_ULONGs being multiplied. | |
415 | # r7,r8 are the results of the 32x32 giving 64 bit multiply. | |
416 | # r9,r10, r11 are the equivalents of c1,c2, c3. | |
417 | # | |
418 | # Possible optimization of loading all 8 longs of a into registers | |
419 | # doesnt provide any speedup | |
420 | # | |
421 | ||
422 | xor r0,r0,r0 #set r0 = 0.Used in addze | |
423 | #instructions below. | |
424 | ||
425 | #sqr_add_c(a,0,c1,c2,c3); | |
426 | $LD r5,`0*$BNSZ`(r4) | |
427 | $UMULL r9,r5,r5 #1st iteration: no carries. | |
428 | $UMULH r10,r5,r5 | |
429 | $ST r9,`0*$BNSZ`(r3) # r[0]=c1; | |
430 | #sqr_add_c2(a,1,0,c2,c3,c1); | |
431 | $LD r6,`1*$BNSZ`(r4) | |
432 | $UMULL r7,r5,r6 | |
433 | $UMULH r8,r5,r6 | |
434 | ||
435 | addc r10,r7,r10 #add the two register number | |
436 | adde r11,r8,r0 # (r8,r7) to the three register | |
437 | addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 | |
438 | ||
439 | addc r10,r7,r10 #add the two register number | |
440 | adde r11,r8,r11 # (r8,r7) to the three register | |
441 | addze r9,r9 # number (r9,r11,r10). | |
442 | ||
443 | $ST r10,`1*$BNSZ`(r3) # r[1]=c2 | |
444 | ||
445 | #sqr_add_c(a,1,c3,c1,c2); | |
446 | $UMULL r7,r6,r6 | |
447 | $UMULH r8,r6,r6 | |
448 | addc r11,r7,r11 | |
449 | adde r9,r8,r9 | |
450 | addze r10,r0 | |
451 | #sqr_add_c2(a,2,0,c3,c1,c2); | |
452 | $LD r6,`2*$BNSZ`(r4) | |
453 | $UMULL r7,r5,r6 | |
454 | $UMULH r8,r5,r6 | |
455 | ||
456 | addc r11,r7,r11 | |
457 | adde r9,r8,r9 | |
458 | addze r10,r10 | |
459 | ||
460 | addc r11,r7,r11 | |
461 | adde r9,r8,r9 | |
462 | addze r10,r10 | |
463 | ||
464 | $ST r11,`2*$BNSZ`(r3) #r[2]=c3 | |
465 | #sqr_add_c2(a,3,0,c1,c2,c3); | |
466 | $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. | |
467 | $UMULL r7,r5,r6 | |
468 | $UMULH r8,r5,r6 | |
469 | ||
470 | addc r9,r7,r9 | |
471 | adde r10,r8,r10 | |
472 | addze r11,r0 | |
473 | ||
474 | addc r9,r7,r9 | |
475 | adde r10,r8,r10 | |
476 | addze r11,r11 | |
477 | #sqr_add_c2(a,2,1,c1,c2,c3); | |
478 | $LD r5,`1*$BNSZ`(r4) | |
479 | $LD r6,`2*$BNSZ`(r4) | |
480 | $UMULL r7,r5,r6 | |
481 | $UMULH r8,r5,r6 | |
482 | ||
483 | addc r9,r7,r9 | |
484 | adde r10,r8,r10 | |
485 | addze r11,r11 | |
486 | ||
487 | addc r9,r7,r9 | |
488 | adde r10,r8,r10 | |
489 | addze r11,r11 | |
490 | ||
491 | $ST r9,`3*$BNSZ`(r3) #r[3]=c1; | |
492 | #sqr_add_c(a,2,c2,c3,c1); | |
493 | $UMULL r7,r6,r6 | |
494 | $UMULH r8,r6,r6 | |
495 | ||
496 | addc r10,r7,r10 | |
497 | adde r11,r8,r11 | |
498 | addze r9,r0 | |
499 | #sqr_add_c2(a,3,1,c2,c3,c1); | |
500 | $LD r6,`3*$BNSZ`(r4) | |
501 | $UMULL r7,r5,r6 | |
502 | $UMULH r8,r5,r6 | |
503 | ||
504 | addc r10,r7,r10 | |
505 | adde r11,r8,r11 | |
506 | addze r9,r9 | |
507 | ||
508 | addc r10,r7,r10 | |
509 | adde r11,r8,r11 | |
510 | addze r9,r9 | |
511 | #sqr_add_c2(a,4,0,c2,c3,c1); | |
512 | $LD r5,`0*$BNSZ`(r4) | |
513 | $LD r6,`4*$BNSZ`(r4) | |
514 | $UMULL r7,r5,r6 | |
515 | $UMULH r8,r5,r6 | |
516 | ||
517 | addc r10,r7,r10 | |
518 | adde r11,r8,r11 | |
519 | addze r9,r9 | |
520 | ||
521 | addc r10,r7,r10 | |
522 | adde r11,r8,r11 | |
523 | addze r9,r9 | |
524 | $ST r10,`4*$BNSZ`(r3) #r[4]=c2; | |
525 | #sqr_add_c2(a,5,0,c3,c1,c2); | |
526 | $LD r6,`5*$BNSZ`(r4) | |
527 | $UMULL r7,r5,r6 | |
528 | $UMULH r8,r5,r6 | |
529 | ||
530 | addc r11,r7,r11 | |
531 | adde r9,r8,r9 | |
532 | addze r10,r0 | |
533 | ||
534 | addc r11,r7,r11 | |
535 | adde r9,r8,r9 | |
536 | addze r10,r10 | |
537 | #sqr_add_c2(a,4,1,c3,c1,c2); | |
538 | $LD r5,`1*$BNSZ`(r4) | |
539 | $LD r6,`4*$BNSZ`(r4) | |
540 | $UMULL r7,r5,r6 | |
541 | $UMULH r8,r5,r6 | |
542 | ||
543 | addc r11,r7,r11 | |
544 | adde r9,r8,r9 | |
545 | addze r10,r10 | |
546 | ||
547 | addc r11,r7,r11 | |
548 | adde r9,r8,r9 | |
549 | addze r10,r10 | |
550 | #sqr_add_c2(a,3,2,c3,c1,c2); | |
551 | $LD r5,`2*$BNSZ`(r4) | |
552 | $LD r6,`3*$BNSZ`(r4) | |
553 | $UMULL r7,r5,r6 | |
554 | $UMULH r8,r5,r6 | |
555 | ||
556 | addc r11,r7,r11 | |
557 | adde r9,r8,r9 | |
558 | addze r10,r10 | |
559 | ||
560 | addc r11,r7,r11 | |
561 | adde r9,r8,r9 | |
562 | addze r10,r10 | |
563 | $ST r11,`5*$BNSZ`(r3) #r[5]=c3; | |
564 | #sqr_add_c(a,3,c1,c2,c3); | |
565 | $UMULL r7,r6,r6 | |
566 | $UMULH r8,r6,r6 | |
567 | addc r9,r7,r9 | |
568 | adde r10,r8,r10 | |
569 | addze r11,r0 | |
570 | #sqr_add_c2(a,4,2,c1,c2,c3); | |
571 | $LD r6,`4*$BNSZ`(r4) | |
572 | $UMULL r7,r5,r6 | |
573 | $UMULH r8,r5,r6 | |
574 | ||
575 | addc r9,r7,r9 | |
576 | adde r10,r8,r10 | |
577 | addze r11,r11 | |
578 | ||
579 | addc r9,r7,r9 | |
580 | adde r10,r8,r10 | |
581 | addze r11,r11 | |
582 | #sqr_add_c2(a,5,1,c1,c2,c3); | |
583 | $LD r5,`1*$BNSZ`(r4) | |
584 | $LD r6,`5*$BNSZ`(r4) | |
585 | $UMULL r7,r5,r6 | |
586 | $UMULH r8,r5,r6 | |
587 | ||
588 | addc r9,r7,r9 | |
589 | adde r10,r8,r10 | |
590 | addze r11,r11 | |
591 | ||
592 | addc r9,r7,r9 | |
593 | adde r10,r8,r10 | |
594 | addze r11,r11 | |
595 | #sqr_add_c2(a,6,0,c1,c2,c3); | |
596 | $LD r5,`0*$BNSZ`(r4) | |
597 | $LD r6,`6*$BNSZ`(r4) | |
598 | $UMULL r7,r5,r6 | |
599 | $UMULH r8,r5,r6 | |
600 | addc r9,r7,r9 | |
601 | adde r10,r8,r10 | |
602 | addze r11,r11 | |
603 | addc r9,r7,r9 | |
604 | adde r10,r8,r10 | |
605 | addze r11,r11 | |
606 | $ST r9,`6*$BNSZ`(r3) #r[6]=c1; | |
607 | #sqr_add_c2(a,7,0,c2,c3,c1); | |
608 | $LD r6,`7*$BNSZ`(r4) | |
609 | $UMULL r7,r5,r6 | |
610 | $UMULH r8,r5,r6 | |
611 | ||
612 | addc r10,r7,r10 | |
613 | adde r11,r8,r11 | |
614 | addze r9,r0 | |
615 | addc r10,r7,r10 | |
616 | adde r11,r8,r11 | |
617 | addze r9,r9 | |
618 | #sqr_add_c2(a,6,1,c2,c3,c1); | |
619 | $LD r5,`1*$BNSZ`(r4) | |
620 | $LD r6,`6*$BNSZ`(r4) | |
621 | $UMULL r7,r5,r6 | |
622 | $UMULH r8,r5,r6 | |
623 | ||
624 | addc r10,r7,r10 | |
625 | adde r11,r8,r11 | |
626 | addze r9,r9 | |
627 | addc r10,r7,r10 | |
628 | adde r11,r8,r11 | |
629 | addze r9,r9 | |
630 | #sqr_add_c2(a,5,2,c2,c3,c1); | |
631 | $LD r5,`2*$BNSZ`(r4) | |
632 | $LD r6,`5*$BNSZ`(r4) | |
633 | $UMULL r7,r5,r6 | |
634 | $UMULH r8,r5,r6 | |
635 | addc r10,r7,r10 | |
636 | adde r11,r8,r11 | |
637 | addze r9,r9 | |
638 | addc r10,r7,r10 | |
639 | adde r11,r8,r11 | |
640 | addze r9,r9 | |
641 | #sqr_add_c2(a,4,3,c2,c3,c1); | |
642 | $LD r5,`3*$BNSZ`(r4) | |
643 | $LD r6,`4*$BNSZ`(r4) | |
644 | $UMULL r7,r5,r6 | |
645 | $UMULH r8,r5,r6 | |
646 | ||
647 | addc r10,r7,r10 | |
648 | adde r11,r8,r11 | |
649 | addze r9,r9 | |
650 | addc r10,r7,r10 | |
651 | adde r11,r8,r11 | |
652 | addze r9,r9 | |
653 | $ST r10,`7*$BNSZ`(r3) #r[7]=c2; | |
654 | #sqr_add_c(a,4,c3,c1,c2); | |
655 | $UMULL r7,r6,r6 | |
656 | $UMULH r8,r6,r6 | |
657 | addc r11,r7,r11 | |
658 | adde r9,r8,r9 | |
659 | addze r10,r0 | |
660 | #sqr_add_c2(a,5,3,c3,c1,c2); | |
661 | $LD r6,`5*$BNSZ`(r4) | |
662 | $UMULL r7,r5,r6 | |
663 | $UMULH r8,r5,r6 | |
664 | addc r11,r7,r11 | |
665 | adde r9,r8,r9 | |
666 | addze r10,r10 | |
667 | addc r11,r7,r11 | |
668 | adde r9,r8,r9 | |
669 | addze r10,r10 | |
670 | #sqr_add_c2(a,6,2,c3,c1,c2); | |
671 | $LD r5,`2*$BNSZ`(r4) | |
672 | $LD r6,`6*$BNSZ`(r4) | |
673 | $UMULL r7,r5,r6 | |
674 | $UMULH r8,r5,r6 | |
675 | addc r11,r7,r11 | |
676 | adde r9,r8,r9 | |
677 | addze r10,r10 | |
678 | ||
679 | addc r11,r7,r11 | |
680 | adde r9,r8,r9 | |
681 | addze r10,r10 | |
682 | #sqr_add_c2(a,7,1,c3,c1,c2); | |
683 | $LD r5,`1*$BNSZ`(r4) | |
684 | $LD r6,`7*$BNSZ`(r4) | |
685 | $UMULL r7,r5,r6 | |
686 | $UMULH r8,r5,r6 | |
687 | addc r11,r7,r11 | |
688 | adde r9,r8,r9 | |
689 | addze r10,r10 | |
690 | addc r11,r7,r11 | |
691 | adde r9,r8,r9 | |
692 | addze r10,r10 | |
693 | $ST r11,`8*$BNSZ`(r3) #r[8]=c3; | |
694 | #sqr_add_c2(a,7,2,c1,c2,c3); | |
695 | $LD r5,`2*$BNSZ`(r4) | |
696 | $UMULL r7,r5,r6 | |
697 | $UMULH r8,r5,r6 | |
698 | ||
699 | addc r9,r7,r9 | |
700 | adde r10,r8,r10 | |
701 | addze r11,r0 | |
702 | addc r9,r7,r9 | |
703 | adde r10,r8,r10 | |
704 | addze r11,r11 | |
705 | #sqr_add_c2(a,6,3,c1,c2,c3); | |
706 | $LD r5,`3*$BNSZ`(r4) | |
707 | $LD r6,`6*$BNSZ`(r4) | |
708 | $UMULL r7,r5,r6 | |
709 | $UMULH r8,r5,r6 | |
710 | addc r9,r7,r9 | |
711 | adde r10,r8,r10 | |
712 | addze r11,r11 | |
713 | addc r9,r7,r9 | |
714 | adde r10,r8,r10 | |
715 | addze r11,r11 | |
716 | #sqr_add_c2(a,5,4,c1,c2,c3); | |
717 | $LD r5,`4*$BNSZ`(r4) | |
718 | $LD r6,`5*$BNSZ`(r4) | |
719 | $UMULL r7,r5,r6 | |
720 | $UMULH r8,r5,r6 | |
721 | addc r9,r7,r9 | |
722 | adde r10,r8,r10 | |
723 | addze r11,r11 | |
724 | addc r9,r7,r9 | |
725 | adde r10,r8,r10 | |
726 | addze r11,r11 | |
727 | $ST r9,`9*$BNSZ`(r3) #r[9]=c1; | |
728 | #sqr_add_c(a,5,c2,c3,c1); | |
729 | $UMULL r7,r6,r6 | |
730 | $UMULH r8,r6,r6 | |
731 | addc r10,r7,r10 | |
732 | adde r11,r8,r11 | |
733 | addze r9,r0 | |
734 | #sqr_add_c2(a,6,4,c2,c3,c1); | |
735 | $LD r6,`6*$BNSZ`(r4) | |
736 | $UMULL r7,r5,r6 | |
737 | $UMULH r8,r5,r6 | |
738 | addc r10,r7,r10 | |
739 | adde r11,r8,r11 | |
740 | addze r9,r9 | |
741 | addc r10,r7,r10 | |
742 | adde r11,r8,r11 | |
743 | addze r9,r9 | |
744 | #sqr_add_c2(a,7,3,c2,c3,c1); | |
745 | $LD r5,`3*$BNSZ`(r4) | |
746 | $LD r6,`7*$BNSZ`(r4) | |
747 | $UMULL r7,r5,r6 | |
748 | $UMULH r8,r5,r6 | |
749 | addc r10,r7,r10 | |
750 | adde r11,r8,r11 | |
751 | addze r9,r9 | |
752 | addc r10,r7,r10 | |
753 | adde r11,r8,r11 | |
754 | addze r9,r9 | |
755 | $ST r10,`10*$BNSZ`(r3) #r[10]=c2; | |
756 | #sqr_add_c2(a,7,4,c3,c1,c2); | |
757 | $LD r5,`4*$BNSZ`(r4) | |
758 | $UMULL r7,r5,r6 | |
759 | $UMULH r8,r5,r6 | |
760 | addc r11,r7,r11 | |
761 | adde r9,r8,r9 | |
762 | addze r10,r0 | |
763 | addc r11,r7,r11 | |
764 | adde r9,r8,r9 | |
765 | addze r10,r10 | |
766 | #sqr_add_c2(a,6,5,c3,c1,c2); | |
767 | $LD r5,`5*$BNSZ`(r4) | |
768 | $LD r6,`6*$BNSZ`(r4) | |
769 | $UMULL r7,r5,r6 | |
770 | $UMULH r8,r5,r6 | |
771 | addc r11,r7,r11 | |
772 | adde r9,r8,r9 | |
773 | addze r10,r10 | |
774 | addc r11,r7,r11 | |
775 | adde r9,r8,r9 | |
776 | addze r10,r10 | |
777 | $ST r11,`11*$BNSZ`(r3) #r[11]=c3; | |
778 | #sqr_add_c(a,6,c1,c2,c3); | |
779 | $UMULL r7,r6,r6 | |
780 | $UMULH r8,r6,r6 | |
781 | addc r9,r7,r9 | |
782 | adde r10,r8,r10 | |
783 | addze r11,r0 | |
784 | #sqr_add_c2(a,7,5,c1,c2,c3) | |
785 | $LD r6,`7*$BNSZ`(r4) | |
786 | $UMULL r7,r5,r6 | |
787 | $UMULH r8,r5,r6 | |
788 | addc r9,r7,r9 | |
789 | adde r10,r8,r10 | |
790 | addze r11,r11 | |
791 | addc r9,r7,r9 | |
792 | adde r10,r8,r10 | |
793 | addze r11,r11 | |
794 | $ST r9,`12*$BNSZ`(r3) #r[12]=c1; | |
795 | ||
796 | #sqr_add_c2(a,7,6,c2,c3,c1) | |
797 | $LD r5,`6*$BNSZ`(r4) | |
798 | $UMULL r7,r5,r6 | |
799 | $UMULH r8,r5,r6 | |
800 | addc r10,r7,r10 | |
801 | adde r11,r8,r11 | |
802 | addze r9,r0 | |
803 | addc r10,r7,r10 | |
804 | adde r11,r8,r11 | |
805 | addze r9,r9 | |
806 | $ST r10,`13*$BNSZ`(r3) #r[13]=c2; | |
807 | #sqr_add_c(a,7,c3,c1,c2); | |
808 | $UMULL r7,r6,r6 | |
809 | $UMULH r8,r6,r6 | |
810 | addc r11,r7,r11 | |
811 | adde r9,r8,r9 | |
812 | $ST r11,`14*$BNSZ`(r3) #r[14]=c3; | |
813 | $ST r9, `15*$BNSZ`(r3) #r[15]=c1; | |
814 | ||
815 | ||
31439046 | 816 | blr |
dd558806 AP |
817 | |
818 | .long 0x00000000 | |
819 | ||
820 | # | |
821 | # NOTE: The following label name should be changed to | |
822 | # "bn_mul_comba4" i.e. remove the first dot | |
823 | # for the gcc compiler. This should be automatically | |
824 | # done in the build | |
825 | # | |
826 | ||
827 | .align 4 | |
828 | .bn_mul_comba4: | |
829 | # | |
830 | # This is an optimized version of the bn_mul_comba4 routine. | |
831 | # | |
832 | # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
833 | # r3 contains r | |
834 | # r4 contains a | |
835 | # r5 contains b | |
836 | # r6, r7 are the 2 BN_ULONGs being multiplied. | |
837 | # r8, r9 are the results of the 32x32 giving 64 multiply. | |
838 | # r10, r11, r12 are the equivalents of c1, c2, and c3. | |
839 | # | |
840 | xor r0,r0,r0 #r0=0. Used in addze below. | |
841 | #mul_add_c(a[0],b[0],c1,c2,c3); | |
842 | $LD r6,`0*$BNSZ`(r4) | |
843 | $LD r7,`0*$BNSZ`(r5) | |
844 | $UMULL r10,r6,r7 | |
845 | $UMULH r11,r6,r7 | |
846 | $ST r10,`0*$BNSZ`(r3) #r[0]=c1 | |
847 | #mul_add_c(a[0],b[1],c2,c3,c1); | |
848 | $LD r7,`1*$BNSZ`(r5) | |
849 | $UMULL r8,r6,r7 | |
850 | $UMULH r9,r6,r7 | |
851 | addc r11,r8,r11 | |
852 | adde r12,r9,r0 | |
853 | addze r10,r0 | |
854 | #mul_add_c(a[1],b[0],c2,c3,c1); | |
855 | $LD r6, `1*$BNSZ`(r4) | |
856 | $LD r7, `0*$BNSZ`(r5) | |
857 | $UMULL r8,r6,r7 | |
858 | $UMULH r9,r6,r7 | |
859 | addc r11,r8,r11 | |
860 | adde r12,r9,r12 | |
861 | addze r10,r10 | |
862 | $ST r11,`1*$BNSZ`(r3) #r[1]=c2 | |
863 | #mul_add_c(a[2],b[0],c3,c1,c2); | |
864 | $LD r6,`2*$BNSZ`(r4) | |
865 | $UMULL r8,r6,r7 | |
866 | $UMULH r9,r6,r7 | |
867 | addc r12,r8,r12 | |
868 | adde r10,r9,r10 | |
869 | addze r11,r0 | |
870 | #mul_add_c(a[1],b[1],c3,c1,c2); | |
871 | $LD r6,`1*$BNSZ`(r4) | |
872 | $LD r7,`1*$BNSZ`(r5) | |
873 | $UMULL r8,r6,r7 | |
874 | $UMULH r9,r6,r7 | |
875 | addc r12,r8,r12 | |
876 | adde r10,r9,r10 | |
877 | addze r11,r11 | |
878 | #mul_add_c(a[0],b[2],c3,c1,c2); | |
879 | $LD r6,`0*$BNSZ`(r4) | |
880 | $LD r7,`2*$BNSZ`(r5) | |
881 | $UMULL r8,r6,r7 | |
882 | $UMULH r9,r6,r7 | |
883 | addc r12,r8,r12 | |
884 | adde r10,r9,r10 | |
885 | addze r11,r11 | |
886 | $ST r12,`2*$BNSZ`(r3) #r[2]=c3 | |
887 | #mul_add_c(a[0],b[3],c1,c2,c3); | |
888 | $LD r7,`3*$BNSZ`(r5) | |
889 | $UMULL r8,r6,r7 | |
890 | $UMULH r9,r6,r7 | |
891 | addc r10,r8,r10 | |
892 | adde r11,r9,r11 | |
893 | addze r12,r0 | |
894 | #mul_add_c(a[1],b[2],c1,c2,c3); | |
895 | $LD r6,`1*$BNSZ`(r4) | |
896 | $LD r7,`2*$BNSZ`(r5) | |
897 | $UMULL r8,r6,r7 | |
898 | $UMULH r9,r6,r7 | |
899 | addc r10,r8,r10 | |
900 | adde r11,r9,r11 | |
901 | addze r12,r12 | |
902 | #mul_add_c(a[2],b[1],c1,c2,c3); | |
903 | $LD r6,`2*$BNSZ`(r4) | |
904 | $LD r7,`1*$BNSZ`(r5) | |
905 | $UMULL r8,r6,r7 | |
906 | $UMULH r9,r6,r7 | |
907 | addc r10,r8,r10 | |
908 | adde r11,r9,r11 | |
909 | addze r12,r12 | |
910 | #mul_add_c(a[3],b[0],c1,c2,c3); | |
911 | $LD r6,`3*$BNSZ`(r4) | |
912 | $LD r7,`0*$BNSZ`(r5) | |
913 | $UMULL r8,r6,r7 | |
914 | $UMULH r9,r6,r7 | |
915 | addc r10,r8,r10 | |
916 | adde r11,r9,r11 | |
917 | addze r12,r12 | |
918 | $ST r10,`3*$BNSZ`(r3) #r[3]=c1 | |
919 | #mul_add_c(a[3],b[1],c2,c3,c1); | |
920 | $LD r7,`1*$BNSZ`(r5) | |
921 | $UMULL r8,r6,r7 | |
922 | $UMULH r9,r6,r7 | |
923 | addc r11,r8,r11 | |
924 | adde r12,r9,r12 | |
925 | addze r10,r0 | |
926 | #mul_add_c(a[2],b[2],c2,c3,c1); | |
927 | $LD r6,`2*$BNSZ`(r4) | |
928 | $LD r7,`2*$BNSZ`(r5) | |
929 | $UMULL r8,r6,r7 | |
930 | $UMULH r9,r6,r7 | |
931 | addc r11,r8,r11 | |
932 | adde r12,r9,r12 | |
933 | addze r10,r10 | |
934 | #mul_add_c(a[1],b[3],c2,c3,c1); | |
935 | $LD r6,`1*$BNSZ`(r4) | |
936 | $LD r7,`3*$BNSZ`(r5) | |
937 | $UMULL r8,r6,r7 | |
938 | $UMULH r9,r6,r7 | |
939 | addc r11,r8,r11 | |
940 | adde r12,r9,r12 | |
941 | addze r10,r10 | |
942 | $ST r11,`4*$BNSZ`(r3) #r[4]=c2 | |
943 | #mul_add_c(a[2],b[3],c3,c1,c2); | |
944 | $LD r6,`2*$BNSZ`(r4) | |
945 | $UMULL r8,r6,r7 | |
946 | $UMULH r9,r6,r7 | |
947 | addc r12,r8,r12 | |
948 | adde r10,r9,r10 | |
949 | addze r11,r0 | |
950 | #mul_add_c(a[3],b[2],c3,c1,c2); | |
951 | $LD r6,`3*$BNSZ`(r4) | |
952 | $LD r7,`2*$BNSZ`(r4) | |
953 | $UMULL r8,r6,r7 | |
954 | $UMULH r9,r6,r7 | |
955 | addc r12,r8,r12 | |
956 | adde r10,r9,r10 | |
957 | addze r11,r11 | |
958 | $ST r12,`5*$BNSZ`(r3) #r[5]=c3 | |
959 | #mul_add_c(a[3],b[3],c1,c2,c3); | |
960 | $LD r7,`3*$BNSZ`(r5) | |
961 | $UMULL r8,r6,r7 | |
962 | $UMULH r9,r6,r7 | |
963 | addc r10,r8,r10 | |
964 | adde r11,r9,r11 | |
965 | ||
966 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1 | |
967 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2 | |
31439046 | 968 | blr |
dd558806 AP |
969 | .long 0x00000000 |
970 | ||
971 | # | |
972 | # NOTE: The following label name should be changed to | |
973 | # "bn_mul_comba8" i.e. remove the first dot | |
974 | # for the gcc compiler. This should be automatically | |
975 | # done in the build | |
976 | # | |
977 | ||
978 | .align 4 | |
979 | .bn_mul_comba8: | |
980 | # | |
981 | # Optimized version of the bn_mul_comba8 routine. | |
982 | # | |
983 | # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | |
984 | # r3 contains r | |
985 | # r4 contains a | |
986 | # r5 contains b | |
987 | # r6, r7 are the 2 BN_ULONGs being multiplied. | |
988 | # r8, r9 are the results of the 32x32 giving 64 multiply. | |
989 | # r10, r11, r12 are the equivalents of c1, c2, and c3. | |
990 | # | |
991 | xor r0,r0,r0 #r0=0. Used in addze below. | |
992 | ||
993 | #mul_add_c(a[0],b[0],c1,c2,c3); | |
994 | $LD r6,`0*$BNSZ`(r4) #a[0] | |
995 | $LD r7,`0*$BNSZ`(r5) #b[0] | |
996 | $UMULL r10,r6,r7 | |
997 | $UMULH r11,r6,r7 | |
998 | $ST r10,`0*$BNSZ`(r3) #r[0]=c1; | |
999 | #mul_add_c(a[0],b[1],c2,c3,c1); | |
1000 | $LD r7,`1*$BNSZ`(r5) | |
1001 | $UMULL r8,r6,r7 | |
1002 | $UMULH r9,r6,r7 | |
1003 | addc r11,r11,r8 | |
1004 | addze r12,r9 # since we didnt set r12 to zero before. | |
1005 | addze r10,r0 | |
1006 | #mul_add_c(a[1],b[0],c2,c3,c1); | |
1007 | $LD r6,`1*$BNSZ`(r4) | |
1008 | $LD r7,`0*$BNSZ`(r5) | |
1009 | $UMULL r8,r6,r7 | |
1010 | $UMULH r9,r6,r7 | |
1011 | addc r11,r11,r8 | |
1012 | adde r12,r12,r9 | |
1013 | addze r10,r10 | |
1014 | $ST r11,`1*$BNSZ`(r3) #r[1]=c2; | |
1015 | #mul_add_c(a[2],b[0],c3,c1,c2); | |
1016 | $LD r6,`2*$BNSZ`(r4) | |
1017 | $UMULL r8,r6,r7 | |
1018 | $UMULH r9,r6,r7 | |
1019 | addc r12,r12,r8 | |
1020 | adde r10,r10,r9 | |
1021 | addze r11,r0 | |
1022 | #mul_add_c(a[1],b[1],c3,c1,c2); | |
1023 | $LD r6,`1*$BNSZ`(r4) | |
1024 | $LD r7,`1*$BNSZ`(r5) | |
1025 | $UMULL r8,r6,r7 | |
1026 | $UMULH r9,r6,r7 | |
1027 | addc r12,r12,r8 | |
1028 | adde r10,r10,r9 | |
1029 | addze r11,r11 | |
1030 | #mul_add_c(a[0],b[2],c3,c1,c2); | |
1031 | $LD r6,`0*$BNSZ`(r4) | |
1032 | $LD r7,`2*$BNSZ`(r5) | |
1033 | $UMULL r8,r6,r7 | |
1034 | $UMULH r9,r6,r7 | |
1035 | addc r12,r12,r8 | |
1036 | adde r10,r10,r9 | |
1037 | addze r11,r11 | |
1038 | $ST r12,`2*$BNSZ`(r3) #r[2]=c3; | |
1039 | #mul_add_c(a[0],b[3],c1,c2,c3); | |
1040 | $LD r7,`3*$BNSZ`(r5) | |
1041 | $UMULL r8,r6,r7 | |
1042 | $UMULH r9,r6,r7 | |
1043 | addc r10,r10,r8 | |
1044 | adde r11,r11,r9 | |
1045 | addze r12,r0 | |
1046 | #mul_add_c(a[1],b[2],c1,c2,c3); | |
1047 | $LD r6,`1*$BNSZ`(r4) | |
1048 | $LD r7,`2*$BNSZ`(r5) | |
1049 | $UMULL r8,r6,r7 | |
1050 | $UMULH r9,r6,r7 | |
1051 | addc r10,r10,r8 | |
1052 | adde r11,r11,r9 | |
1053 | addze r12,r12 | |
1054 | ||
1055 | #mul_add_c(a[2],b[1],c1,c2,c3); | |
1056 | $LD r6,`2*$BNSZ`(r4) | |
1057 | $LD r7,`1*$BNSZ`(r5) | |
1058 | $UMULL r8,r6,r7 | |
1059 | $UMULH r9,r6,r7 | |
1060 | addc r10,r10,r8 | |
1061 | adde r11,r11,r9 | |
1062 | addze r12,r12 | |
1063 | #mul_add_c(a[3],b[0],c1,c2,c3); | |
1064 | $LD r6,`3*$BNSZ`(r4) | |
1065 | $LD r7,`0*$BNSZ`(r5) | |
1066 | $UMULL r8,r6,r7 | |
1067 | $UMULH r9,r6,r7 | |
1068 | addc r10,r10,r8 | |
1069 | adde r11,r11,r9 | |
1070 | addze r12,r12 | |
1071 | $ST r10,`3*$BNSZ`(r3) #r[3]=c1; | |
1072 | #mul_add_c(a[4],b[0],c2,c3,c1); | |
1073 | $LD r6,`4*$BNSZ`(r4) | |
1074 | $UMULL r8,r6,r7 | |
1075 | $UMULH r9,r6,r7 | |
1076 | addc r11,r11,r8 | |
1077 | adde r12,r12,r9 | |
1078 | addze r10,r0 | |
1079 | #mul_add_c(a[3],b[1],c2,c3,c1); | |
1080 | $LD r6,`3*$BNSZ`(r4) | |
1081 | $LD r7,`1*$BNSZ`(r5) | |
1082 | $UMULL r8,r6,r7 | |
1083 | $UMULH r9,r6,r7 | |
1084 | addc r11,r11,r8 | |
1085 | adde r12,r12,r9 | |
1086 | addze r10,r10 | |
1087 | #mul_add_c(a[2],b[2],c2,c3,c1); | |
1088 | $LD r6,`2*$BNSZ`(r4) | |
1089 | $LD r7,`2*$BNSZ`(r5) | |
1090 | $UMULL r8,r6,r7 | |
1091 | $UMULH r9,r6,r7 | |
1092 | addc r11,r11,r8 | |
1093 | adde r12,r12,r9 | |
1094 | addze r10,r10 | |
1095 | #mul_add_c(a[1],b[3],c2,c3,c1); | |
1096 | $LD r6,`1*$BNSZ`(r4) | |
1097 | $LD r7,`3*$BNSZ`(r5) | |
1098 | $UMULL r8,r6,r7 | |
1099 | $UMULH r9,r6,r7 | |
1100 | addc r11,r11,r8 | |
1101 | adde r12,r12,r9 | |
1102 | addze r10,r10 | |
1103 | #mul_add_c(a[0],b[4],c2,c3,c1); | |
1104 | $LD r6,`0*$BNSZ`(r4) | |
1105 | $LD r7,`4*$BNSZ`(r5) | |
1106 | $UMULL r8,r6,r7 | |
1107 | $UMULH r9,r6,r7 | |
1108 | addc r11,r11,r8 | |
1109 | adde r12,r12,r9 | |
1110 | addze r10,r10 | |
1111 | $ST r11,`4*$BNSZ`(r3) #r[4]=c2; | |
1112 | #mul_add_c(a[0],b[5],c3,c1,c2); | |
1113 | $LD r7,`5*$BNSZ`(r5) | |
1114 | $UMULL r8,r6,r7 | |
1115 | $UMULH r9,r6,r7 | |
1116 | addc r12,r12,r8 | |
1117 | adde r10,r10,r9 | |
1118 | addze r11,r0 | |
1119 | #mul_add_c(a[1],b[4],c3,c1,c2); | |
1120 | $LD r6,`1*$BNSZ`(r4) | |
1121 | $LD r7,`4*$BNSZ`(r5) | |
1122 | $UMULL r8,r6,r7 | |
1123 | $UMULH r9,r6,r7 | |
1124 | addc r12,r12,r8 | |
1125 | adde r10,r10,r9 | |
1126 | addze r11,r11 | |
1127 | #mul_add_c(a[2],b[3],c3,c1,c2); | |
1128 | $LD r6,`2*$BNSZ`(r4) | |
1129 | $LD r7,`3*$BNSZ`(r5) | |
1130 | $UMULL r8,r6,r7 | |
1131 | $UMULH r9,r6,r7 | |
1132 | addc r12,r12,r8 | |
1133 | adde r10,r10,r9 | |
1134 | addze r11,r11 | |
1135 | #mul_add_c(a[3],b[2],c3,c1,c2); | |
1136 | $LD r6,`3*$BNSZ`(r4) | |
1137 | $LD r7,`2*$BNSZ`(r5) | |
1138 | $UMULL r8,r6,r7 | |
1139 | $UMULH r9,r6,r7 | |
1140 | addc r12,r12,r8 | |
1141 | adde r10,r10,r9 | |
1142 | addze r11,r11 | |
1143 | #mul_add_c(a[4],b[1],c3,c1,c2); | |
1144 | $LD r6,`4*$BNSZ`(r4) | |
1145 | $LD r7,`1*$BNSZ`(r5) | |
1146 | $UMULL r8,r6,r7 | |
1147 | $UMULH r9,r6,r7 | |
1148 | addc r12,r12,r8 | |
1149 | adde r10,r10,r9 | |
1150 | addze r11,r11 | |
1151 | #mul_add_c(a[5],b[0],c3,c1,c2); | |
1152 | $LD r6,`5*$BNSZ`(r4) | |
1153 | $LD r7,`0*$BNSZ`(r5) | |
1154 | $UMULL r8,r6,r7 | |
1155 | $UMULH r9,r6,r7 | |
1156 | addc r12,r12,r8 | |
1157 | adde r10,r10,r9 | |
1158 | addze r11,r11 | |
1159 | $ST r12,`5*$BNSZ`(r3) #r[5]=c3; | |
1160 | #mul_add_c(a[6],b[0],c1,c2,c3); | |
1161 | $LD r6,`6*$BNSZ`(r4) | |
1162 | $UMULL r8,r6,r7 | |
1163 | $UMULH r9,r6,r7 | |
1164 | addc r10,r10,r8 | |
1165 | adde r11,r11,r9 | |
1166 | addze r12,r0 | |
1167 | #mul_add_c(a[5],b[1],c1,c2,c3); | |
1168 | $LD r6,`5*$BNSZ`(r4) | |
1169 | $LD r7,`1*$BNSZ`(r5) | |
1170 | $UMULL r8,r6,r7 | |
1171 | $UMULH r9,r6,r7 | |
1172 | addc r10,r10,r8 | |
1173 | adde r11,r11,r9 | |
1174 | addze r12,r12 | |
1175 | #mul_add_c(a[4],b[2],c1,c2,c3); | |
1176 | $LD r6,`4*$BNSZ`(r4) | |
1177 | $LD r7,`2*$BNSZ`(r5) | |
1178 | $UMULL r8,r6,r7 | |
1179 | $UMULH r9,r6,r7 | |
1180 | addc r10,r10,r8 | |
1181 | adde r11,r11,r9 | |
1182 | addze r12,r12 | |
1183 | #mul_add_c(a[3],b[3],c1,c2,c3); | |
1184 | $LD r6,`3*$BNSZ`(r4) | |
1185 | $LD r7,`3*$BNSZ`(r5) | |
1186 | $UMULL r8,r6,r7 | |
1187 | $UMULH r9,r6,r7 | |
1188 | addc r10,r10,r8 | |
1189 | adde r11,r11,r9 | |
1190 | addze r12,r12 | |
1191 | #mul_add_c(a[2],b[4],c1,c2,c3); | |
1192 | $LD r6,`2*$BNSZ`(r4) | |
1193 | $LD r7,`4*$BNSZ`(r5) | |
1194 | $UMULL r8,r6,r7 | |
1195 | $UMULH r9,r6,r7 | |
1196 | addc r10,r10,r8 | |
1197 | adde r11,r11,r9 | |
1198 | addze r12,r12 | |
1199 | #mul_add_c(a[1],b[5],c1,c2,c3); | |
1200 | $LD r6,`1*$BNSZ`(r4) | |
1201 | $LD r7,`5*$BNSZ`(r5) | |
1202 | $UMULL r8,r6,r7 | |
1203 | $UMULH r9,r6,r7 | |
1204 | addc r10,r10,r8 | |
1205 | adde r11,r11,r9 | |
1206 | addze r12,r12 | |
1207 | #mul_add_c(a[0],b[6],c1,c2,c3); | |
1208 | $LD r6,`0*$BNSZ`(r4) | |
1209 | $LD r7,`6*$BNSZ`(r5) | |
1210 | $UMULL r8,r6,r7 | |
1211 | $UMULH r9,r6,r7 | |
1212 | addc r10,r10,r8 | |
1213 | adde r11,r11,r9 | |
1214 | addze r12,r12 | |
1215 | $ST r10,`6*$BNSZ`(r3) #r[6]=c1; | |
1216 | #mul_add_c(a[0],b[7],c2,c3,c1); | |
1217 | $LD r7,`7*$BNSZ`(r5) | |
1218 | $UMULL r8,r6,r7 | |
1219 | $UMULH r9,r6,r7 | |
1220 | addc r11,r11,r8 | |
1221 | adde r12,r12,r9 | |
1222 | addze r10,r0 | |
1223 | #mul_add_c(a[1],b[6],c2,c3,c1); | |
1224 | $LD r6,`1*$BNSZ`(r4) | |
1225 | $LD r7,`6*$BNSZ`(r5) | |
1226 | $UMULL r8,r6,r7 | |
1227 | $UMULH r9,r6,r7 | |
1228 | addc r11,r11,r8 | |
1229 | adde r12,r12,r9 | |
1230 | addze r10,r10 | |
1231 | #mul_add_c(a[2],b[5],c2,c3,c1); | |
1232 | $LD r6,`2*$BNSZ`(r4) | |
1233 | $LD r7,`5*$BNSZ`(r5) | |
1234 | $UMULL r8,r6,r7 | |
1235 | $UMULH r9,r6,r7 | |
1236 | addc r11,r11,r8 | |
1237 | adde r12,r12,r9 | |
1238 | addze r10,r10 | |
1239 | #mul_add_c(a[3],b[4],c2,c3,c1); | |
1240 | $LD r6,`3*$BNSZ`(r4) | |
1241 | $LD r7,`4*$BNSZ`(r5) | |
1242 | $UMULL r8,r6,r7 | |
1243 | $UMULH r9,r6,r7 | |
1244 | addc r11,r11,r8 | |
1245 | adde r12,r12,r9 | |
1246 | addze r10,r10 | |
1247 | #mul_add_c(a[4],b[3],c2,c3,c1); | |
1248 | $LD r6,`4*$BNSZ`(r4) | |
1249 | $LD r7,`3*$BNSZ`(r5) | |
1250 | $UMULL r8,r6,r7 | |
1251 | $UMULH r9,r6,r7 | |
1252 | addc r11,r11,r8 | |
1253 | adde r12,r12,r9 | |
1254 | addze r10,r10 | |
1255 | #mul_add_c(a[5],b[2],c2,c3,c1); | |
1256 | $LD r6,`5*$BNSZ`(r4) | |
1257 | $LD r7,`2*$BNSZ`(r5) | |
1258 | $UMULL r8,r6,r7 | |
1259 | $UMULH r9,r6,r7 | |
1260 | addc r11,r11,r8 | |
1261 | adde r12,r12,r9 | |
1262 | addze r10,r10 | |
1263 | #mul_add_c(a[6],b[1],c2,c3,c1); | |
1264 | $LD r6,`6*$BNSZ`(r4) | |
1265 | $LD r7,`1*$BNSZ`(r5) | |
1266 | $UMULL r8,r6,r7 | |
1267 | $UMULH r9,r6,r7 | |
1268 | addc r11,r11,r8 | |
1269 | adde r12,r12,r9 | |
1270 | addze r10,r10 | |
1271 | #mul_add_c(a[7],b[0],c2,c3,c1); | |
1272 | $LD r6,`7*$BNSZ`(r4) | |
1273 | $LD r7,`0*$BNSZ`(r5) | |
1274 | $UMULL r8,r6,r7 | |
1275 | $UMULH r9,r6,r7 | |
1276 | addc r11,r11,r8 | |
1277 | adde r12,r12,r9 | |
1278 | addze r10,r10 | |
1279 | $ST r11,`7*$BNSZ`(r3) #r[7]=c2; | |
1280 | #mul_add_c(a[7],b[1],c3,c1,c2); | |
1281 | $LD r7,`1*$BNSZ`(r5) | |
1282 | $UMULL r8,r6,r7 | |
1283 | $UMULH r9,r6,r7 | |
1284 | addc r12,r12,r8 | |
1285 | adde r10,r10,r9 | |
1286 | addze r11,r0 | |
1287 | #mul_add_c(a[6],b[2],c3,c1,c2); | |
1288 | $LD r6,`6*$BNSZ`(r4) | |
1289 | $LD r7,`2*$BNSZ`(r5) | |
1290 | $UMULL r8,r6,r7 | |
1291 | $UMULH r9,r6,r7 | |
1292 | addc r12,r12,r8 | |
1293 | adde r10,r10,r9 | |
1294 | addze r11,r11 | |
1295 | #mul_add_c(a[5],b[3],c3,c1,c2); | |
1296 | $LD r6,`5*$BNSZ`(r4) | |
1297 | $LD r7,`3*$BNSZ`(r5) | |
1298 | $UMULL r8,r6,r7 | |
1299 | $UMULH r9,r6,r7 | |
1300 | addc r12,r12,r8 | |
1301 | adde r10,r10,r9 | |
1302 | addze r11,r11 | |
1303 | #mul_add_c(a[4],b[4],c3,c1,c2); | |
1304 | $LD r6,`4*$BNSZ`(r4) | |
1305 | $LD r7,`4*$BNSZ`(r5) | |
1306 | $UMULL r8,r6,r7 | |
1307 | $UMULH r9,r6,r7 | |
1308 | addc r12,r12,r8 | |
1309 | adde r10,r10,r9 | |
1310 | addze r11,r11 | |
1311 | #mul_add_c(a[3],b[5],c3,c1,c2); | |
1312 | $LD r6,`3*$BNSZ`(r4) | |
1313 | $LD r7,`5*$BNSZ`(r5) | |
1314 | $UMULL r8,r6,r7 | |
1315 | $UMULH r9,r6,r7 | |
1316 | addc r12,r12,r8 | |
1317 | adde r10,r10,r9 | |
1318 | addze r11,r11 | |
1319 | #mul_add_c(a[2],b[6],c3,c1,c2); | |
1320 | $LD r6,`2*$BNSZ`(r4) | |
1321 | $LD r7,`6*$BNSZ`(r5) | |
1322 | $UMULL r8,r6,r7 | |
1323 | $UMULH r9,r6,r7 | |
1324 | addc r12,r12,r8 | |
1325 | adde r10,r10,r9 | |
1326 | addze r11,r11 | |
1327 | #mul_add_c(a[1],b[7],c3,c1,c2); | |
1328 | $LD r6,`1*$BNSZ`(r4) | |
1329 | $LD r7,`7*$BNSZ`(r5) | |
1330 | $UMULL r8,r6,r7 | |
1331 | $UMULH r9,r6,r7 | |
1332 | addc r12,r12,r8 | |
1333 | adde r10,r10,r9 | |
1334 | addze r11,r11 | |
1335 | $ST r12,`8*$BNSZ`(r3) #r[8]=c3; | |
1336 | #mul_add_c(a[2],b[7],c1,c2,c3); | |
1337 | $LD r6,`2*$BNSZ`(r4) | |
1338 | $UMULL r8,r6,r7 | |
1339 | $UMULH r9,r6,r7 | |
1340 | addc r10,r10,r8 | |
1341 | adde r11,r11,r9 | |
1342 | addze r12,r0 | |
1343 | #mul_add_c(a[3],b[6],c1,c2,c3); | |
1344 | $LD r6,`3*$BNSZ`(r4) | |
1345 | $LD r7,`6*$BNSZ`(r5) | |
1346 | $UMULL r8,r6,r7 | |
1347 | $UMULH r9,r6,r7 | |
1348 | addc r10,r10,r8 | |
1349 | adde r11,r11,r9 | |
1350 | addze r12,r12 | |
1351 | #mul_add_c(a[4],b[5],c1,c2,c3); | |
1352 | $LD r6,`4*$BNSZ`(r4) | |
1353 | $LD r7,`5*$BNSZ`(r5) | |
1354 | $UMULL r8,r6,r7 | |
1355 | $UMULH r9,r6,r7 | |
1356 | addc r10,r10,r8 | |
1357 | adde r11,r11,r9 | |
1358 | addze r12,r12 | |
1359 | #mul_add_c(a[5],b[4],c1,c2,c3); | |
1360 | $LD r6,`5*$BNSZ`(r4) | |
1361 | $LD r7,`4*$BNSZ`(r5) | |
1362 | $UMULL r8,r6,r7 | |
1363 | $UMULH r9,r6,r7 | |
1364 | addc r10,r10,r8 | |
1365 | adde r11,r11,r9 | |
1366 | addze r12,r12 | |
1367 | #mul_add_c(a[6],b[3],c1,c2,c3); | |
1368 | $LD r6,`6*$BNSZ`(r4) | |
1369 | $LD r7,`3*$BNSZ`(r5) | |
1370 | $UMULL r8,r6,r7 | |
1371 | $UMULH r9,r6,r7 | |
1372 | addc r10,r10,r8 | |
1373 | adde r11,r11,r9 | |
1374 | addze r12,r12 | |
1375 | #mul_add_c(a[7],b[2],c1,c2,c3); | |
1376 | $LD r6,`7*$BNSZ`(r4) | |
1377 | $LD r7,`2*$BNSZ`(r5) | |
1378 | $UMULL r8,r6,r7 | |
1379 | $UMULH r9,r6,r7 | |
1380 | addc r10,r10,r8 | |
1381 | adde r11,r11,r9 | |
1382 | addze r12,r12 | |
1383 | $ST r10,`9*$BNSZ`(r3) #r[9]=c1; | |
1384 | #mul_add_c(a[7],b[3],c2,c3,c1); | |
1385 | $LD r7,`3*$BNSZ`(r5) | |
1386 | $UMULL r8,r6,r7 | |
1387 | $UMULH r9,r6,r7 | |
1388 | addc r11,r11,r8 | |
1389 | adde r12,r12,r9 | |
1390 | addze r10,r0 | |
1391 | #mul_add_c(a[6],b[4],c2,c3,c1); | |
1392 | $LD r6,`6*$BNSZ`(r4) | |
1393 | $LD r7,`4*$BNSZ`(r5) | |
1394 | $UMULL r8,r6,r7 | |
1395 | $UMULH r9,r6,r7 | |
1396 | addc r11,r11,r8 | |
1397 | adde r12,r12,r9 | |
1398 | addze r10,r10 | |
1399 | #mul_add_c(a[5],b[5],c2,c3,c1); | |
1400 | $LD r6,`5*$BNSZ`(r4) | |
1401 | $LD r7,`5*$BNSZ`(r5) | |
1402 | $UMULL r8,r6,r7 | |
1403 | $UMULH r9,r6,r7 | |
1404 | addc r11,r11,r8 | |
1405 | adde r12,r12,r9 | |
1406 | addze r10,r10 | |
1407 | #mul_add_c(a[4],b[6],c2,c3,c1); | |
1408 | $LD r6,`4*$BNSZ`(r4) | |
1409 | $LD r7,`6*$BNSZ`(r5) | |
1410 | $UMULL r8,r6,r7 | |
1411 | $UMULH r9,r6,r7 | |
1412 | addc r11,r11,r8 | |
1413 | adde r12,r12,r9 | |
1414 | addze r10,r10 | |
1415 | #mul_add_c(a[3],b[7],c2,c3,c1); | |
1416 | $LD r6,`3*$BNSZ`(r4) | |
1417 | $LD r7,`7*$BNSZ`(r5) | |
1418 | $UMULL r8,r6,r7 | |
1419 | $UMULH r9,r6,r7 | |
1420 | addc r11,r11,r8 | |
1421 | adde r12,r12,r9 | |
1422 | addze r10,r10 | |
1423 | $ST r11,`10*$BNSZ`(r3) #r[10]=c2; | |
1424 | #mul_add_c(a[4],b[7],c3,c1,c2); | |
1425 | $LD r6,`4*$BNSZ`(r4) | |
1426 | $UMULL r8,r6,r7 | |
1427 | $UMULH r9,r6,r7 | |
1428 | addc r12,r12,r8 | |
1429 | adde r10,r10,r9 | |
1430 | addze r11,r0 | |
1431 | #mul_add_c(a[5],b[6],c3,c1,c2); | |
1432 | $LD r6,`5*$BNSZ`(r4) | |
1433 | $LD r7,`6*$BNSZ`(r5) | |
1434 | $UMULL r8,r6,r7 | |
1435 | $UMULH r9,r6,r7 | |
1436 | addc r12,r12,r8 | |
1437 | adde r10,r10,r9 | |
1438 | addze r11,r11 | |
1439 | #mul_add_c(a[6],b[5],c3,c1,c2); | |
1440 | $LD r6,`6*$BNSZ`(r4) | |
1441 | $LD r7,`5*$BNSZ`(r5) | |
1442 | $UMULL r8,r6,r7 | |
1443 | $UMULH r9,r6,r7 | |
1444 | addc r12,r12,r8 | |
1445 | adde r10,r10,r9 | |
1446 | addze r11,r11 | |
1447 | #mul_add_c(a[7],b[4],c3,c1,c2); | |
1448 | $LD r6,`7*$BNSZ`(r4) | |
1449 | $LD r7,`4*$BNSZ`(r5) | |
1450 | $UMULL r8,r6,r7 | |
1451 | $UMULH r9,r6,r7 | |
1452 | addc r12,r12,r8 | |
1453 | adde r10,r10,r9 | |
1454 | addze r11,r11 | |
1455 | $ST r12,`11*$BNSZ`(r3) #r[11]=c3; | |
1456 | #mul_add_c(a[7],b[5],c1,c2,c3); | |
1457 | $LD r7,`5*$BNSZ`(r5) | |
1458 | $UMULL r8,r6,r7 | |
1459 | $UMULH r9,r6,r7 | |
1460 | addc r10,r10,r8 | |
1461 | adde r11,r11,r9 | |
1462 | addze r12,r0 | |
1463 | #mul_add_c(a[6],b[6],c1,c2,c3); | |
1464 | $LD r6,`6*$BNSZ`(r4) | |
1465 | $LD r7,`6*$BNSZ`(r5) | |
1466 | $UMULL r8,r6,r7 | |
1467 | $UMULH r9,r6,r7 | |
1468 | addc r10,r10,r8 | |
1469 | adde r11,r11,r9 | |
1470 | addze r12,r12 | |
1471 | #mul_add_c(a[5],b[7],c1,c2,c3); | |
1472 | $LD r6,`5*$BNSZ`(r4) | |
1473 | $LD r7,`7*$BNSZ`(r5) | |
1474 | $UMULL r8,r6,r7 | |
1475 | $UMULH r9,r6,r7 | |
1476 | addc r10,r10,r8 | |
1477 | adde r11,r11,r9 | |
1478 | addze r12,r12 | |
1479 | $ST r10,`12*$BNSZ`(r3) #r[12]=c1; | |
1480 | #mul_add_c(a[6],b[7],c2,c3,c1); | |
1481 | $LD r6,`6*$BNSZ`(r4) | |
1482 | $UMULL r8,r6,r7 | |
1483 | $UMULH r9,r6,r7 | |
1484 | addc r11,r11,r8 | |
1485 | adde r12,r12,r9 | |
1486 | addze r10,r0 | |
1487 | #mul_add_c(a[7],b[6],c2,c3,c1); | |
1488 | $LD r6,`7*$BNSZ`(r4) | |
1489 | $LD r7,`6*$BNSZ`(r5) | |
1490 | $UMULL r8,r6,r7 | |
1491 | $UMULH r9,r6,r7 | |
1492 | addc r11,r11,r8 | |
1493 | adde r12,r12,r9 | |
1494 | addze r10,r10 | |
1495 | $ST r11,`13*$BNSZ`(r3) #r[13]=c2; | |
1496 | #mul_add_c(a[7],b[7],c3,c1,c2); | |
1497 | $LD r7,`7*$BNSZ`(r5) | |
1498 | $UMULL r8,r6,r7 | |
1499 | $UMULH r9,r6,r7 | |
1500 | addc r12,r12,r8 | |
1501 | adde r10,r10,r9 | |
1502 | $ST r12,`14*$BNSZ`(r3) #r[14]=c3; | |
1503 | $ST r10,`15*$BNSZ`(r3) #r[15]=c1; | |
31439046 | 1504 | blr |
dd558806 AP |
1505 | .long 0x00000000 |
1506 | ||
1507 | # | |
1508 | # NOTE: The following label name should be changed to | |
1509 | # "bn_sub_words" i.e. remove the first dot | |
1510 | # for the gcc compiler. This should be automatically | |
1511 | # done in the build | |
1512 | # | |
1513 | # | |
1514 | .align 4 | |
1515 | .bn_sub_words: | |
1516 | # | |
1517 | # Handcoded version of bn_sub_words | |
1518 | # | |
1519 | #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
1520 | # | |
1521 | # r3 = r | |
1522 | # r4 = a | |
1523 | # r5 = b | |
1524 | # r6 = n | |
1525 | # | |
1526 | # Note: No loop unrolling done since this is not a performance | |
1527 | # critical loop. | |
1528 | ||
1529 | xor r0,r0,r0 #set r0 = 0 | |
1530 | # | |
1531 | # check for r6 = 0 AND set carry bit. | |
1532 | # | |
1533 | subfc. r7,r0,r6 # If r6 is 0 then result is 0. | |
1534 | # if r6 > 0 then result !=0 | |
1535 | # In either case carry bit is set. | |
31439046 | 1536 | beq Lppcasm_sub_adios |
dd558806 AP |
1537 | addi r4,r4,-$BNSZ |
1538 | addi r3,r3,-$BNSZ | |
1539 | addi r5,r5,-$BNSZ | |
1540 | mtctr r6 | |
1541 | Lppcasm_sub_mainloop: | |
1542 | $LDU r7,$BNSZ(r4) | |
1543 | $LDU r8,$BNSZ(r5) | |
1544 | subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) | |
1545 | # if carry = 1 this is r7-r8. Else it | |
1546 | # is r7-r8 -1 as we need. | |
1547 | $STU r6,$BNSZ(r3) | |
31439046 | 1548 | bdnz- Lppcasm_sub_mainloop |
dd558806 AP |
1549 | Lppcasm_sub_adios: |
1550 | subfze r3,r0 # if carry bit is set then r3 = 0 else -1 | |
1551 | andi. r3,r3,1 # keep only last bit. | |
31439046 | 1552 | blr |
dd558806 AP |
1553 | .long 0x00000000 |
1554 | ||
1555 | ||
1556 | # | |
1557 | # NOTE: The following label name should be changed to | |
1558 | # "bn_add_words" i.e. remove the first dot | |
1559 | # for the gcc compiler. This should be automatically | |
1560 | # done in the build | |
1561 | # | |
1562 | ||
1563 | .align 4 | |
1564 | .bn_add_words: | |
1565 | # | |
1566 | # Handcoded version of bn_add_words | |
1567 | # | |
1568 | #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) | |
1569 | # | |
1570 | # r3 = r | |
1571 | # r4 = a | |
1572 | # r5 = b | |
1573 | # r6 = n | |
1574 | # | |
1575 | # Note: No loop unrolling done since this is not a performance | |
1576 | # critical loop. | |
1577 | ||
1578 | xor r0,r0,r0 | |
1579 | # | |
1580 | # check for r6 = 0. Is this needed? | |
1581 | # | |
1582 | addic. r6,r6,0 #test r6 and clear carry bit. | |
31439046 | 1583 | beq Lppcasm_add_adios |
dd558806 AP |
1584 | addi r4,r4,-$BNSZ |
1585 | addi r3,r3,-$BNSZ | |
1586 | addi r5,r5,-$BNSZ | |
1587 | mtctr r6 | |
1588 | Lppcasm_add_mainloop: | |
1589 | $LDU r7,$BNSZ(r4) | |
1590 | $LDU r8,$BNSZ(r5) | |
1591 | adde r8,r7,r8 | |
1592 | $STU r8,$BNSZ(r3) | |
31439046 | 1593 | bdnz- Lppcasm_add_mainloop |
dd558806 AP |
1594 | Lppcasm_add_adios: |
1595 | addze r3,r0 #return carry bit. | |
31439046 | 1596 | blr |
dd558806 AP |
1597 | .long 0x00000000 |
1598 | ||
1599 | # | |
1600 | # NOTE: The following label name should be changed to | |
1601 | # "bn_div_words" i.e. remove the first dot | |
1602 | # for the gcc compiler. This should be automatically | |
1603 | # done in the build | |
1604 | # | |
1605 | ||
1606 | .align 4 | |
1607 | .bn_div_words: | |
1608 | # | |
1609 | # This is a cleaned up version of code generated by | |
1610 | # the AIX compiler. The only optimization is to use | |
1611 | # the PPC instruction to count leading zeros instead | |
1612 | # of call to num_bits_word. Since this was compiled | |
1613 | # only at level -O2 we can possibly squeeze it more? | |
1614 | # | |
1615 | # r3 = h | |
1616 | # r4 = l | |
1617 | # r5 = d | |
1618 | ||
1619 | $UCMPI 0,r5,0 # compare r5 and 0 | |
31439046 | 1620 | bne Lppcasm_div1 # proceed if d!=0 |
dd558806 | 1621 | li r3,-1 # d=0 return -1 |
31439046 | 1622 | blr |
dd558806 AP |
1623 | Lppcasm_div1: |
1624 | xor r0,r0,r0 #r0=0 | |
aaa5dc61 AP |
1625 | li r8,$BITS |
1626 | $CNTLZ. r7,r5 #r7 = num leading 0s in d. | |
31439046 | 1627 | beq Lppcasm_div2 #proceed if no leading zeros |
aaa5dc61 AP |
1628 | subf r8,r7,r8 #r8 = BN_num_bits_word(d) |
1629 | $SHR. r9,r3,r8 #are there any bits above r8'th? | |
31efffbd | 1630 | $TR 16,r9,r0 #if there're, signal to dump core... |
dd558806 AP |
1631 | Lppcasm_div2: |
1632 | $UCMP 0,r3,r5 #h>=d? | |
31439046 | 1633 | blt Lppcasm_div3 #goto Lppcasm_div3 if not |
dd558806 AP |
1634 | subf r3,r5,r3 #h-=d ; |
1635 | Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i | |
1636 | cmpi 0,0,r7,0 # is (i == 0)? | |
31439046 | 1637 | beq Lppcasm_div4 |
dd558806 AP |
1638 | $SHL r3,r3,r7 # h = (h<< i) |
1639 | $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i) | |
1640 | $SHL r5,r5,r7 # d<<=i | |
1641 | or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i)) | |
1642 | $SHL r4,r4,r7 # l <<=i | |
1643 | Lppcasm_div4: | |
1644 | $SHRI r9,r5,`$BITS/2` # r9 = dh | |
1645 | # dl will be computed when needed | |
1646 | # as it saves registers. | |
1647 | li r6,2 #r6=2 | |
1648 | mtctr r6 #counter will be in count. | |
1649 | Lppcasm_divouterloop: | |
1650 | $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) | |
1651 | $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 | |
1652 | # compute here for innerloop. | |
1653 | $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh | |
31439046 | 1654 | bne Lppcasm_div5 # goto Lppcasm_div5 if not |
dd558806 AP |
1655 | |
1656 | li r8,-1 | |
1657 | $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l | |
1658 | b Lppcasm_div6 | |
1659 | Lppcasm_div5: | |
1660 | $UDIV r8,r3,r9 #q = h/dh | |
1661 | Lppcasm_div6: | |
1662 | $UMULL r12,r9,r8 #th = q*dh | |
1663 | $CLRU r10,r5,`$BITS/2` #r10=dl | |
1664 | $UMULL r6,r8,r10 #tl = q*dl | |
1665 | ||
1666 | Lppcasm_divinnerloop: | |
1667 | subf r10,r12,r3 #t = h -th | |
1668 | $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... | |
1669 | addic. r7,r7,0 #test if r7 == 0. used below. | |
1670 | # now want to compute | |
1671 | # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4) | |
1672 | # the following 2 instructions do that | |
1673 | $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4) | |
1674 | or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4) | |
31439046 AP |
1675 | $UCMP cr1,r6,r7 # compare (tl <= r7) |
1676 | bne Lppcasm_divinnerexit | |
1677 | ble cr1,Lppcasm_divinnerexit | |
dd558806 AP |
1678 | addi r8,r8,-1 #q-- |
1679 | subf r12,r9,r12 #th -=dh | |
1680 | $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop. | |
1681 | subf r6,r10,r6 #tl -=dl | |
1682 | b Lppcasm_divinnerloop | |
1683 | Lppcasm_divinnerexit: | |
1684 | $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4) | |
1685 | $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h; | |
31439046 | 1686 | $UCMP cr1,r4,r11 # compare l and tl |
dd558806 | 1687 | add r12,r12,r10 # th+=t |
31439046 | 1688 | bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7 |
dd558806 AP |
1689 | addi r12,r12,1 # th++ |
1690 | Lppcasm_div7: | |
1691 | subf r11,r11,r4 #r11=l-tl | |
31439046 AP |
1692 | $UCMP cr1,r3,r12 #compare h and th |
1693 | bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8 | |
dd558806 AP |
1694 | addi r8,r8,-1 # q-- |
1695 | add r3,r5,r3 # h+=d | |
1696 | Lppcasm_div8: | |
1697 | subf r12,r12,r3 #r12 = h-th | |
1698 | $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4 | |
1699 | # want to compute | |
1700 | # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2 | |
1701 | # the following 2 instructions will do this. | |
1702 | $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2. | |
1703 | $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3 | |
31439046 | 1704 | bdz Lppcasm_div9 #if (count==0) break ; |
dd558806 AP |
1705 | $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4 |
1706 | b Lppcasm_divouterloop | |
1707 | Lppcasm_div9: | |
1708 | or r3,r8,r0 | |
31439046 | 1709 | blr |
dd558806 AP |
1710 | .long 0x00000000 |
1711 | ||
1712 | # | |
1713 | # NOTE: The following label name should be changed to | |
1714 | # "bn_sqr_words" i.e. remove the first dot | |
1715 | # for the gcc compiler. This should be automatically | |
1716 | # done in the build | |
1717 | # | |
1718 | .align 4 | |
1719 | .bn_sqr_words: | |
1720 | # | |
1721 | # Optimized version of bn_sqr_words | |
1722 | # | |
1723 | # void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) | |
1724 | # | |
1725 | # r3 = r | |
1726 | # r4 = a | |
1727 | # r5 = n | |
1728 | # | |
1729 | # r6 = a[i]. | |
1730 | # r7,r8 = product. | |
1731 | # | |
1732 | # No unrolling done here. Not performance critical. | |
1733 | ||
1734 | addic. r5,r5,0 #test r5. | |
31439046 | 1735 | beq Lppcasm_sqr_adios |
dd558806 AP |
1736 | addi r4,r4,-$BNSZ |
1737 | addi r3,r3,-$BNSZ | |
1738 | mtctr r5 | |
1739 | Lppcasm_sqr_mainloop: | |
1740 | #sqr(r[0],r[1],a[0]); | |
1741 | $LDU r6,$BNSZ(r4) | |
1742 | $UMULL r7,r6,r6 | |
1743 | $UMULH r8,r6,r6 | |
1744 | $STU r7,$BNSZ(r3) | |
1745 | $STU r8,$BNSZ(r3) | |
31439046 | 1746 | bdnz- Lppcasm_sqr_mainloop |
dd558806 | 1747 | Lppcasm_sqr_adios: |
31439046 | 1748 | blr |
dd558806 AP |
1749 | .long 0x00000000 |
1750 | ||
1751 | ||
1752 | # | |
1753 | # NOTE: The following label name should be changed to | |
1754 | # "bn_mul_words" i.e. remove the first dot | |
1755 | # for the gcc compiler. This should be automatically | |
1756 | # done in the build | |
1757 | # | |
1758 | ||
1759 | .align 4 | |
1760 | .bn_mul_words: | |
1761 | # | |
1762 | # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
1763 | # | |
1764 | # r3 = rp | |
1765 | # r4 = ap | |
1766 | # r5 = num | |
1767 | # r6 = w | |
1768 | xor r0,r0,r0 | |
1769 | xor r12,r12,r12 # used for carry | |
1770 | rlwinm. r7,r5,30,2,31 # num >> 2 | |
31439046 | 1771 | beq Lppcasm_mw_REM |
dd558806 AP |
1772 | mtctr r7 |
1773 | Lppcasm_mw_LOOP: | |
1774 | #mul(rp[0],ap[0],w,c1); | |
1775 | $LD r8,`0*$BNSZ`(r4) | |
1776 | $UMULL r9,r6,r8 | |
1777 | $UMULH r10,r6,r8 | |
1778 | addc r9,r9,r12 | |
1779 | #addze r10,r10 #carry is NOT ignored. | |
1780 | #will be taken care of | |
1781 | #in second spin below | |
1782 | #using adde. | |
1783 | $ST r9,`0*$BNSZ`(r3) | |
1784 | #mul(rp[1],ap[1],w,c1); | |
1785 | $LD r8,`1*$BNSZ`(r4) | |
1786 | $UMULL r11,r6,r8 | |
1787 | $UMULH r12,r6,r8 | |
1788 | adde r11,r11,r10 | |
1789 | #addze r12,r12 | |
1790 | $ST r11,`1*$BNSZ`(r3) | |
1791 | #mul(rp[2],ap[2],w,c1); | |
1792 | $LD r8,`2*$BNSZ`(r4) | |
1793 | $UMULL r9,r6,r8 | |
1794 | $UMULH r10,r6,r8 | |
1795 | adde r9,r9,r12 | |
1796 | #addze r10,r10 | |
1797 | $ST r9,`2*$BNSZ`(r3) | |
1798 | #mul_add(rp[3],ap[3],w,c1); | |
1799 | $LD r8,`3*$BNSZ`(r4) | |
1800 | $UMULL r11,r6,r8 | |
1801 | $UMULH r12,r6,r8 | |
1802 | adde r11,r11,r10 | |
1803 | addze r12,r12 #this spin we collect carry into | |
1804 | #r12 | |
1805 | $ST r11,`3*$BNSZ`(r3) | |
1806 | ||
1807 | addi r3,r3,`4*$BNSZ` | |
1808 | addi r4,r4,`4*$BNSZ` | |
31439046 | 1809 | bdnz- Lppcasm_mw_LOOP |
dd558806 AP |
1810 | |
1811 | Lppcasm_mw_REM: | |
1812 | andi. r5,r5,0x3 | |
31439046 | 1813 | beq Lppcasm_mw_OVER |
dd558806 AP |
1814 | #mul(rp[0],ap[0],w,c1); |
1815 | $LD r8,`0*$BNSZ`(r4) | |
1816 | $UMULL r9,r6,r8 | |
1817 | $UMULH r10,r6,r8 | |
1818 | addc r9,r9,r12 | |
1819 | addze r10,r10 | |
1820 | $ST r9,`0*$BNSZ`(r3) | |
1821 | addi r12,r10,0 | |
1822 | ||
1823 | addi r5,r5,-1 | |
1824 | cmpli 0,0,r5,0 | |
31439046 | 1825 | beq Lppcasm_mw_OVER |
dd558806 AP |
1826 | |
1827 | ||
1828 | #mul(rp[1],ap[1],w,c1); | |
1829 | $LD r8,`1*$BNSZ`(r4) | |
1830 | $UMULL r9,r6,r8 | |
1831 | $UMULH r10,r6,r8 | |
1832 | addc r9,r9,r12 | |
1833 | addze r10,r10 | |
1834 | $ST r9,`1*$BNSZ`(r3) | |
1835 | addi r12,r10,0 | |
1836 | ||
1837 | addi r5,r5,-1 | |
1838 | cmpli 0,0,r5,0 | |
31439046 | 1839 | beq Lppcasm_mw_OVER |
dd558806 AP |
1840 | |
1841 | #mul_add(rp[2],ap[2],w,c1); | |
1842 | $LD r8,`2*$BNSZ`(r4) | |
1843 | $UMULL r9,r6,r8 | |
1844 | $UMULH r10,r6,r8 | |
1845 | addc r9,r9,r12 | |
1846 | addze r10,r10 | |
1847 | $ST r9,`2*$BNSZ`(r3) | |
1848 | addi r12,r10,0 | |
1849 | ||
1850 | Lppcasm_mw_OVER: | |
1851 | addi r3,r12,0 | |
31439046 | 1852 | blr |
dd558806 AP |
1853 | .long 0x00000000 |
1854 | ||
1855 | # | |
1856 | # NOTE: The following label name should be changed to | |
1857 | # "bn_mul_add_words" i.e. remove the first dot | |
1858 | # for the gcc compiler. This should be automatically | |
1859 | # done in the build | |
1860 | # | |
1861 | ||
1862 | .align 4 | |
1863 | .bn_mul_add_words: | |
1864 | # | |
1865 | # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
1866 | # | |
1867 | # r3 = rp | |
1868 | # r4 = ap | |
1869 | # r5 = num | |
1870 | # r6 = w | |
1871 | # | |
1872 | # empirical evidence suggests that unrolled version performs best!! | |
1873 | # | |
1874 | xor r0,r0,r0 #r0 = 0 | |
1875 | xor r12,r12,r12 #r12 = 0 . used for carry | |
1876 | rlwinm. r7,r5,30,2,31 # num >> 2 | |
31439046 | 1877 | beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover |
dd558806 AP |
1878 | mtctr r7 |
1879 | Lppcasm_maw_mainloop: | |
1880 | #mul_add(rp[0],ap[0],w,c1); | |
1881 | $LD r8,`0*$BNSZ`(r4) | |
1882 | $LD r11,`0*$BNSZ`(r3) | |
1883 | $UMULL r9,r6,r8 | |
1884 | $UMULH r10,r6,r8 | |
1885 | addc r9,r9,r12 #r12 is carry. | |
1886 | addze r10,r10 | |
1887 | addc r9,r9,r11 | |
1888 | #addze r10,r10 | |
1889 | #the above instruction addze | |
1890 | #is NOT needed. Carry will NOT | |
1891 | #be ignored. It's not affected | |
1892 | #by multiply and will be collected | |
1893 | #in the next spin | |
1894 | $ST r9,`0*$BNSZ`(r3) | |
1895 | ||
1896 | #mul_add(rp[1],ap[1],w,c1); | |
1897 | $LD r8,`1*$BNSZ`(r4) | |
1898 | $LD r9,`1*$BNSZ`(r3) | |
1899 | $UMULL r11,r6,r8 | |
1900 | $UMULH r12,r6,r8 | |
1901 | adde r11,r11,r10 #r10 is carry. | |
1902 | addze r12,r12 | |
1903 | addc r11,r11,r9 | |
1904 | #addze r12,r12 | |
1905 | $ST r11,`1*$BNSZ`(r3) | |
1906 | ||
1907 | #mul_add(rp[2],ap[2],w,c1); | |
1908 | $LD r8,`2*$BNSZ`(r4) | |
1909 | $UMULL r9,r6,r8 | |
1910 | $LD r11,`2*$BNSZ`(r3) | |
1911 | $UMULH r10,r6,r8 | |
1912 | adde r9,r9,r12 | |
1913 | addze r10,r10 | |
1914 | addc r9,r9,r11 | |
1915 | #addze r10,r10 | |
1916 | $ST r9,`2*$BNSZ`(r3) | |
1917 | ||
1918 | #mul_add(rp[3],ap[3],w,c1); | |
1919 | $LD r8,`3*$BNSZ`(r4) | |
1920 | $UMULL r11,r6,r8 | |
1921 | $LD r9,`3*$BNSZ`(r3) | |
1922 | $UMULH r12,r6,r8 | |
1923 | adde r11,r11,r10 | |
1924 | addze r12,r12 | |
1925 | addc r11,r11,r9 | |
1926 | addze r12,r12 | |
1927 | $ST r11,`3*$BNSZ`(r3) | |
1928 | addi r3,r3,`4*$BNSZ` | |
1929 | addi r4,r4,`4*$BNSZ` | |
31439046 | 1930 | bdnz- Lppcasm_maw_mainloop |
dd558806 AP |
1931 | |
1932 | Lppcasm_maw_leftover: | |
1933 | andi. r5,r5,0x3 | |
31439046 | 1934 | beq Lppcasm_maw_adios |
dd558806 AP |
1935 | addi r3,r3,-$BNSZ |
1936 | addi r4,r4,-$BNSZ | |
1937 | #mul_add(rp[0],ap[0],w,c1); | |
1938 | mtctr r5 | |
1939 | $LDU r8,$BNSZ(r4) | |
1940 | $UMULL r9,r6,r8 | |
1941 | $UMULH r10,r6,r8 | |
1942 | $LDU r11,$BNSZ(r3) | |
1943 | addc r9,r9,r11 | |
1944 | addze r10,r10 | |
1945 | addc r9,r9,r12 | |
1946 | addze r12,r10 | |
1947 | $ST r9,0(r3) | |
1948 | ||
31439046 | 1949 | bdz Lppcasm_maw_adios |
dd558806 AP |
1950 | #mul_add(rp[1],ap[1],w,c1); |
1951 | $LDU r8,$BNSZ(r4) | |
1952 | $UMULL r9,r6,r8 | |
1953 | $UMULH r10,r6,r8 | |
1954 | $LDU r11,$BNSZ(r3) | |
1955 | addc r9,r9,r11 | |
1956 | addze r10,r10 | |
1957 | addc r9,r9,r12 | |
1958 | addze r12,r10 | |
1959 | $ST r9,0(r3) | |
1960 | ||
31439046 | 1961 | bdz Lppcasm_maw_adios |
dd558806 AP |
1962 | #mul_add(rp[2],ap[2],w,c1); |
1963 | $LDU r8,$BNSZ(r4) | |
1964 | $UMULL r9,r6,r8 | |
1965 | $UMULH r10,r6,r8 | |
1966 | $LDU r11,$BNSZ(r3) | |
1967 | addc r9,r9,r11 | |
1968 | addze r10,r10 | |
1969 | addc r9,r9,r12 | |
1970 | addze r12,r10 | |
1971 | $ST r9,0(r3) | |
1972 | ||
1973 | Lppcasm_maw_adios: | |
1974 | addi r3,r12,0 | |
31439046 | 1975 | blr |
dd558806 AP |
1976 | .long 0x00000000 |
1977 | .align 4 | |
1978 | EOF | |
31439046 AP |
# Post-process the accumulated assembly text: every backtick-quoted
# segment in the heredoc above holds a Perl expression (e.g. `4*$BNSZ`)
# that must be folded into a literal constant before emission.
#   /g - replace every occurrence
#   /e - evaluate the replacement as Perl code
#   /m - treat $data as multi-line text
$data =~ s/\`([^\`]*)\`/eval $1/gem;

# Emit the generated assembler source on STDOUT.
print $data;

# STDOUT is a write handle: buffered-write errors (full disk, broken
# pipe) only surface at close, so a failed close must be fatal rather
# than silently ignored.
close STDOUT or die "error closing STDOUT: $!";