#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick the target based on the
# file name we are asked to generate.
#
# It should be noted though that this Perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case Perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know, there are several PowerPC ABIs in use. Most notably,
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
# are similar enough to implement leaf(!) functions which are ABI
# neutral. And that's what you find here: ABI-neutral leaf functions.
#
# AIX performance
#
# Measurements with cc on a 200 MHz PowerPC 604e.
#
# The following is the performance of 32-bit compiler
# generated code:
#
# OpenSSL 0.9.6c 21 Dec 2001
# built on: Tue Jun 11 11:06:51 EDT 2002
# options:bn(64,32) ...
# compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                     sign    verify    sign/s verify/s
# rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
# rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
# rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
# rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
# dsa  512 bits   0.0087s   0.0106s    114.3     94.5
# dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
# Same benchmark with this assembler code:
#
# rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
# rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
# rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
# rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
# dsa  512 bits   0.0052s   0.0062s    191.6    162.0
# dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
# Number of operations increases by almost 75%
#
# Here are performance numbers for 64-bit compiler
# generated code:
#
# OpenSSL 0.9.6g [engine] 9 Aug 2002
# built on: Fri Apr 18 16:59:20 EDT 2003
# options:bn(64,64) ...
# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                     sign    verify    sign/s verify/s
# rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
# rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
# rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
# rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
# dsa  512 bits   0.0026s   0.0032s    382.5    313.7
# dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
# Same benchmark with this assembler code:
#
# rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
# rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
# rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
# rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
# dsa  512 bits   0.0016s   0.0020s    610.7    507.1
# dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
# Again, performance increases by about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
# OpenSSL 0.9.7c 30 Sep 2003
#
# Original code.
#
# rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
# rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
# rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
# rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
# dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
# dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
# dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
# Same benchmark with this assembler code:
#
# rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
# rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
# rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
# rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
# dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
# dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
# dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
# Performance increase of ~60%
#
# If you have comments or suggestions to improve the code, send
# me a note at schari@us.ibm.com
#

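# A minimal invocation sketch (the flavour strings are illustrative --
# anything matching /32/ or /64/ that ppc-xlate.pl understands works;
# the second argument is handed to the xlate pipe as the output file):
#
#   perl ppc.pl linux32 ppc32.s
#   perl ppc.pl aix64   ppc64.s
#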
$flavour = shift;

if ($flavour =~ /32/) {
	$BITS=	32;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc\"";

	$LD=	"lwz";		# load
	$LDU=	"lwzu";		# load and update
	$ST=	"stw";		# store
	$STU=	"stwu";		# store and update
	$UMULL=	"mullw";	# unsigned multiply low
	$UMULH=	"mulhwu";	# unsigned multiply high
	$UDIV=	"divwu";	# unsigned divide
	$UCMPI=	"cmplwi";	# unsigned compare with immediate
	$UCMP=	"cmplw";	# unsigned compare
	$CNTLZ=	"cntlzw";	# count leading zeros
	$SHL=	"slw";		# shift left
	$SHR=	"srw";		# unsigned shift right
	$SHRI=	"srwi";		# unsigned shift right by immediate
	$SHLI=	"slwi";		# shift left by immediate
	$CLRU=	"clrlwi";	# clear upper bits
	$INSR=	"insrwi";	# insert right
	$ROTL=	"rotlwi";	# rotate left by immediate
	$TR=	"tw";		# conditional trap
} elsif ($flavour =~ /64/) {
	$BITS=	64;
	$BNSZ=	$BITS/8;
	$ISA=	"\"ppc64\"";

	# same as above, but 64-bit mnemonics...
	$LD=	"ld";		# load
	$LDU=	"ldu";		# load and update
	$ST=	"std";		# store
	$STU=	"stdu";		# store and update
	$UMULL=	"mulld";	# unsigned multiply low
	$UMULH=	"mulhdu";	# unsigned multiply high
	$UDIV=	"divdu";	# unsigned divide
	$UCMPI=	"cmpldi";	# unsigned compare with immediate
	$UCMP=	"cmpld";	# unsigned compare
	$CNTLZ=	"cntlzd";	# count leading zeros
	$SHL=	"sld";		# shift left
	$SHR=	"srd";		# unsigned shift right
	$SHRI=	"srdi";		# unsigned shift right by immediate
	$SHLI=	"sldi";		# shift left by immediate
	$CLRU=	"clrldi";	# clear upper bits
	$INSR=	"insrdi";	# insert right
	$ROTL=	"rotldi";	# rotate left by immediate
	$TR=	"td";		# conditional trap
} else { die "nonsense $flavour"; }
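# How the selection above plays out: the assembly template below is an
# interpolated heredoc, and a final s/\`...\`/eval/ pass folds the
# backticked arithmetic. E.g. the template line
#
#	$LD	r5,`1*$BNSZ`(r4)
#
# interpolates to "lwz r5,`1*4`(r4)" under a 32-bit flavour and then
# evaluates to "lwz r5,4(r4)"; under a 64-bit flavour it becomes
# "ld r5,8(r4)".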

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate ) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$data=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
#
#	1. Initial version	10/20/02	Suresh Chari
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand-optimized assembly code for the following routines
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
#.set r0,0	# we use it as storage for value of 0
#.set SP,1	# preserved
#.set RTOC,2	# preserved
#.set r3,3	# 1st argument/return value
#.set r4,4	# 2nd argument/volatile register
#.set r5,5	# 3rd argument/volatile register
#.set r6,6	# ...
#.set r7,7
#.set r8,8
#.set r9,9
#.set r10,10
#.set r11,11
#.set r12,12
#.set r13,13	# not used, nor any other "below" it...

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#		the first . i.e. for example change ".bn_sqr_comba4"
#		to "bn_sqr_comba4". This should be automatically done
#		in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"any"

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
#	r3 contains r
#	r4 contains a
#
# Freely use registers r5, r6, r7, r8, r9, r10, r11 as follows:
#
#	r5, r6 are the two BN_ULONGs being multiplied.
#	r7, r8 are the results of the 32x32 giving 64-bit multiply.
#	r9, r10, r11 are the equivalents of c1, c2, c3.
# Here's the assembly
#
#
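#
# For orientation, what one sqr_add_c expansion computes, as a C sketch
# (32-bit word size shown; illustrative, not code from this file):
#
#	/* (c3,c2,c1) += a[i]*a[i]; sqr_add_c2 adds a[i]*a[j] twice */
#	void sqr_add_c(const uint32_t *a, int i,
#	               uint32_t *c1, uint32_t *c2, uint32_t *c3)
#	{
#		uint64_t t = (uint64_t)a[i] * a[i];
#		uint64_t s = (uint64_t)*c1 + (uint32_t)t;
#		*c1 = (uint32_t)s;
#		s = (uint64_t)*c2 + (uint32_t)(t >> 32) + (s >> 32);
#		*c2 = (uint32_t)s;
#		*c3 += (uint32_t)(s >> 32);
#	}
#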
	xor	r0,r0,r0	# set r0 = 0. Used in the addze
				# instructions below

				#sqr_add_c(a,0,c1,c2,c3)
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5
	$UMULH	r10,r5,r5	#in first iteration. No need
				#to add since c1=c2=c3=0.
				# Note c3(r11) is NOT set to 0
				# but will be.

	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
				# sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7	# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0		# catch carry if any.
				# r9 = r0(=0) and carry

	addc	r10,r7,r10	# now add to temp result.
	addze	r11,r8		# r8 added to r11 which is 0
	addze	r9,r9

	$ST	r10,`1*$BNSZ`(r3)	#r[1]=c2;
				#sqr_add_c(a,1,c3,c1,c2)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
				#sqr_add_c2(a,2,0,c3,c1,c2)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
				#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1
				#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
				#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2
				#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5] = c3
				#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
#	r3 contains r
#	r4 contains a
#
# Freely use registers r5, r6, r7, r8, r9, r10, r11 as follows:
#
#	r5, r6 are the two BN_ULONGs being multiplied.
#	r7, r8 are the results of the 32x32 giving 64-bit multiply.
#	r9, r10, r11 are the equivalents of c1, c2, c3.
#
# A possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#

	xor	r0,r0,r0	#set r0 = 0. Used in addze
				#instructions below.

				#sqr_add_c(a,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$UMULL	r9,r5,r5	#1st iteration: no carries.
	$UMULH	r10,r5,r5
	$ST	r9,`0*$BNSZ`(r3)	# r[0]=c1;
				#sqr_add_c2(a,1,0,c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10	#add the two-register number
	adde	r11,r8,r0	# (r8,r7) to the three-register
	addze	r9,r0		# number (r9,r11,r10). NOTE: r0=0

	addc	r10,r7,r10	#add the two-register number
	adde	r11,r8,r11	# (r8,r7) to the three-register
	addze	r9,r9		# number (r9,r11,r10).

	$ST	r10,`1*$BNSZ`(r3)	# r[1]=c2

				#sqr_add_c(a,1,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
				#sqr_add_c2(a,2,0,c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	$ST	r11,`2*$BNSZ`(r3)	#r[2]=c3
				#sqr_add_c2(a,3,0,c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,2,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	$ST	r9,`3*$BNSZ`(r3)	#r[3]=c1;
				#sqr_add_c(a,2,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
				#sqr_add_c2(a,3,1,c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
				#sqr_add_c2(a,4,0,c2,c3,c1);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`4*$BNSZ`(r3)	#r[4]=c2;
				#sqr_add_c2(a,5,0,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
				#sqr_add_c2(a,4,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
				#sqr_add_c2(a,3,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`3*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`5*$BNSZ`(r3)	#r[5]=c3;
				#sqr_add_c(a,3,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
				#sqr_add_c2(a,4,2,c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,5,1,c1,c2,c3);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,6,0,c1,c2,c3);
	$LD	r5,`0*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`6*$BNSZ`(r3)	#r[6]=c1;
				#sqr_add_c2(a,7,0,c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
				#sqr_add_c2(a,6,1,c2,c3,c1);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
				#sqr_add_c2(a,5,2,c2,c3,c1);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
				#sqr_add_c2(a,4,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`7*$BNSZ`(r3)	#r[7]=c2;
				#sqr_add_c(a,4,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
				#sqr_add_c2(a,5,3,c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
				#sqr_add_c2(a,6,2,c3,c1,c2);
	$LD	r5,`2*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
				#sqr_add_c2(a,7,1,c3,c1,c2);
	$LD	r5,`1*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`8*$BNSZ`(r3)	#r[8]=c3;
				#sqr_add_c2(a,7,2,c1,c2,c3);
	$LD	r5,`2*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,6,3,c1,c2,c3);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
				#sqr_add_c2(a,5,4,c1,c2,c3);
	$LD	r5,`4*$BNSZ`(r4)
	$LD	r6,`5*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`9*$BNSZ`(r3)	#r[9]=c1;
				#sqr_add_c(a,5,c2,c3,c1);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
				#sqr_add_c2(a,6,4,c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
				#sqr_add_c2(a,7,3,c2,c3,c1);
	$LD	r5,`3*$BNSZ`(r4)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`10*$BNSZ`(r3)	#r[10]=c2;
				#sqr_add_c2(a,7,4,c3,c1,c2);
	$LD	r5,`4*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
				#sqr_add_c2(a,6,5,c3,c1,c2);
	$LD	r5,`5*$BNSZ`(r4)
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	$ST	r11,`11*$BNSZ`(r3)	#r[11]=c3;
				#sqr_add_c(a,6,c1,c2,c3);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
				#sqr_add_c2(a,7,5,c1,c2,c3)
	$LD	r6,`7*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	$ST	r9,`12*$BNSZ`(r3)	#r[12]=c1;

				#sqr_add_c2(a,7,6,c2,c3,c1)
	$LD	r5,`6*$BNSZ`(r4)
	$UMULL	r7,r5,r6
	$UMULH	r8,r5,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	$ST	r10,`13*$BNSZ`(r3)	#r[13]=c2;
				#sqr_add_c(a,7,c3,c1,c2);
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	$ST	r11,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r9,`15*$BNSZ`(r3)	#r[15]=c1;


	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba4" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba4:
#
# This is an optimized version of the bn_mul_comba4 routine.
#
# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
#	r3 contains r
#	r4 contains a
#	r5 contains b
#	r6, r7 are the 2 BN_ULONGs being multiplied.
#	r8, r9 are the results of the 32x32 giving 64-bit multiply.
#	r10, r11, r12 are the equivalents of c1, c2, and c3.
#
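# For orientation, one mul_add_c expansion as a C sketch (32-bit word
# size shown; illustrative, not code from this file):
#
#	/* (c3,c2,c1) += a*b */
#	void mul_add_c(uint32_t a, uint32_t b,
#	               uint32_t *c1, uint32_t *c2, uint32_t *c3)
#	{
#		uint64_t t = (uint64_t)a * b;	/* UMULL/UMULH pair */
#		uint64_t s = (uint64_t)*c1 + (uint32_t)t;
#		*c1 = (uint32_t)s;		/* addc */
#		s = (uint64_t)*c2 + (uint32_t)(t >> 32) + (s >> 32);
#		*c2 = (uint32_t)s;		/* adde */
#		*c3 += (uint32_t)(s >> 32);	/* addze */
#	}
#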
	xor	r0,r0,r0	#r0=0. Used in addze below.
				#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
				#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r0
	addze	r10,r0
				#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
				#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
				#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
				#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
				#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r0
				#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
				#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
				#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
				#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r0
				#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
				#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r8,r11
	adde	r12,r9,r12
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
				#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r0
				#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r8,r12
	adde	r10,r9,r10
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
				#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r8,r10
	adde	r11,r9,r11

	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_comba8" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_comba8:
#
# Optimized version of the bn_mul_comba8 routine.
#
# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
#	r3 contains r
#	r4 contains a
#	r5 contains b
#	r6, r7 are the 2 BN_ULONGs being multiplied.
#	r8, r9 are the results of the 32x32 giving 64-bit multiply.
#	r10, r11, r12 are the equivalents of c1, c2, and c3.
#
	xor	r0,r0,r0	#r0=0. Used in addze below.

				#mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)	#a[0]
	$LD	r7,`0*$BNSZ`(r5)	#b[0]
	$UMULL	r10,r6,r7
	$UMULH	r11,r6,r7
	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
				#mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	addze	r12,r9		# since we didn't set r12 to zero before.
	addze	r10,r0
				#mul_add_c(a[1],b[0],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
				#mul_add_c(a[2],b[0],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
				#mul_add_c(a[1],b[1],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[0],b[2],c3,c1,c2);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
				#mul_add_c(a[0],b[3],c1,c2,c3);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
				#mul_add_c(a[1],b[2],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12

				#mul_add_c(a[2],b[1],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[3],b[0],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
				#mul_add_c(a[4],b[0],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
				#mul_add_c(a[3],b[1],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[2],b[2],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[1],b[3],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[0],b[4],c2,c3,c1);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
				#mul_add_c(a[0],b[5],c3,c1,c2);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
				#mul_add_c(a[1],b[4],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[2],b[3],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[3],b[2],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[4],b[1],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[5],b[0],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
				#mul_add_c(a[6],b[0],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
				#mul_add_c(a[5],b[1],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[4],b[2],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[3],b[3],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[2],b[4],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[1],b[5],c1,c2,c3);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[0],b[6],c1,c2,c3);
	$LD	r6,`0*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
				#mul_add_c(a[0],b[7],c2,c3,c1);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
				#mul_add_c(a[1],b[6],c2,c3,c1);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[2],b[5],c2,c3,c1);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[3],b[4],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[4],b[3],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[5],b[2],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[6],b[1],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[7],b[0],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`0*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
				#mul_add_c(a[7],b[1],c3,c1,c2);
	$LD	r7,`1*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
				#mul_add_c(a[6],b[2],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[5],b[3],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[4],b[4],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[3],b[5],c3,c1,c2);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[2],b[6],c3,c1,c2);
	$LD	r6,`2*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[1],b[7],c3,c1,c2);
	$LD	r6,`1*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
				#mul_add_c(a[2],b[7],c1,c2,c3);
	$LD	r6,`2*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
				#mul_add_c(a[3],b[6],c1,c2,c3);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[4],b[5],c1,c2,c3);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[5],b[4],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[6],b[3],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[7],b[2],c1,c2,c3);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`2*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
				#mul_add_c(a[7],b[3],c2,c3,c1);
	$LD	r7,`3*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
				#mul_add_c(a[6],b[4],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[5],b[5],c2,c3,c1);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[4],b[6],c2,c3,c1);
	$LD	r6,`4*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
				#mul_add_c(a[3],b[7],c2,c3,c1);
	$LD	r6,`3*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
				#mul_add_c(a[4],b[7],c3,c1,c2);
	$LD	r6,`4*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r0
				#mul_add_c(a[5],b[6],c3,c1,c2);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[6],b[5],c3,c1,c2);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
				#mul_add_c(a[7],b[4],c3,c1,c2);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`4*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	addze	r11,r11
	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
				#mul_add_c(a[7],b[5],c1,c2,c3);
	$LD	r7,`5*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r0
				#mul_add_c(a[6],b[6],c1,c2,c3);
	$LD	r6,`6*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
				#mul_add_c(a[5],b[7],c1,c2,c3);
	$LD	r6,`5*$BNSZ`(r4)
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r10,r10,r8
	adde	r11,r11,r9
	addze	r12,r12
	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
				#mul_add_c(a[6],b[7],c2,c3,c1);
	$LD	r6,`6*$BNSZ`(r4)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r0
				#mul_add_c(a[7],b[6],c2,c3,c1);
	$LD	r6,`7*$BNSZ`(r4)
	$LD	r7,`6*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r11,r11,r8
	adde	r12,r12,r9
	addze	r10,r10
	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
				#mul_add_c(a[7],b[7],c3,c1,c2);
	$LD	r7,`7*$BNSZ`(r5)
	$UMULL	r8,r6,r7
	$UMULH	r9,r6,r7
	addc	r12,r12,r8
	adde	r10,r10,r9
	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sub_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
#
.align	4
.bn_sub_words:
#
# Handcoded version of bn_sub_words
#
#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

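# For orientation, a C-level sketch of the contract this routine
# implements (modeled on the portable reference semantics; 32-bit word
# size shown for concreteness -- illustrative, not code from this file):
#
#	uint32_t bn_sub_words_ref(uint32_t *r, const uint32_t *a,
#	                          const uint32_t *b, int n)
#	{
#		uint32_t borrow = 0;
#		for (int i = 0; i < n; i++) {
#			uint64_t t = (uint64_t)a[i] - b[i] - borrow;
#			r[i] = (uint32_t)t;
#			borrow = (uint32_t)(t >> 32) & 1; /* 1 on underflow */
#		}
#		return borrow;
#	}
#
# The subfc./subfe chain below keeps the inverted borrow in the CA bit
# and materializes it once at the end via subfze/andi.
#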
	xor	r0,r0,r0	#set r0 = 0
#
#	check for r6 = 0 AND set carry bit.
#
	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
				# If r6 > 0 then result != 0.
				# In either case carry bit is set.
	beq	Lppcasm_sub_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_sub_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	subfe	r6,r8,r7	# r6 = r7 + carry bit + ones-complement(r8)
				# if carry = 1 this is r7-r8. Else it
				# is r7-r8-1 as we need.
	$STU	r6,$BNSZ(r3)
	bdnz-	Lppcasm_sub_mainloop
Lppcasm_sub_adios:
	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
	andi.	r3,r3,1		# keep only last bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_add_words:
#
# Handcoded version of bn_add_words
#
#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#
#	r3 = r
#	r4 = a
#	r5 = b
#	r6 = n
#
# Note:	No loop unrolling done since this is not a performance
#	critical loop.

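# For orientation, the equivalent C (reference semantics; 32-bit word
# size shown -- illustrative, not code from this file):
#
#	uint32_t bn_add_words_ref(uint32_t *r, const uint32_t *a,
#	                          const uint32_t *b, int n)
#	{
#		uint64_t t = 0;			/* running carry */
#		for (int i = 0; i < n; i++) {
#			t += (uint64_t)a[i] + b[i];
#			r[i] = (uint32_t)t;
#			t >>= 32;
#		}
#		return (uint32_t)t;
#	}
#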
	xor	r0,r0,r0
#
#	check for r6 = 0. Is this needed?
#
	addic.	r6,r6,0		#test r6 and clear carry bit.
	beq	Lppcasm_add_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	addi	r5,r5,-$BNSZ
	mtctr	r6
Lppcasm_add_mainloop:
	$LDU	r7,$BNSZ(r4)
	$LDU	r8,$BNSZ(r5)
	adde	r8,r7,r8
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_add_mainloop
Lppcasm_add_adios:
	addze	r3,r0		#return carry bit.
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_div_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_div_words:
#
# This is a cleaned-up version of code generated by
# the AIX compiler. The only optimization is to use
# the PPC instruction to count leading zeros instead
# of a call to num_bits_word. Since this was compiled
# only at level -O2 we can possibly squeeze it more?
#
#	r3 = h
#	r4 = l
#	r5 = d

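# Functionally (a hedged sketch; callers are expected to pass h < d,
# which the conditional trap below enforces after normalization):
#
#	uint32_t bn_div_words_ref(uint32_t h, uint32_t l, uint32_t d)
#	{
#		if (d == 0) return (uint32_t)-1;
#		return (uint32_t)((((uint64_t)h << 32) | l) / d);
#	}
#
# Lacking a double-word-by-word divide instruction, the code below
# normalizes d and computes the quotient in two half-word steps (dh/dl),
# refining each partial quotient in Lppcasm_divinnerloop.
#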
	$UCMPI	0,r5,0			# compare r5 and 0
	bne	Lppcasm_div1		# proceed if d!=0
	li	r3,-1			# d=0 return -1
	blr
Lppcasm_div1:
	xor	r0,r0,r0		#r0=0
	li	r8,$BITS
	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
	beq	Lppcasm_div2		#proceed if no leading zeros
	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
	$SHR.	r9,r3,r8		#are there any bits above r8'th?
	$TR	16,r9,r0		#if there are, signal to dump core...
Lppcasm_div2:
	$UCMP	0,r3,r5			#h>=d?
	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
	subf	r3,r5,r3		#h-=d;
Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
	cmpi	0,0,r7,0		# is (i == 0)?
	beq	Lppcasm_div4
	$SHL	r3,r3,r7		# h = (h<<i)
	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2-i)
	$SHL	r5,r5,r7		# d<<=i
	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
	$SHL	r4,r4,r7		# l<<=i
Lppcasm_div4:
	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
					# dl will be computed when needed
					# as it saves registers.
	li	r6,2			#r6=2
	mtctr	r6			#counter will be in count.
Lppcasm_divouterloop:
	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
	$SHRI	r11,r4,`$BITS/2`	#r11 = (l&BN_MASK2h)>>BN_BITS4
					# compute here for innerloop.
	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
	bne	Lppcasm_div5		# goto Lppcasm_div5 if not

	li	r8,-1
	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
	b	Lppcasm_div6
Lppcasm_div5:
	$UDIV	r8,r3,r9		#q = h/dh
Lppcasm_div6:
	$UMULL	r12,r9,r8		#th = q*dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl
	$UMULL	r6,r8,r10		#tl = q*dl

Lppcasm_divinnerloop:
	subf	r10,r12,r3		#t = h-th
	$SHRI	r7,r10,`$BITS/2`	#r7 = (t&BN_MASK2H), sort of...
	addic.	r7,r7,0			#test if r7 == 0. used below.
					# now want to compute
					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
					# the following 2 instructions do that
	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
	$UCMP	cr1,r6,r7		# is tl <= r7 ?
	bne	Lppcasm_divinnerexit
	ble	cr1,Lppcasm_divinnerexit
	addi	r8,r8,-1		#q--
	subf	r12,r9,r12		#th -= dh
	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
	subf	r6,r10,r6		#tl -= dl
	b	Lppcasm_divinnerloop
Lppcasm_divinnerexit:
	$SHRI	r10,r6,`$BITS/2`	#t = (tl>>BN_BITS4)
	$SHLI	r11,r6,`$BITS/2`	#tl = (tl<<BN_BITS4)&BN_MASK2h;
	$UCMP	cr1,r4,r11		# compare l and tl
	add	r12,r12,r10		# th += t
	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
	addi	r12,r12,1		# th++
Lppcasm_div7:
	subf	r11,r11,r4		#r11 = l-tl
	$UCMP	cr1,r3,r12		#compare h and th
	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
	addi	r8,r8,-1		# q--
	add	r3,r5,r3		# h += d
Lppcasm_div8:
	subf	r12,r12,r3		#r12 = h-th
	$SHLI	r4,r11,`$BITS/2`	#l = (l&BN_MASK2l)<<BN_BITS4
					# want to compute
					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
					# the following 2 instructions will do this.
	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
	bdz	Lppcasm_div9		#if (count==0) break;
	$SHLI	r0,r8,`$BITS/2`		#ret = q<<BN_BITS4
	b	Lppcasm_divouterloop
Lppcasm_div9:
	or	r3,r8,r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_sqr_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#
.align	4
.bn_sqr_words:
#
# Optimized version of bn_sqr_words
#
# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
#
#	r3 = r
#	r4 = a
#	r5 = n
#
#	r6 = a[i].
#	r7,r8 = product.
#
# No unrolling done here. Not performance critical.

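# Equivalent C for reference (32-bit word size shown; illustrative,
# not code from this file):
#
#	void bn_sqr_words_ref(uint32_t *r, const uint32_t *a, int n)
#	{
#		for (int i = 0; i < n; i++) {	/* r holds 2*n words */
#			uint64_t t = (uint64_t)a[i] * a[i];
#			r[2*i]   = (uint32_t)t;		/* low word */
#			r[2*i+1] = (uint32_t)(t >> 32);	/* high word */
#		}
#	}
#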
	addic.	r5,r5,0		#test r5.
	beq	Lppcasm_sqr_adios
	addi	r4,r4,-$BNSZ
	addi	r3,r3,-$BNSZ
	mtctr	r5
Lppcasm_sqr_mainloop:
				#sqr(r[0],r[1],a[0]);
	$LDU	r6,$BNSZ(r4)
	$UMULL	r7,r6,r6
	$UMULH	r8,r6,r6
	$STU	r7,$BNSZ(r3)
	$STU	r8,$BNSZ(r3)
	bdnz-	Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
#	r3 = rp
#	r4 = ap
#	r5 = num
#	r6 = w
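#
# Equivalent C for reference (32-bit word size shown; illustrative,
# not code from this file):
#
#	uint32_t bn_mul_words_ref(uint32_t *rp, const uint32_t *ap,
#	                          int num, uint32_t w)
#	{
#		uint32_t c = 0;			/* running carry word */
#		for (int i = 0; i < num; i++) {
#			uint64_t t = (uint64_t)ap[i] * w + c;
#			rp[i] = (uint32_t)t;
#			c = (uint32_t)(t >> 32);
#		}
#		return c;
#	}
#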
	xor	r0,r0,r0
	xor	r12,r12,r12	# used for carry
	rlwinm.	r7,r5,30,2,31	# num >> 2
	beq	Lppcasm_mw_REM
	mtctr	r7
Lppcasm_mw_LOOP:
				#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	#addze	r10,r10		#carry is NOT ignored.
				#will be taken care of
				#in second spin below
				#using adde.
	$ST	r9,`0*$BNSZ`(r3)
				#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)
				#mul(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
				#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12		#this spin we collect carry into r12
	$ST	r11,`3*$BNSZ`(r3)

	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_mw_LOOP

Lppcasm_mw_REM:
	andi.	r5,r5,0x3
	beq	Lppcasm_mw_OVER
				#mul(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`0*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

				#mul(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`1*$BNSZ`(r3)
	addi	r12,r10,0

	addi	r5,r5,-1
	cmpli	0,0,r5,0
	beq	Lppcasm_mw_OVER

				#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12
	addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)
	addi	r12,r10,0

Lppcasm_mw_OVER:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0

#
#	NOTE:	The following label name should be changed to
#		"bn_mul_add_words" i.e. remove the first dot
#		for the gcc compiler. This should be automatically
#		done in the build
#

.align	4
.bn_mul_add_words:
#
# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
#
#	r3 = rp
#	r4 = ap
#	r5 = num
#	r6 = w
#
# Empirical evidence suggests that the unrolled version performs best!
#
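# Equivalent C for reference (32-bit word size shown; illustrative,
# not code from this file):
#
#	uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
#	                              int num, uint32_t w)
#	{
#		uint32_t c = 0;			/* running carry word */
#		for (int i = 0; i < num; i++) {
#			uint64_t t = (uint64_t)ap[i] * w + rp[i] + c;
#			rp[i] = (uint32_t)t;
#			c = (uint32_t)(t >> 32);
#		}
#		return c;
#	}
#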
	xor	r0,r0,r0	#r0 = 0
	xor	r12,r12,r12	#r12 = 0. used for carry
	rlwinm.	r7,r5,30,2,31	# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) go to Lppcasm_maw_leftover
	mtctr	r7
Lppcasm_maw_mainloop:
				#mul_add(rp[0],ap[0],w,c1);
	$LD	r8,`0*$BNSZ`(r4)
	$LD	r11,`0*$BNSZ`(r3)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	addc	r9,r9,r12	#r12 is carry.
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
				#the above instruction addze
				#is NOT needed. Carry will NOT
				#be ignored. It's not affected
				#by multiply and will be collected
				#in the next spin
	$ST	r9,`0*$BNSZ`(r3)

				#mul_add(rp[1],ap[1],w,c1);
	$LD	r8,`1*$BNSZ`(r4)
	$LD	r9,`1*$BNSZ`(r3)
	$UMULL	r11,r6,r8
	$UMULH	r12,r6,r8
	adde	r11,r11,r10	#r10 is carry.
	addze	r12,r12
	addc	r11,r11,r9
	#addze	r12,r12
	$ST	r11,`1*$BNSZ`(r3)

				#mul_add(rp[2],ap[2],w,c1);
	$LD	r8,`2*$BNSZ`(r4)
	$UMULL	r9,r6,r8
	$LD	r11,`2*$BNSZ`(r3)
	$UMULH	r10,r6,r8
	adde	r9,r9,r12
	addze	r10,r10
	addc	r9,r9,r11
	#addze	r10,r10
	$ST	r9,`2*$BNSZ`(r3)

				#mul_add(rp[3],ap[3],w,c1);
	$LD	r8,`3*$BNSZ`(r4)
	$UMULL	r11,r6,r8
	$LD	r9,`3*$BNSZ`(r3)
	$UMULH	r12,r6,r8
	adde	r11,r11,r10
	addze	r12,r12
	addc	r11,r11,r9
	addze	r12,r12
	$ST	r11,`3*$BNSZ`(r3)
	addi	r3,r3,`4*$BNSZ`
	addi	r4,r4,`4*$BNSZ`
	bdnz-	Lppcasm_maw_mainloop

Lppcasm_maw_leftover:
	andi.	r5,r5,0x3
	beq	Lppcasm_maw_adios
	addi	r3,r3,-$BNSZ
	addi	r4,r4,-$BNSZ
				#mul_add(rp[0],ap[0],w,c1);
	mtctr	r5
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
				#mul_add(rp[1],ap[1],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

	bdz	Lppcasm_maw_adios
				#mul_add(rp[2],ap[2],w,c1);
	$LDU	r8,$BNSZ(r4)
	$UMULL	r9,r6,r8
	$UMULH	r10,r6,r8
	$LDU	r11,$BNSZ(r3)
	addc	r9,r9,r11
	addze	r10,r10
	addc	r9,r9,r12
	addze	r12,r10
	$ST	r9,0(r3)

Lppcasm_maw_adios:
	addi	r3,r12,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.align	4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;
print $data;
close STDOUT;