]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
83cf7abf | 2 | # Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
367ace68 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
dacdcf3c AP |
9 | # |
10 | # ==================================================================== | |
e3713c36 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
dacdcf3c AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | ||
17 | # January 2010 | |
18 | # | |
19 | # "Teaser" Montgomery multiplication module for IA-64. There are | |
20 | # several possibilities for improvement: | |
21 | # | |
22 | # - modulo-scheduling outer loop would eliminate quite a number of | |
23 | # stalls after ldf8, xma and getf.sig outside inner loop and | |
24 | # improve shorter key performance; | |
25 | # - shorter vector support [with input vectors being fetched only | |
26 | # once] should be added; | |
27 | # - 2x unroll with help of n0[1] would make the code scalable on | |
28 | # "wider" IA-64, "wider" than Itanium 2 that is, which is not of | |
29 | # acute interest, because upcoming Tukwila's individual cores are | |
30 | # reportedly based on Itanium 2 design; | |
31 | # - dedicated squaring procedure(?); | |
32 | # | |
4407700c AP |
33 | # January 2010 |
34 | # | |
35 | # Shorter vector support is implemented by zero-padding ap and np | |
36 | # vectors up to 8 elements, or 512 bits. This means that 256-bit | |
37 | # inputs will be processed only 2 times faster than 512-bit inputs, | |
38 | # not 4 [as one would expect, because algorithm complexity is n^2]. | |
39 | # The reason for padding is that inputs shorter than 512 bits won't | |
40 | # be processed faster anyway, because minimal critical path of the | |
41 | # core loop happens to match 512-bit timing. Either way, it resulted | |
42 | # in >100% improvement of 512-bit RSA sign benchmark and 50% - of | |
43 | # 1024-bit one [in comparison to original version of *this* module]. | |
44 | # | |
dacdcf3c AP |
45 | # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* |
46 | # this module is: | |
47 | # sign verify sign/s verify/s | |
a000759a AP |
48 | # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 |
49 | # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 | |
dacdcf3c AP |
50 | # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 |
51 | # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 | |
a000759a | 52 | # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 |
dacdcf3c AP |
53 | # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 |
54 | # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 | |
55 | # | |
a000759a | 56 | # ... and *without* (but still with ia64.S): |
dacdcf3c AP |
57 | # |
58 | # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 | |
59 | # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 | |
60 | # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 | |
61 | # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 | |
62 | # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 | |
63 | # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 | |
64 | # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 | |
65 | # | |
a000759a AP |
66 | # As can be seen, RSA sign performance improves by 130-30%, with |
67 | # the gain shrinking for longer keys, while verify improves by 74-13%. | |
68 | # DSA performance improves by 115-30%. | |
dacdcf3c | 69 | |
1aa89a7a RL |
70 | # The trailing argument is taken as the output file name when it ends in a dot followed by word characters
71 | $output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop : undef; | |
6bd7a4d9 | 72 | |
dacdcf3c AP |
73 | if ($^O eq "hpux") { # on HP-UX use addp4 unless +DD64 or -mlp64 is among the remaining arguments |
74 | $ADDP="addp4"; | |
75 | for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); } # grouped alternation; a [...] class here would accept any single listed character before "64" | |
76 | } else { $ADDP="add"; } # every other platform gets plain add | |
77 | ||
78 | $code=<<___; | |
79 | .explicit | |
80 | .text | |
4407700c | 81 | |
dacdcf3c AP |
82 | // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, |
83 | // const BN_ULONG *bp,const BN_ULONG *np, | |
609b0852 | 84 | // const BN_ULONG *n0p,int num); |
56c5f703 | 85 | .align 64 |
dacdcf3c AP |
86 | .global bn_mul_mont# |
87 | .proc bn_mul_mont# | |
4407700c AP |
88 | bn_mul_mont: |
89 | .prologue | |
90 | .body | |
91 | { .mmi; cmp4.le p6,p7=2,r37;; | |
92 | (p6) cmp4.lt.unc p8,p9=8,r37 | |
93 | mov ret0=r0 };; | |
94 | { .bbb; | |
95 | (p9) br.cond.dptk.many bn_mul_mont_8 | |
96 | (p8) br.cond.dpnt.many bn_mul_mont_general | |
97 | (p7) br.ret.spnt.many b0 };; | |
98 | .endp bn_mul_mont# | |
99 | \f | |
100 | prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; | |
dacdcf3c | 101 | |
4407700c AP |
102 | rptr=r8; aptr=r9; bptr=r14; nptr=r15; |
103 | tptr=r16; // &tp[0] | |
104 | tp_1=r17; // &tp[-1] | |
105 | num=r18; len=r19; lc=r20; | |
106 | topbit=r21; // carry bit from tmp[num] | |
dacdcf3c | 107 | |
4407700c AP |
108 | n0=f6; |
109 | m0=f7; | |
110 | bi=f8; | |
dacdcf3c | 111 | |
56c5f703 | 112 | .align 64 |
4407700c AP |
113 | .local bn_mul_mont_general# |
114 | .proc bn_mul_mont_general# | |
4407700c | 115 | bn_mul_mont_general: |
dacdcf3c AP |
116 | .prologue |
117 | { .mmi; .save ar.pfs,prevfs | |
118 | alloc prevfs=ar.pfs,6,2,0,8 | |
119 | $ADDP aptr=0,in1 | |
120 | .save ar.lc,prevlc | |
121 | mov prevlc=ar.lc } | |
122 | { .mmi; .vframe prevsp | |
123 | mov prevsp=sp | |
124 | $ADDP bptr=0,in2 | |
dacdcf3c | 125 | .save pr,prevpr |
4407700c | 126 | mov prevpr=pr };; |
dacdcf3c AP |
127 | |
128 | .body | |
129 | .rotf alo[6],nlo[4],ahi[8],nhi[6] | |
130 | .rotr a[3],n[3],t[2] | |
131 | ||
132 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | |
133 | ldf8 alo[4]=[aptr],16 // ap[0] | |
134 | $ADDP r30=8,in1 };; | |
135 | { .mmi; ldf8 alo[3]=[r30],16 // ap[1] | |
136 | ldf8 alo[2]=[aptr],16 // ap[2] | |
137 | $ADDP in4=0,in4 };; | |
138 | { .mmi; ldf8 alo[1]=[r30] // ap[3] | |
139 | ldf8 n0=[in4] // n0 | |
140 | $ADDP rptr=0,in0 } | |
141 | { .mmi; $ADDP nptr=0,in3 | |
142 | mov r31=16 | |
143 | zxt4 num=in5 };; | |
144 | { .mmi; ldf8 nlo[2]=[nptr],8 // np[0] | |
145 | shladd len=num,3,r0 | |
146 | shladd r31=num,3,r31 };; | |
147 | { .mmi; ldf8 nlo[1]=[nptr],8 // np[1] | |
148 | add lc=-5,num | |
149 | sub r31=sp,r31 };; | |
150 | { .mfb; and sp=-16,r31 // alloca | |
151 | xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] | |
152 | nop.b 0 } | |
153 | { .mfb; nop.m 0 | |
154 | xmpy.lu alo[4]=alo[4],bi | |
155 | brp.loop.imp .L1st_ctop,.L1st_cend-16 | |
156 | };; | |
157 | { .mfi; nop.m 0 | |
158 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] | |
74f22606 | 159 | add tp_1=8,sp } |
dacdcf3c AP |
160 | { .mfi; nop.m 0 |
161 | xma.lu alo[3]=alo[3],bi,ahi[2] | |
162 | mov pr.rot=0x20001f<<16 | |
163 | // ------^----- (p40) at first (p23) | |
164 | // ----------^^ p[16:20]=1 | |
165 | };; | |
166 | { .mfi; nop.m 0 | |
167 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 | |
168 | mov ar.lc=lc } | |
169 | { .mfi; nop.m 0 | |
170 | fcvt.fxu.s1 nhi[1]=f0 | |
171 | mov ar.ec=8 };; | |
172 | ||
173 | .align 32 | |
174 | .L1st_ctop: | |
175 | .pred.rel "mutex",p40,p42 | |
176 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | |
177 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | |
178 | (p40) add n[2]=n[2],a[2] } // (p23) } | |
179 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) | |
180 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | |
181 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | |
182 | { .mfi; (p21) getf.sig a[0]=alo[5] | |
183 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | |
184 | (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) | |
185 | { .mfi; (p23) st8 [tp_1]=n[2],8 | |
186 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | |
187 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | |
188 | { .mmb; (p21) getf.sig n[0]=nlo[3] | |
189 | (p16) nop.m 0 | |
190 | br.ctop.sptk .L1st_ctop };; | |
191 | .L1st_cend: | |
192 | ||
193 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | |
194 | getf.sig n[0]=nhi[4] | |
195 | add num=-1,num };; // num-- | |
196 | { .mmi; .pred.rel "mutex",p40,p42 | |
197 | (p40) add n[0]=n[0],a[0] | |
198 | (p42) add n[0]=n[0],a[0],1 | |
199 | sub aptr=aptr,len };; // rewind | |
200 | { .mmi; .pred.rel "mutex",p40,p42 | |
201 | (p40) cmp.ltu p41,p39=n[0],a[0] | |
202 | (p42) cmp.leu p41,p39=n[0],a[0] | |
203 | sub nptr=nptr,len };; | |
204 | { .mmi; .pred.rel "mutex",p39,p41 | |
205 | (p39) add topbit=r0,r0 | |
206 | (p41) add topbit=r0,r0,1 | |
609b0852 | 207 | nop.i 0 } |
dacdcf3c | 208 | { .mmi; st8 [tp_1]=n[0] |
74f22606 AP |
209 | add tptr=16,sp |
210 | add tp_1=8,sp };; | |
211 | \f | |
dacdcf3c AP |
212 | .Louter: |
213 | { .mmi; ldf8 bi=[bptr],8 // (*bp++) | |
214 | ldf8 ahi[3]=[tptr] // tp[0] | |
215 | add r30=8,aptr };; | |
216 | { .mmi; ldf8 alo[4]=[aptr],16 // ap[0] | |
217 | ldf8 alo[3]=[r30],16 // ap[1] | |
218 | add r31=8,nptr };; | |
219 | { .mfb; ldf8 alo[2]=[aptr],16 // ap[2] | |
220 | xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] | |
221 | brp.loop.imp .Linner_ctop,.Linner_cend-16 | |
222 | } | |
223 | { .mfb; ldf8 alo[1]=[r30] // ap[3] | |
224 | xma.lu alo[4]=alo[4],bi,ahi[3] | |
225 | clrrrb.pr };; | |
226 | { .mfi; ldf8 nlo[2]=[nptr],16 // np[0] | |
227 | xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] | |
228 | nop.i 0 } | |
229 | { .mfi; ldf8 nlo[1]=[r31] // np[1] | |
230 | xma.lu alo[3]=alo[3],bi,ahi[2] | |
231 | mov pr.rot=0x20101f<<16 | |
232 | // ------^----- (p40) at first (p23) | |
233 | // --------^--- (p30) at first (p22) | |
234 | // ----------^^ p[16:20]=1 | |
235 | };; | |
236 | { .mfi; st8 [tptr]=r0 // tp[0] is already accounted | |
237 | xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 | |
238 | mov ar.lc=lc } | |
239 | { .mfi; | |
240 | fcvt.fxu.s1 nhi[1]=f0 | |
241 | mov ar.ec=8 };; | |
242 | ||
243 | // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in | |
244 | // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 | |
245 | // in latter case accounts for two-tick pipeline stall, which means | |
246 | // that its performance would be ~20% lower than optimal one. No | |
247 | // attempt was made to address this, because original Itanium is | |
248 | // hardly represented out in the wild... | |
249 | .align 32 | |
250 | .Linner_ctop: | |
251 | .pred.rel "mutex",p40,p42 | |
252 | .pred.rel "mutex",p30,p32 | |
253 | { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) | |
254 | (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] | |
255 | (p40) add n[2]=n[2],a[2] } // (p23) | |
256 | { .mfi; (p16) nop.m 0 | |
257 | (p18) xma.lu alo[2]=alo[2],bi,ahi[1] | |
258 | (p42) add n[2]=n[2],a[2],1 };; // (p23) | |
259 | { .mfi; (p21) getf.sig a[0]=alo[5] | |
260 | (p16) nop.f 0 | |
261 | (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) | |
262 | { .mfi; (p21) ld8 t[0]=[tptr],8 | |
263 | (p16) nop.f 0 | |
264 | (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) | |
265 | { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) | |
266 | (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] | |
267 | (p30) add a[1]=a[1],t[1] } // (p22) | |
268 | { .mfi; (p16) nop.m 0 | |
269 | (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] | |
270 | (p32) add a[1]=a[1],t[1],1 };; // (p22) | |
271 | { .mmi; (p21) getf.sig n[0]=nlo[3] | |
272 | (p16) nop.m 0 | |
273 | (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) | |
274 | { .mmb; (p23) st8 [tp_1]=n[2],8 | |
275 | (p32) cmp.leu p31,p29=a[1],t[1] // (p22) | |
276 | br.ctop.sptk .Linner_ctop };; | |
277 | .Linner_cend: | |
278 | ||
279 | { .mmi; getf.sig a[0]=ahi[6] // (p24) | |
280 | getf.sig n[0]=nhi[4] | |
281 | nop.i 0 };; | |
282 | ||
283 | { .mmi; .pred.rel "mutex",p31,p33 | |
284 | (p31) add a[0]=a[0],topbit | |
285 | (p33) add a[0]=a[0],topbit,1 | |
286 | mov topbit=r0 };; | |
287 | { .mfi; .pred.rel "mutex",p31,p33 | |
288 | (p31) cmp.ltu p32,p30=a[0],topbit | |
289 | (p33) cmp.leu p32,p30=a[0],topbit | |
290 | } | |
291 | { .mfi; .pred.rel "mutex",p40,p42 | |
292 | (p40) add n[0]=n[0],a[0] | |
293 | (p42) add n[0]=n[0],a[0],1 | |
294 | };; | |
295 | { .mmi; .pred.rel "mutex",p44,p46 | |
296 | (p40) cmp.ltu p41,p39=n[0],a[0] | |
297 | (p42) cmp.leu p41,p39=n[0],a[0] | |
298 | (p32) add topbit=r0,r0,1 } | |
299 | ||
300 | { .mmi; st8 [tp_1]=n[0],8 | |
301 | cmp4.ne p6,p0=1,num | |
302 | sub aptr=aptr,len };; // rewind | |
303 | { .mmi; sub nptr=nptr,len | |
304 | (p41) add topbit=r0,r0,1 | |
74f22606 AP |
305 | add tptr=16,sp } |
306 | { .mmb; add tp_1=8,sp | |
dacdcf3c AP |
307 | add num=-1,num // num-- |
308 | (p6) br.cond.sptk.many .Louter };; | |
309 | \f | |
310 | { .mbb; add lc=4,lc | |
311 | brp.loop.imp .Lsub_ctop,.Lsub_cend-16 | |
312 | clrrrb.pr };; | |
313 | { .mii; nop.m 0 | |
314 | mov pr.rot=0x10001<<16 | |
315 | // ------^---- (p33) at first (p17) | |
316 | mov ar.lc=lc } | |
317 | { .mii; nop.m 0 | |
318 | mov ar.ec=3 | |
319 | nop.i 0 };; | |
320 | ||
321 | .Lsub_ctop: | |
322 | .pred.rel "mutex",p33,p35 | |
323 | { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) | |
324 | (p16) nop.f 0 | |
325 | (p33) sub n[1]=t[1],n[1] } // (p17) | |
326 | { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) | |
327 | (p16) nop.f 0 | |
328 | (p35) sub n[1]=t[1],n[1],1 };; // (p17) | |
329 | { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r | |
330 | (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) | |
331 | (p18) nop.b 0 } | |
332 | { .mib; (p18) nop.m 0 | |
333 | (p35) cmp.geu p34,p32=n[1],t[1] // (p17) | |
334 | br.ctop.sptk .Lsub_ctop };; | |
335 | .Lsub_cend: | |
336 | ||
337 | { .mmb; .pred.rel "mutex",p34,p36 | |
338 | (p34) sub topbit=topbit,r0 // (p19) | |
339 | (p36) sub topbit=topbit,r0,1 | |
340 | brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 | |
341 | } | |
342 | { .mmb; sub rptr=rptr,len // rewind | |
343 | sub tptr=tptr,len | |
344 | clrrrb.pr };; | |
774ff8fe AP |
345 | { .mmi; mov aptr=rptr |
346 | mov bptr=tptr | |
dacdcf3c | 347 | mov pr.rot=1<<16 };; |
774ff8fe | 348 | { .mii; cmp.eq p0,p6=topbit,r0 |
dacdcf3c | 349 | mov ar.lc=lc |
774ff8fe | 350 | mov ar.ec=2 };; |
dacdcf3c AP |
351 | |
352 | .Lcopy_ctop: | |
774ff8fe AP |
353 | { .mmi; (p16) ld8 a[0]=[aptr],8 |
354 | (p16) ld8 t[0]=[bptr],8 | |
355 | (p6) mov a[1]=t[1] };; // (p17) | |
356 | { .mmb; (p17) st8 [rptr]=a[1],8 | |
357 | (p17) st8 [tptr]=r0,8 | |
dacdcf3c AP |
358 | br.ctop.sptk .Lcopy_ctop };; |
359 | .Lcopy_cend: | |
360 | ||
361 | { .mmi; mov ret0=1 // signal "handled" | |
362 | rum 1<<5 // clear um.mfh | |
363 | mov ar.lc=prevlc } | |
364 | { .mib; .restore sp | |
365 | mov sp=prevsp | |
56c5f703 | 366 | mov pr=prevpr,0x1ffff |
dacdcf3c | 367 | br.ret.sptk.many b0 };; |
4407700c AP |
368 | .endp bn_mul_mont_general# |
369 | \f | |
370 | a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; | |
371 | n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; | |
372 | t0=r15; | |
373 | ||
374 | ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; | |
375 | ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; | |
376 | ||
56c5f703 AP |
377 | .align 64 |
378 | .skip 48 // aligns loop body | |
4407700c AP |
379 | .local bn_mul_mont_8# |
380 | .proc bn_mul_mont_8# | |
4407700c AP |
381 | bn_mul_mont_8: |
382 | .prologue | |
383 | { .mmi; .save ar.pfs,prevfs | |
384 | alloc prevfs=ar.pfs,6,2,0,8 | |
385 | .vframe prevsp | |
386 | mov prevsp=sp | |
387 | .save ar.lc,prevlc | |
388 | mov prevlc=ar.lc } | |
389 | { .mmi; add r17=-6*16,sp | |
390 | add sp=-7*16,sp | |
391 | .save pr,prevpr | |
392 | mov prevpr=pr };; | |
393 | ||
394 | { .mmi; .save.gf 0,0x10 | |
395 | stf.spill [sp]=f16,-16 | |
396 | .save.gf 0,0x20 | |
397 | stf.spill [r17]=f17,32 | |
398 | add r16=-5*16,prevsp};; | |
399 | { .mmi; .save.gf 0,0x40 | |
400 | stf.spill [r16]=f18,32 | |
401 | .save.gf 0,0x80 | |
402 | stf.spill [r17]=f19,32 | |
403 | $ADDP aptr=0,in1 };; | |
404 | { .mmi; .save.gf 0,0x100 | |
405 | stf.spill [r16]=f20,32 | |
406 | .save.gf 0,0x200 | |
407 | stf.spill [r17]=f21,32 | |
408 | $ADDP r29=8,in1 };; | |
409 | { .mmi; .save.gf 0,0x400 | |
410 | stf.spill [r16]=f22 | |
411 | .save.gf 0,0x800 | |
412 | stf.spill [r17]=f23 | |
413 | $ADDP rptr=0,in0 };; | |
414 | \f | |
415 | .body | |
416 | .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] | |
417 | .rotr t[8] | |
418 | ||
419 | // load input vectors padding them to 8 elements | |
420 | { .mmi; ldf8 ai0=[aptr],16 // ap[0] | |
421 | ldf8 ai1=[r29],16 // ap[1] | |
422 | $ADDP bptr=0,in2 } | |
423 | { .mmi; $ADDP r30=8,in2 | |
424 | $ADDP nptr=0,in3 | |
425 | $ADDP r31=8,in3 };; | |
426 | { .mmi; ldf8 bj[7]=[bptr],16 // bp[0] | |
427 | ldf8 bj[6]=[r30],16 // bp[1] | |
428 | cmp4.le p4,p5=3,in5 } | |
429 | { .mmi; ldf8 ni0=[nptr],16 // np[0] | |
430 | ldf8 ni1=[r31],16 // np[1] | |
431 | cmp4.le p6,p7=4,in5 };; | |
432 | ||
433 | { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] | |
434 | (p5)fcvt.fxu ai2=f0 | |
435 | cmp4.le p8,p9=5,in5 } | |
436 | { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] | |
437 | (p7)fcvt.fxu ai3=f0 | |
438 | cmp4.le p10,p11=6,in5 } | |
439 | { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] | |
440 | (p5)fcvt.fxu bj[5]=f0 | |
441 | cmp4.le p12,p13=7,in5 } | |
442 | { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] | |
443 | (p7)fcvt.fxu bj[4]=f0 | |
444 | cmp4.le p14,p15=8,in5 } | |
445 | { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] | |
446 | (p5)fcvt.fxu ni2=f0 | |
447 | addp4 r28=-1,in5 } | |
448 | { .mfi; (p6)ldf8 ni3=[r31],16 // np[3] | |
449 | (p7)fcvt.fxu ni3=f0 | |
450 | $ADDP in4=0,in4 };; | |
451 | ||
452 | { .mfi; ldf8 n0=[in4] | |
453 | fcvt.fxu tf[1]=f0 | |
454 | nop.i 0 } | |
455 | ||
456 | { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] | |
457 | (p9)fcvt.fxu ai4=f0 | |
458 | mov t[0]=r0 } | |
459 | { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] | |
460 | (p11)fcvt.fxu ai5=f0 | |
461 | mov t[1]=r0 } | |
462 | { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] | |
463 | (p9)fcvt.fxu bj[3]=f0 | |
464 | mov t[2]=r0 } | |
465 | { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] | |
466 | (p11)fcvt.fxu bj[2]=f0 | |
467 | mov t[3]=r0 } | |
468 | { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] | |
469 | (p9)fcvt.fxu ni4=f0 | |
470 | mov t[4]=r0 } | |
471 | { .mfi; (p10)ldf8 ni5=[r31],16 // np[5] | |
472 | (p11)fcvt.fxu ni5=f0 | |
473 | mov t[5]=r0 };; | |
474 | ||
475 | { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] | |
476 | (p13)fcvt.fxu ai6=f0 | |
477 | mov t[6]=r0 } | |
478 | { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] | |
479 | (p15)fcvt.fxu ai7=f0 | |
480 | mov t[7]=r0 } | |
481 | { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] | |
482 | (p13)fcvt.fxu bj[1]=f0 | |
483 | mov ar.lc=r28 } | |
484 | { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] | |
485 | (p15)fcvt.fxu bj[0]=f0 | |
a000759a | 486 | mov ar.ec=1 } |
4407700c AP |
487 | { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] |
488 | (p13)fcvt.fxu ni6=f0 | |
489 | mov pr.rot=1<<16 } | |
490 | { .mfb; (p14)ldf8 ni7=[r31],16 // np[7] | |
491 | (p15)fcvt.fxu ni7=f0 | |
492 | brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 | |
493 | };; | |
494 | \f | |
a000759a AP |
495 | // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt |
496 | // to measure with help of Interval Time Counter indicated that the | |
4407700c AP |
497 | // factor is a tad higher: 33 or 34, if not 35. Exact measurement and |
498 | // addressing the issue is problematic, because I don't have access | |
499 | // to platform-specific instruction-level profiler. On Itanium it | |
a000759a | 500 | // should run in 56*n ticks, because of higher xma latency... |
4407700c AP |
501 | .Louter_8_ctop: |
502 | .pred.rel "mutex",p40,p42 | |
503 | .pred.rel "mutex",p48,p50 | |
504 | { .mfi; (p16) nop.m 0 // 0: | |
505 | (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] | |
506 | (p40) add a3=a3,n3 } // (p17) a3+=n3 | |
507 | { .mfi; (p42) add a3=a3,n3,1 | |
508 | (p16) xma.lu alo[0]=ai0,bj[7],tf[1] | |
509 | (p16) nop.i 0 };; | |
510 | { .mii; (p17) getf.sig a7=alo[8] // 1: | |
511 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | |
512 | (p50) add t[6]=t[6],a3,1 };; | |
513 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | |
514 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | |
515 | (p40) cmp.ltu p43,p41=a3,n3 } | |
516 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | |
517 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | |
518 | (p16) nop.i 0 };; | |
519 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | |
520 | (p48) cmp.ltu p51,p49=t[6],a3 | |
521 | (p50) cmp.leu p51,p49=t[6],a3 };; | |
522 | .pred.rel "mutex",p41,p43 | |
523 | .pred.rel "mutex",p49,p51 | |
524 | { .mfi; (p16) nop.m 0 // 4: | |
525 | (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] | |
526 | (p41) add a4=a4,n4 } // (p17) a4+=n4 | |
527 | { .mfi; (p43) add a4=a4,n4,1 | |
528 | (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] | |
529 | (p16) nop.i 0 };; | |
530 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | |
531 | (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 | |
532 | (p51) add t[5]=t[5],a4,1 };; | |
533 | { .mfi; (p16) nop.m 0 // 6: | |
534 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | |
535 | (p41) cmp.ltu p42,p40=a4,n4 } | |
536 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | |
537 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | |
538 | (p16) nop.i 0 };; | |
539 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | |
540 | (p49) cmp.ltu p50,p48=t[5],a4 | |
541 | (p51) cmp.leu p50,p48=t[5],a4 };; | |
542 | .pred.rel "mutex",p40,p42 | |
543 | .pred.rel "mutex",p48,p50 | |
544 | { .mfi; (p16) nop.m 0 // 8: | |
545 | (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] | |
546 | (p40) add a5=a5,n5 } // (p17) a5+=n5 | |
547 | { .mfi; (p42) add a5=a5,n5,1 | |
548 | (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] | |
549 | (p16) nop.i 0 };; | |
550 | { .mii; (p16) getf.sig a1=alo[1] // 9: | |
551 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | |
552 | (p50) add t[4]=t[4],a5,1 };; | |
553 | { .mfi; (p16) nop.m 0 // 10: | |
554 | (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 | |
555 | (p40) cmp.ltu p43,p41=a5,n5 } | |
556 | { .mfi; (p42) cmp.leu p43,p41=a5,n5 | |
557 | (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] | |
558 | (p16) nop.i 0 };; | |
559 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | |
560 | (p48) cmp.ltu p51,p49=t[4],a5 | |
561 | (p50) cmp.leu p51,p49=t[4],a5 };; | |
562 | .pred.rel "mutex",p41,p43 | |
563 | .pred.rel "mutex",p49,p51 | |
564 | { .mfi; (p17) getf.sig n8=nhi[8] // 12: | |
565 | (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] | |
566 | (p41) add a6=a6,n6 } // (p17) a6+=n6 | |
567 | { .mfi; (p43) add a6=a6,n6,1 | |
568 | (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] | |
569 | (p16) nop.i 0 };; | |
570 | { .mii; (p16) getf.sig a2=alo[2] // 13: | |
571 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | |
572 | (p51) add t[3]=t[3],a6,1 };; | |
573 | { .mfi; (p16) nop.m 0 // 14: | |
574 | (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 | |
575 | (p41) cmp.ltu p42,p40=a6,n6 } | |
576 | { .mfi; (p43) cmp.leu p42,p40=a6,n6 | |
577 | (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] | |
578 | (p16) nop.i 0 };; | |
579 | { .mii; (p16) nop.m 0 // 15: | |
580 | (p49) cmp.ltu p50,p48=t[3],a6 | |
581 | (p51) cmp.leu p50,p48=t[3],a6 };; | |
582 | .pred.rel "mutex",p40,p42 | |
583 | .pred.rel "mutex",p48,p50 | |
584 | { .mfi; (p16) nop.m 0 // 16: | |
585 | (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] | |
586 | (p40) add a7=a7,n7 } // (p17) a7+=n7 | |
587 | { .mfi; (p42) add a7=a7,n7,1 | |
588 | (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] | |
589 | (p16) nop.i 0 };; | |
590 | { .mii; (p16) getf.sig a3=alo[3] // 17: | |
591 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | |
592 | (p50) add t[2]=t[2],a7,1 };; | |
593 | { .mfi; (p16) nop.m 0 // 18: | |
594 | (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 | |
595 | (p40) cmp.ltu p43,p41=a7,n7 } | |
596 | { .mfi; (p42) cmp.leu p43,p41=a7,n7 | |
597 | (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] | |
598 | (p16) nop.i 0 };; | |
599 | { .mii; (p16) getf.sig n1=nlo[1] // 19: | |
600 | (p48) cmp.ltu p51,p49=t[2],a7 | |
601 | (p50) cmp.leu p51,p49=t[2],a7 };; | |
602 | .pred.rel "mutex",p41,p43 | |
603 | .pred.rel "mutex",p49,p51 | |
604 | { .mfi; (p16) nop.m 0 // 20: | |
605 | (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] | |
606 | (p41) add a8=a8,n8 } // (p17) a8+=n8 | |
607 | { .mfi; (p43) add a8=a8,n8,1 | |
608 | (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] | |
609 | (p16) nop.i 0 };; | |
610 | { .mii; (p16) getf.sig a4=alo[4] // 21: | |
611 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | |
612 | (p51) add t[1]=t[1],a8,1 };; | |
613 | { .mfi; (p16) nop.m 0 // 22: | |
614 | (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 | |
615 | (p41) cmp.ltu p42,p40=a8,n8 } | |
616 | { .mfi; (p43) cmp.leu p42,p40=a8,n8 | |
617 | (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] | |
618 | (p16) nop.i 0 };; | |
619 | { .mii; (p16) getf.sig n2=nlo[2] // 23: | |
620 | (p49) cmp.ltu p50,p48=t[1],a8 | |
621 | (p51) cmp.leu p50,p48=t[1],a8 };; | |
622 | { .mfi; (p16) nop.m 0 // 24: | |
623 | (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] | |
624 | (p16) add a1=a1,n1 } // (p16) a1+=n1 | |
625 | { .mfi; (p16) nop.m 0 | |
626 | (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] | |
627 | (p17) mov t[0]=r0 };; | |
628 | { .mii; (p16) getf.sig a5=alo[5] // 25: | |
629 | (p16) add t0=t[7],a1 // (p16) t[7]+=a1 | |
630 | (p42) add t[0]=t[0],r0,1 };; | |
631 | { .mfi; (p16) setf.sig tf[0]=t0 // 26: | |
632 | (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 | |
633 | (p50) add t[0]=t[0],r0,1 } | |
634 | { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 | |
635 | (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] | |
636 | (p16) nop.i 0 };; | |
637 | { .mii; (p16) getf.sig n3=nlo[3] // 27: | |
638 | (p16) cmp.ltu.unc p50,p48=t0,a1 | |
639 | (p16) nop.i 0 };; | |
640 | .pred.rel "mutex",p40,p42 | |
641 | .pred.rel "mutex",p48,p50 | |
642 | { .mfi; (p16) nop.m 0 // 28: | |
643 | (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] | |
644 | (p40) add a2=a2,n2 } // (p16) a2+=n2 | |
645 | { .mfi; (p42) add a2=a2,n2,1 | |
646 | (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] | |
647 | (p16) nop.i 0 };; | |
648 | { .mii; (p16) getf.sig a6=alo[6] // 29: | |
649 | (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 | |
650 | (p50) add t[6]=t[6],a2,1 };; | |
651 | { .mfi; (p16) nop.m 0 // 30: | |
652 | (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 | |
653 | (p40) cmp.ltu p41,p39=a2,n2 } | |
654 | { .mfi; (p42) cmp.leu p41,p39=a2,n2 | |
655 | (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] | |
656 | (p16) nop.i 0 };; | |
657 | { .mfi; (p16) getf.sig n4=nlo[4] // 31: | |
658 | (p16) nop.f 0 | |
659 | (p48) cmp.ltu p49,p47=t[6],a2 } | |
660 | { .mfb; (p50) cmp.leu p49,p47=t[6],a2 | |
661 | (p16) nop.f 0 | |
662 | br.ctop.sptk.many .Louter_8_ctop };; | |
663 | .Louter_8_cend: | |
664 | \f | |
a000759a AP |
665 | // above loop has to execute one more time, without (p16), which is |
666 | // replaced with merged move of np[8] to GPR bank | |
667 | .pred.rel "mutex",p40,p42 | |
668 | .pred.rel "mutex",p48,p50 | |
669 | { .mmi; (p0) getf.sig n1=ni0 // 0: | |
670 | (p40) add a3=a3,n3 // (p17) a3+=n3 | |
671 | (p42) add a3=a3,n3,1 };; | |
672 | { .mii; (p17) getf.sig a7=alo[8] // 1: | |
673 | (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 | |
674 | (p50) add t[6]=t[6],a3,1 };; | |
675 | { .mfi; (p17) getf.sig a8=ahi[8] // 2: | |
676 | (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 | |
677 | (p40) cmp.ltu p43,p41=a3,n3 } | |
678 | { .mfi; (p42) cmp.leu p43,p41=a3,n3 | |
679 | (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] | |
680 | (p0) nop.i 0 };; | |
681 | { .mii; (p17) getf.sig n5=nlo[6] // 3: | |
682 | (p48) cmp.ltu p51,p49=t[6],a3 | |
683 | (p50) cmp.leu p51,p49=t[6],a3 };; | |
684 | .pred.rel "mutex",p41,p43 | |
685 | .pred.rel "mutex",p49,p51 | |
686 | { .mmi; (p0) getf.sig n2=ni1 // 4: | |
687 | (p41) add a4=a4,n4 // (p17) a4+=n4 | |
688 | (p43) add a4=a4,n4,1 };; | |
689 | { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 | |
690 | (p0) nop.f 0 | |
691 | (p51) add t[5]=t[5],a4,1 };; | |
692 | { .mfi; (p0) getf.sig n3=ni2 // 6: | |
693 | (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 | |
694 | (p41) cmp.ltu p42,p40=a4,n4 } | |
695 | { .mfi; (p43) cmp.leu p42,p40=a4,n4 | |
696 | (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] | |
697 | (p0) nop.i 0 };; | |
698 | { .mii; (p17) getf.sig n6=nlo[7] // 7: | |
699 | (p49) cmp.ltu p50,p48=t[5],a4 | |
700 | (p51) cmp.leu p50,p48=t[5],a4 };; | |
701 | .pred.rel "mutex",p40,p42 | |
702 | .pred.rel "mutex",p48,p50 | |
703 | { .mii; (p0) getf.sig n4=ni3 // 8: | |
704 | (p40) add a5=a5,n5 // (p17) a5+=n5 | |
705 | (p42) add a5=a5,n5,1 };; | |
706 | { .mii; (p0) nop.m 0 // 9: | |
707 | (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 | |
708 | (p50) add t[4]=t[4],a5,1 };; | |
709 | { .mii; (p0) nop.m 0 // 10: | |
710 | (p40) cmp.ltu p43,p41=a5,n5 | |
711 | (p42) cmp.leu p43,p41=a5,n5 };; | |
712 | { .mii; (p17) getf.sig n7=nlo[8] // 11: | |
713 | (p48) cmp.ltu p51,p49=t[4],a5 | |
714 | (p50) cmp.leu p51,p49=t[4],a5 };; | |
715 | .pred.rel "mutex",p41,p43 | |
716 | .pred.rel "mutex",p49,p51 | |
717 | { .mii; (p17) getf.sig n8=nhi[8] // 12: | |
718 | (p41) add a6=a6,n6 // (p17) a6+=n6 | |
719 | (p43) add a6=a6,n6,1 };; | |
720 | { .mii; (p0) getf.sig n5=ni4 // 13: | |
721 | (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 | |
722 | (p51) add t[3]=t[3],a6,1 };; | |
723 | { .mii; (p0) nop.m 0 // 14: | |
724 | (p41) cmp.ltu p42,p40=a6,n6 | |
725 | (p43) cmp.leu p42,p40=a6,n6 };; | |
726 | { .mii; (p0) getf.sig n6=ni5 // 15: | |
727 | (p49) cmp.ltu p50,p48=t[3],a6 | |
728 | (p51) cmp.leu p50,p48=t[3],a6 };; | |
729 | .pred.rel "mutex",p40,p42 | |
730 | .pred.rel "mutex",p48,p50 | |
731 | { .mii; (p0) nop.m 0 // 16: | |
732 | (p40) add a7=a7,n7 // (p17) a7+=n7 | |
733 | (p42) add a7=a7,n7,1 };; | |
734 | { .mii; (p0) nop.m 0 // 17: | |
735 | (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 | |
736 | (p50) add t[2]=t[2],a7,1 };; | |
737 | { .mii; (p0) nop.m 0 // 18: | |
738 | (p40) cmp.ltu p43,p41=a7,n7 | |
739 | (p42) cmp.leu p43,p41=a7,n7 };; | |
740 | { .mii; (p0) getf.sig n7=ni6 // 19: | |
741 | (p48) cmp.ltu p51,p49=t[2],a7 | |
742 | (p50) cmp.leu p51,p49=t[2],a7 };; | |
743 | .pred.rel "mutex",p41,p43 | |
744 | .pred.rel "mutex",p49,p51 | |
745 | { .mii; (p0) nop.m 0 // 20: | |
746 | (p41) add a8=a8,n8 // (p17) a8+=n8 | |
747 | (p43) add a8=a8,n8,1 };; | |
748 | { .mmi; (p0) nop.m 0 // 21: | |
749 | (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 | |
750 | (p51) add t[1]=t[1],a8,1 } | |
751 | { .mmi; (p17) mov t[0]=r0 | |
752 | (p41) cmp.ltu p42,p40=a8,n8 | |
753 | (p43) cmp.leu p42,p40=a8,n8 };; | |
754 | { .mmi; (p0) getf.sig n8=ni7 // 22: | |
755 | (p49) cmp.ltu p50,p48=t[1],a8 | |
756 | (p51) cmp.leu p50,p48=t[1],a8 } | |
757 | { .mmi; (p42) add t[0]=t[0],r0,1 | |
758 | (p0) add r16=-7*16,prevsp | |
759 | (p0) add r17=-6*16,prevsp };; | |
760 | \f | |
761 | // subtract np[8] from carrybit|tmp[8] | |
4407700c | 762 | // carrybit|tmp[8] layout upon exit from above loop is: |
a000759a AP |
763 | // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) |
764 | { .mmi; (p50)add t[0]=t[0],r0,1 | |
765 | add r18=-5*16,prevsp | |
4407700c AP |
766 | sub n1=t0,n1 };; |
767 | { .mmi; cmp.gtu p34,p32=n1,t0;; | |
768 | .pred.rel "mutex",p32,p34 | |
a000759a AP |
769 | (p32)sub n2=t[7],n2 |
770 | (p34)sub n2=t[7],n2,1 };; | |
771 | { .mii; (p32)cmp.gtu p35,p33=n2,t[7] | |
772 | (p34)cmp.geu p35,p33=n2,t[7];; | |
4407700c | 773 | .pred.rel "mutex",p33,p35 |
a000759a AP |
774 | (p33)sub n3=t[6],n3 } |
775 | { .mmi; (p35)sub n3=t[6],n3,1;; | |
776 | (p33)cmp.gtu p34,p32=n3,t[6] | |
777 | (p35)cmp.geu p34,p32=n3,t[6] };; | |
4407700c | 778 | .pred.rel "mutex",p32,p34 |
a000759a AP |
779 | { .mii; (p32)sub n4=t[5],n4 |
780 | (p34)sub n4=t[5],n4,1;; | |
781 | (p32)cmp.gtu p35,p33=n4,t[5] } | |
782 | { .mmi; (p34)cmp.geu p35,p33=n4,t[5];; | |
4407700c | 783 | .pred.rel "mutex",p33,p35 |
a000759a AP |
784 | (p33)sub n5=t[4],n5 |
785 | (p35)sub n5=t[4],n5,1 };; | |
786 | { .mii; (p33)cmp.gtu p34,p32=n5,t[4] | |
787 | (p35)cmp.geu p34,p32=n5,t[4];; | |
4407700c | 788 | .pred.rel "mutex",p32,p34 |
a000759a AP |
789 | (p32)sub n6=t[3],n6 } |
790 | { .mmi; (p34)sub n6=t[3],n6,1;; | |
791 | (p32)cmp.gtu p35,p33=n6,t[3] | |
792 | (p34)cmp.geu p35,p33=n6,t[3] };; | |
4407700c | 793 | .pred.rel "mutex",p33,p35 |
a000759a AP |
794 | { .mii; (p33)sub n7=t[2],n7 |
795 | (p35)sub n7=t[2],n7,1;; | |
796 | (p33)cmp.gtu p34,p32=n7,t[2] } | |
797 | { .mmi; (p35)cmp.geu p34,p32=n7,t[2];; | |
4407700c | 798 | .pred.rel "mutex",p32,p34 |
a000759a AP |
799 | (p32)sub n8=t[1],n8 |
800 | (p34)sub n8=t[1],n8,1 };; | |
801 | { .mii; (p32)cmp.gtu p35,p33=n8,t[1] | |
802 | (p34)cmp.geu p35,p33=n8,t[1];; | |
4407700c | 803 | .pred.rel "mutex",p33,p35 |
a000759a AP |
804 | (p33)sub a8=t[0],r0 } |
805 | { .mmi; (p35)sub a8=t[0],r0,1;; | |
806 | (p33)cmp.gtu p34,p32=a8,t[0] | |
807 | (p35)cmp.geu p34,p32=a8,t[0] };; | |
4407700c AP |
808 | \f |
809 | // save the result, either tmp[num] or tmp[num]-np[num] | |
810 | .pred.rel "mutex",p32,p34 | |
811 | { .mmi; (p32)st8 [rptr]=n1,8 | |
812 | (p34)st8 [rptr]=t0,8 | |
813 | add r19=-4*16,prevsp};; | |
814 | { .mmb; (p32)st8 [rptr]=n2,8 | |
a000759a | 815 | (p34)st8 [rptr]=t[7],8 |
4407700c AP |
816 | (p5)br.cond.dpnt.few .Ldone };; |
817 | { .mmb; (p32)st8 [rptr]=n3,8 | |
a000759a | 818 | (p34)st8 [rptr]=t[6],8 |
4407700c AP |
819 | (p7)br.cond.dpnt.few .Ldone };; |
820 | { .mmb; (p32)st8 [rptr]=n4,8 | |
a000759a | 821 | (p34)st8 [rptr]=t[5],8 |
4407700c AP |
822 | (p9)br.cond.dpnt.few .Ldone };; |
823 | { .mmb; (p32)st8 [rptr]=n5,8 | |
a000759a | 824 | (p34)st8 [rptr]=t[4],8 |
4407700c AP |
825 | (p11)br.cond.dpnt.few .Ldone };; |
826 | { .mmb; (p32)st8 [rptr]=n6,8 | |
a000759a | 827 | (p34)st8 [rptr]=t[3],8 |
4407700c AP |
828 | (p13)br.cond.dpnt.few .Ldone };; |
829 | { .mmb; (p32)st8 [rptr]=n7,8 | |
a000759a | 830 | (p34)st8 [rptr]=t[2],8 |
4407700c AP |
831 | (p15)br.cond.dpnt.few .Ldone };; |
832 | { .mmb; (p32)st8 [rptr]=n8,8 | |
a000759a | 833 | (p34)st8 [rptr]=t[1],8 |
4407700c AP |
834 | nop.b 0 };; |
835 | .Ldone: // epilogue | |
836 | { .mmi; ldf.fill f16=[r16],64 | |
837 | ldf.fill f17=[r17],64 | |
838 | nop.i 0 } | |
839 | { .mmi; ldf.fill f18=[r18],64 | |
840 | ldf.fill f19=[r19],64 | |
56c5f703 | 841 | mov pr=prevpr,0x1ffff };; |
4407700c AP |
842 | { .mmi; ldf.fill f20=[r16] |
843 | ldf.fill f21=[r17] | |
844 | mov ar.lc=prevlc } | |
845 | { .mmi; ldf.fill f22=[r18] | |
846 | ldf.fill f23=[r19] | |
847 | mov ret0=1 } // signal "handled" | |
848 | { .mib; rum 1<<5 | |
849 | .restore sp | |
850 | mov sp=prevsp | |
851 | br.ret.sptk.many b0 };; | |
852 | .endp bn_mul_mont_8# | |
853 | ||
dacdcf3c AP |
854 | .type copyright#,\@object |
855 | copyright: | |
856 | stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" | |
857 | ___ | |
858 | ||
6bd7a4d9 | 859 | if ($output) { open(STDOUT,'>',$output) or die "can't open $output: $!"; } # three-arg open, checked: a failed open must not fall through to the inherited STDOUT |
dacdcf3c | 860 | print $code; # emit the generated assembly text |
a21314db | 861 | close STDOUT or die "error closing STDOUT: $!"; # close is checked so buffered write errors are reported |