]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/ia64.S
~15% better AES x86_64 assembler.
[thirdparty/openssl.git] / crypto / bn / asm / ia64.S
CommitLineData
52c0d300 1.explicit
4cb73bf8 2.text
e2f2a9af 3.ident "ia64.S, Version 2.1"
622d3d35 4.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
4cb73bf8
AP
5
6//
7// ====================================================================
8// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9// project.
10//
11// Rights for redistribution and usage in source and binary forms are
12// granted according to the OpenSSL license. Warranty of any kind is
13// disclaimed.
14// ====================================================================
15//
722d17cb
AP
16// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
17// different from Itanium to this module viewpoint. Most notably, is it
18// "wider" than Itanium? Can you experience loop scalability as
19// discussed in commentary sections? Not really:-( Itanium2 has 6
20// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22// ports is the same, i.e. 2, while I need 4. In other words, to this
23// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24// essentially different in respect to this module, and a re-tune was
25// required. Well, because some instruction latencies have changed. Most
26// noticeably those intensively used:
27//
28// Itanium Itanium2
29// ldf8 9 6 L2 hit
30// ld8 2 1 L1 hit
31// getf 2 5
32// xma[->getf] 7[+1] 4[+0]
33// add[->st8] 1[+1] 1[+0]
34//
35// What does it mean? You might ratiocinate that the original code
36// should run just faster... Because sum of latencies is smaller...
37// Wrong! Note that getf latency increased. This means that if a loop is
e2f2a9af 38// scheduled for lower latency (as they were), then it will suffer from
722d17cb
AP
39// stall condition and the code will therefore turn anti-scalable, e.g.
40// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43// for worst latency for every instruction aiming for best *all-round*
44// performance.
4cb73bf8
AP
45
46// Q. How much faster does it get?
47// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49// Linux 7.1 2.96-81):
50//
51// sign verify sign/s verify/s
52// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
53// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
54// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
55// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
56// sign verify sign/s verify/s
57// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
58// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
59//
60// And here is similar output but for this assembler
61// implementation:-)
62//
63// sign verify sign/s verify/s
64// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
65// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
66// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
67// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
68// sign verify sign/s verify/s
69// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
70// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
71//
72// Yes, you may argue that it's not fair comparison as it's
73// possible to craft the C implementation with BN_UMULT_HIGH
74// inline assembler macro. But of course! Here is the output
75// with the macro:
76//
77// sign verify sign/s verify/s
78// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
79// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
80// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
81// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
82// sign verify sign/s verify/s
83// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
84// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
85//
86// My code is still way faster, huh:-) And I believe that even
87// higher performance can be achieved. Note that as keys get
88// longer, performance gain is larger. Why? According to the
89// profiler there is another player in the field, namely
90// BN_from_montgomery consuming larger and larger portion of CPU
91// time as keysize decreases. I therefore consider putting effort
92// to assembler implementation of the following routine:
93//
94// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
95// {
96// int i,j;
97// BN_ULONG v;
98//
99// for (i=0; i<nl; i++)
100// {
101// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
102// nrp++;
103// rp++;
104// if (((nrp[-1]+=v)&BN_MASK2) < v)
105// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
106// }
107// }
108//
109// It might as well be beneficial to implement even combaX
110// variants, as it appears as it can literally unleash the
111// performance (see comment section to bn_mul_comba8 below).
112//
113// And finally for your reference the output for 0.9.6a compiled
114// with SGIcc version 0.01.0-12 (keep in mind that for the moment
115// of this writing it's not possible to convince SGIcc to use
116// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
117// i.e. for a compiler generated one:-):
118//
119// sign verify sign/s verify/s
120// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
121// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
122// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
123// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
124// sign verify sign/s verify/s
125// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
126// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
127//
128// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
129// system running Redhat Linux 7.1 (very special thanks to Ray
130// McCaffity of Williams Communications for providing an account).
131//
132// Q. What's the heck with 'rum 1<<5' at the end of every function?
133// A. Well, by clearing the "upper FP registers written" bit of the
134// User Mask I want to excuse the kernel from preserving upper
135// (f32-f127) FP register bank over process context switch, thus
136// minimizing bus bandwidth consumption during the switch (i.e.
137// after PKI operation completes and the program is off doing
138// something else like bulk symmetric encryption). Having said
139// this, I also want to point out that it might be good idea
140// to compile the whole toolkit (as well as majority of the
141// programs for that matter) with -mfixed-range=f32-f127 command
142// line option. No, it doesn't prevent the compiler from writing
143// to upper bank, but at least discourages it from doing so. If you don't
144// like the idea you have the option to compile the module with
145// -Drum=nop.m in command line.
146//
147
e2f2a9af
AP
148#if defined(_HPUX_SOURCE) && !defined(_LP64)
149#define ADDP addp4
150#else
151#define ADDP add
152#endif
153
4cb73bf8
AP
154#if 1
155//
156// bn_[add|sub]_words routines.
157//
158// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
159// data reside in L1 cache, i.e. 2 ticks away). It's possible to
160// compress the epilogue and get down to 2*n+6, but at the cost of
161// scalability (the neat feature of this implementation is that it
162// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
163// I consider that the epilogue is short enough as it is to trade tiny
164// performance loss on Itanium for scalability.
165//
166// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
167//
// bn_add_words: word-wise addition driven by a software-pipelined loop.
//
// Register use, as established by the code below:
//   r32-r35  incoming rp, ap, bp, num (alloc declares 4 inputs)
//   r14/r15/r16  working copies of rp/ap/bp (made with ADDP)
//   r10      num-1, loaded into ar.lc
//   r2       caller's ar.pfs (written by alloc)
//   r3, r9   saved ar.lc and pr; both are restored before returning
//   r8       return value: 0, bumped to 1 under (p59) after the loop
//
// num is tested with cmp4 (low 32 bits, signed); when it is <= 0 the
// routine returns 0 without touching memory.
// alloc requests 16 rotating registers, so r32 and up are overwritten as
// loop temporaries; presumably that is why the pointers are copied to
// r14-r16 before the loop starts.
168.global bn_add_words#
169.proc bn_add_words#
170.align 64
622d3d35 171.skip 32 // makes the loop body aligned at 64-byte boundary
4cb73bf8
AP
172bn_add_words:
173 .prologue
174 .fframe 0
175 .save ar.pfs,r2
622d3d35 176{ .mii; alloc r2=ar.pfs,4,12,0,16
4cb73bf8 177 cmp4.le p6,p0=r35,r0 };;
622d3d35 178{ .mfb; mov r8=r0 // return value
4cb73bf8
AP
179(p6) br.ret.spnt.many b0 };;
180
// Only reached with num > 0: r10 = num-1 is the value later put in ar.lc.
181 .save ar.lc,r3
622d3d35 182{ .mib; sub r10=r35,r0,1
4cb73bf8
AP
183 mov r3=ar.lc
184 brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
185 }
186 .body
e2f2a9af 187{ .mib; ADDP r14=0,r32 // rp
4cb73bf8 188 mov r9=pr };;
e2f2a9af 189{ .mii; ADDP r15=0,r33 // ap
4cb73bf8
AP
190 mov ar.lc=r10
191 mov ar.ec=6 }
e2f2a9af 192{ .mib; ADDP r16=0,r34 // bp
622d3d35 193 mov pr.rot=1<<16 };;
4cb73bf8 194
622d3d35
AP
// Pipelined loop body. Stage predicates: (p16) loads a and b, (p18) adds
// them, (p19) compares the sum against an operand to detect carry-out,
// (p21) stores the result word. The two (p58) instructions are the
// carry-in handling tagged "(p20)": add 1 to the sum and, through
// cmp.eq.or against -1, also flag a carry when the sum was all ones.
195.L_bn_add_words_ctop:
196{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
4cb73bf8
AP
197 (p18) add r39=r37,r34
198 (p19) cmp.ltu.unc p56,p0=r40,r38 }
622d3d35 199{ .mfb; (p0) nop.m 0x0
4cb73bf8
AP
200 (p0) nop.f 0x0
201 (p0) nop.b 0x0 }
622d3d35 202{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
4cb73bf8
AP
203 (p58) cmp.eq.or p57,p0=-1,r41 // (p20)
204 (p58) add r41=1,r41 } // (p20)
622d3d35 205{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
4cb73bf8
AP
206 (p0) nop.f 0x0
207 br.ctop.sptk .L_bn_add_words_ctop };;
208.L_bn_add_words_cend:
209
// Epilogue: fold the last carry into r8, then put back the caller's pr
// and ar.lc from r9/r3 and return.
622d3d35 210{ .mii;
4cb73bf8 211(p59) add r8=1,r8 // return value
46a0d4fb 212 mov pr=r9,0x1ffff
4cb73bf8 213 mov ar.lc=r3 }
622d3d35 214{ .mbb; nop.b 0x0
4cb73bf8
AP
215 br.ret.sptk.many b0 };;
216.endp bn_add_words#
217
218//
219// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
220//
// bn_sub_words: word-wise subtraction; the code has the same shape as
// bn_add_words with sub / cmp.gtu / add -1 in place of add / cmp.ltu /
// add 1.
//
// Register use, as established by the code below:
//   r32-r35  incoming rp, ap, bp, num (alloc declares 4 inputs)
//   r14/r15/r16  working copies of rp/ap/bp (made with ADDP)
//   r10      num-1, loaded into ar.lc
//   r2       caller's ar.pfs (written by alloc)
//   r3, r9   saved ar.lc and pr; both are restored before returning
//   r8       return value: 0, bumped to 1 under (p59) after the loop
//
// num is tested with cmp4 (low 32 bits, signed); when it is <= 0 the
// routine returns 0 without touching memory.
221.global bn_sub_words#
222.proc bn_sub_words#
223.align 64
622d3d35 224.skip 32 // makes the loop body aligned at 64-byte boundary
4cb73bf8
AP
225bn_sub_words:
226 .prologue
227 .fframe 0
228 .save ar.pfs,r2
622d3d35 229{ .mii; alloc r2=ar.pfs,4,12,0,16
4cb73bf8 230 cmp4.le p6,p0=r35,r0 };;
622d3d35 231{ .mfb; mov r8=r0 // return value
4cb73bf8
AP
232(p6) br.ret.spnt.many b0 };;
233
// Only reached with num > 0: r10 = num-1 is the value later put in ar.lc.
234 .save ar.lc,r3
622d3d35 235{ .mib; sub r10=r35,r0,1
4cb73bf8
AP
236 mov r3=ar.lc
237 brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
238 }
239 .body
e2f2a9af 240{ .mib; ADDP r14=0,r32 // rp
4cb73bf8 241 mov r9=pr };;
e2f2a9af 242{ .mii; ADDP r15=0,r33 // ap
4cb73bf8
AP
243 mov ar.lc=r10
244 mov ar.ec=6 }
e2f2a9af 245{ .mib; ADDP r16=0,r34 // bp
622d3d35 246 mov pr.rot=1<<16 };;
4cb73bf8 247
622d3d35
AP
// Pipelined loop body. Stage predicates: (p16) loads a and b, (p18)
// subtracts, (p19) flags a borrow when the difference compares greater
// (unsigned) than an operand, (p21) stores the result word. The two (p58)
// instructions are the borrow-in handling tagged "(p20)": subtract 1 and,
// through cmp.eq.or against 0, also flag a borrow when the difference
// was zero.
248.L_bn_sub_words_ctop:
249{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
4cb73bf8
AP
250 (p18) sub r39=r37,r34
251 (p19) cmp.gtu.unc p56,p0=r40,r38 }
622d3d35 252{ .mfb; (p0) nop.m 0x0
4cb73bf8
AP
253 (p0) nop.f 0x0
254 (p0) nop.b 0x0 }
622d3d35 255{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
4cb73bf8
AP
256 (p58) cmp.eq.or p57,p0=0,r41 // (p20)
257 (p58) add r41=-1,r41 } // (p20)
622d3d35 258{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
4cb73bf8
AP
259 (p0) nop.b 0x0
260 br.ctop.sptk .L_bn_sub_words_ctop };;
261.L_bn_sub_words_cend:
262
// Epilogue: fold the last borrow into r8, then put back the caller's pr
// and ar.lc from r9/r3 and return.
622d3d35 263{ .mii;
4cb73bf8 264(p59) add r8=1,r8 // return value
46a0d4fb 265 mov pr=r9,0x1ffff
4cb73bf8 266 mov ar.lc=r3 }
622d3d35 267{ .mbb; nop.b 0x0
4cb73bf8
AP
268 br.ret.sptk.many b0 };;
269.endp bn_sub_words#
270#endif
271
272#if 0
273#define XMA_TEMPTATION
274#endif
275
276#if 1
277//
278// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
279//
// bn_mul_words: multiplies each word read through ap by w and stores the
// result words through rp.
//
// Common to both build variants:
//   r32-r35  incoming rp, ap, num, w (alloc declares 4 inputs)
//   f8       w, transferred with setf.sig
//   r10      num-1, loaded into ar.lc
//   r2       caller's ar.pfs (written by alloc)
//   r3, r9   saved ar.lc and pr; both are restored before returning
//   r8       return value; 0 when num (cmp4: low 32 bits, signed) is <= 0
// Before returning, "rum 1<<5" clears um.mfh (see the Q&A in the file
// header).
//
// The preprocessor symbol XMA_TEMPTATION selects between two loops that
// share the same labels:
//   - not defined: the integer-ALU carry loop (ar.ec=13), which works on
//     r14/r15 copies of rp/ap because alloc makes r32 and up rotate;
//   - defined: the short xma-based loop (ar.ec=5), which uses r32/r33
//     directly (that alloc requests no rotating general registers).
280.global bn_mul_words#
281.proc bn_mul_words#
282.align 64
622d3d35 283.skip 32 // makes the loop body aligned at 64-byte boundary
4cb73bf8
AP
284bn_mul_words:
285 .prologue
286 .fframe 0
287 .save ar.pfs,r2
288#ifdef XMA_TEMPTATION
622d3d35 289{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
4cb73bf8 290#else
722d17cb 291{ .mfi; alloc r2=ar.pfs,4,12,0,16 };;
4cb73bf8 292#endif
622d3d35 293{ .mib; mov r8=r0 // return value
4cb73bf8
AP
294 cmp4.le p6,p0=r34,r0
295(p6) br.ret.spnt.many b0 };;
296
// Only reached with num > 0: r10 = num-1 is the value later put in ar.lc.
297 .save ar.lc,r3
622d3d35 298{ .mii; sub r10=r34,r0,1
4cb73bf8
AP
299 mov r3=ar.lc
300 mov r9=pr };;
301
302 .body
622d3d35 303{ .mib; setf.sig f8=r35 // w
722d17cb
AP
304 mov pr.rot=0x800001<<16
305 // ------^----- serves as (p50) at first (p27)
4cb73bf8
AP
306 brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
307 }
308
309#ifndef XMA_TEMPTATION
310
e2f2a9af
AP
311{ .mmi; ADDP r14=0,r32 // rp
312 ADDP r15=0,r33 // ap
4cb73bf8 313 mov ar.lc=r10 }
e2f2a9af 314{ .mmi; mov r40=0 // serves as r35 at first (p27)
722d17cb 315 mov ar.ec=13 };;
4cb73bf8 316
722d17cb
AP
317// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
318// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
4cb73bf8 319// bypass L1 cache and L2 latency is actually best-case scenario for
722d17cb
AP
320// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
321// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
4cb73bf8
AP
322// would give us ~5% in *overall* performance improvement on "wider"
323// IA-64, but would hurt Itanium for about same because of longer
324// epilogue. As it's a matter of few percents in either case I've
325// chosen to trade the scalability for development time (you can see
326// this very instruction sequence in bn_mul_add_words loop which in
327// turn is scalable).
// Stages: (p16) ldf8 loads a word, (p21) xmpy.lu/xmpy.hu form the low and
// high product halves, (p25) getf.sig moves both halves to integer
// registers, the mutually exclusive (p50)/(p54) adds combine two of those
// words with or without a carry of 1, and (p28) compares for the next
// carry and stores the result word.
622d3d35 328.L_bn_mul_words_ctop:
722d17cb
AP
329{ .mfi; (p25) getf.sig r36=f52 // low
330 (p21) xmpy.lu f48=f37,f8
331 (p28) cmp.ltu p54,p50=r41,r39 }
622d3d35 332{ .mfi; (p16) ldf8 f32=[r15],8
722d17cb 333 (p21) xmpy.hu f40=f37,f8
4cb73bf8 334 (p0) nop.i 0x0 };;
722d17cb
AP
335{ .mii; (p25) getf.sig r32=f44 // high
336 .pred.rel "mutex",p50,p54
337 (p50) add r40=r38,r35 // (p27)
338 (p54) add r40=r38,r35,1 } // (p27)
339{ .mfb; (p28) st8 [r14]=r41,8
4cb73bf8
AP
340 (p0) nop.f 0x0
341 br.ctop.sptk .L_bn_mul_words_ctop };;
342.L_bn_mul_words_cend:
343
// Return value: r36 (a rotating loop register by now), plus 1 when the
// last carry predicate (p55) is set.
622d3d35 344{ .mii; nop.m 0x0
722d17cb
AP
345.pred.rel "mutex",p51,p55
346(p51) add r8=r36,r0
347(p55) add r8=r36,r0,1 }
622d3d35 348{ .mfb; nop.m 0x0
4cb73bf8
AP
349 nop.f 0x0
350 nop.b 0x0 }
351
352#else // XMA_TEMPTATION
353
354 setf.sig f37=r0 // serves as carry at (p18) tick
355 mov ar.lc=r10
622d3d35 356 mov ar.ec=5;;
4cb73bf8
AP
357
358// Most of you examining this code very likely wonder why in the name
359// of Intel the following loop is commented out? Indeed, it looks so
360// neat that you find it hard to believe that it's something wrong
361// with it, right? The catch is that every iteration depends on the
362// result from previous one and the latter isn't available instantly.
363// The loop therefore spins at the latency of xma minus 1, or in other
364// words at 6*(n+4) ticks:-( Compare to the "production" loop above
365// that runs in 2*(n+12) where the low latency problem is worked around
366// by moving the dependency to one-tick latent integer ALU. Note that
367// "distance" between ldf8 and xma is not latency of ldf8, but the
368// *difference* between xma and ldf8 latencies.
622d3d35
AP
369.L_bn_mul_words_ctop:
370{ .mfi; (p16) ldf8 f32=[r33],8
4cb73bf8 371 (p18) xma.hu f38=f34,f8,f39 }
622d3d35 372{ .mfb; (p20) stf8 [r32]=f37,8
4cb73bf8
AP
373 (p18) xma.lu f35=f34,f8,f39
374 br.ctop.sptk .L_bn_mul_words_ctop };;
375.L_bn_mul_words_cend:
376
377 getf.sig r8=f41 // the return value
378
379#endif // XMA_TEMPTATION
380
// Shared exit path: restore pr and ar.lc from r9/r3, clear um.mfh, return.
622d3d35 381{ .mii; nop.m 0x0
46a0d4fb 382 mov pr=r9,0x1ffff
4cb73bf8 383 mov ar.lc=r3 }
622d3d35 384{ .mfb; rum 1<<5 // clear um.mfh
4cb73bf8
AP
385 nop.f 0x0
386 br.ret.sptk.many b0 };;
387.endp bn_mul_words#
388#endif
389
390#if 1
391//
392// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
393//
// bn_mul_add_words: for each word, multiplies the word read through ap by
// w, adds the word currently at rp (folded into the multiply via xma) and
// stores the result back through rp.
//
// Register use, as established by the code below:
//   r32-r35  incoming rp, ap, num, w (alloc declares 4 inputs)
//   f8       w, transferred with setf.sig
//   r14      rp copy used by the st8 stores
//   r16      second rp copy used by the ldf8 loads of the old rp words
//   r15      ap copy used by the ldf8 loads
//   r10      num-1, loaded into ar.lc
//   r2       caller's ar.pfs (written by alloc)
//   r3, r9   saved ar.lc and pr; both are restored before returning
//   r8       return value; 0 when num (cmp4: low 32 bits, signed) is <= 0
// alloc requests 8 rotating registers, so r32-r39 are loop temporaries
// once the loop runs; the r35 read after the loop is such a temporary,
// not the w argument.
// Before returning, "rum 1<<5" clears um.mfh (see the Q&A in the file
// header).
394.global bn_mul_add_words#
395.proc bn_mul_add_words#
396.align 64
e2f2a9af 397.skip 48 // makes the loop body aligned at 64-byte boundary
4cb73bf8
AP
398bn_mul_add_words:
399 .prologue
400 .fframe 0
401 .save ar.pfs,r2
4cb73bf8 402 .save ar.lc,r3
e2f2a9af
AP
403 .save pr,r9
404{ .mmi; alloc r2=ar.pfs,4,4,0,8
405 cmp4.le p6,p0=r34,r0
406 mov r3=ar.lc };;
407{ .mib; mov r8=r0 // return value
408 sub r10=r34,r0,1
409(p6) br.ret.spnt.many b0 };;
4cb73bf8
AP
410
411 .body
e2f2a9af
AP
412{ .mib; setf.sig f8=r35 // w
413 mov r9=pr
4cb73bf8
AP
414 brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
415 }
e2f2a9af
AP
416{ .mmi; ADDP r14=0,r32 // rp
417 ADDP r15=0,r33 // ap
4cb73bf8 418 mov ar.lc=r10 }
e2f2a9af
AP
419{ .mii; ADDP r16=0,r32 // rp copy
420 mov pr.rot=0x2001<<16
421 // ------^----- serves as (p40) at first (p27)
422 mov ar.ec=11 };;
423
424// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
425// Itanium 2. Yes, unlike previous versions it scales:-) Previous
426// version was performing *all* additions in IALU and was starving
427// for those even on Itanium 2. In this version one addition is
428// moved to FPU and is folded with multiplication. This is at cost
429// of propagating the result from previous call to this subroutine
430// to L2 cache... In other words negligible even for shorter keys.
431// *Overall* performance improvement [over previous version] varies
432// from 11 to 22 percent depending on key length.
// Stages: (p16) loads the ap word and the old rp word, (p20) xma.lu/xma.hu
// compute a*w + old rp word (low/high halves), (p23)/(p24) getf.sig move
// the halves to integer registers, the mutually exclusive (p40)/(p42)
// pairs add with carry 0 or 1 and compute the next carry (cmp.ltu for
// carry 0, cmp.leu for carry 1), and (p26) stores the result word.
622d3d35 433.L_bn_mul_add_words_ctop:
e2f2a9af
AP
434.pred.rel "mutex",p40,p42
435{ .mfi; (p23) getf.sig r36=f45 // low
436 (p20) xma.lu f42=f36,f8,f50 // low
437 (p40) add r39=r39,r35 } // (p27)
438{ .mfi; (p16) ldf8 f32=[r15],8 // *(ap++)
439 (p20) xma.hu f36=f36,f8,f50 // high
440 (p42) add r39=r39,r35,1 };; // (p27)
441{ .mmi; (p24) getf.sig r32=f40 // high
442 (p16) ldf8 f46=[r16],8 // *(rp1++)
443 (p40) cmp.ltu p41,p39=r39,r35 } // (p27)
444{ .mib; (p26) st8 [r14]=r39,8 // *(rp2++)
445 (p42) cmp.leu p41,p39=r39,r35 // (p27)
4cb73bf8
AP
446 br.ctop.sptk .L_bn_mul_add_words_ctop};;
447.L_bn_mul_add_words_cend:
448
// Return value: r35 (rotating temporary), plus 1 when (p42) is set; then
// restore pr and ar.lc from r9/r3, clear um.mfh and return.
e2f2a9af
AP
449{ .mmi; .pred.rel "mutex",p40,p42
450(p40) add r8=r35,r0
451(p42) add r8=r35,r0,1
452 mov pr=r9,0x1ffff }
453{ .mib; rum 1<<5 // clear um.mfh
454 mov ar.lc=r3
4cb73bf8
AP
455 br.ret.sptk.many b0 };;
456.endp bn_mul_add_words#
457#endif
458
459#if 1
460//
461// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
462//
// bn_sqr_words: squares each word read through ap and writes the two
// halves of every product to consecutive words of rp.
//
// Register use, as established by the code below:
//   r32-r34  incoming rp, ap, num (alloc declares 3 inputs and no
//            rotating general registers; the loop rotates only FP
//            registers and predicates)
//   r32      rp after ADDP; store pointer for the xmpy.lu (low) halves,
//            advanced by 16 per store
//   r34      reused after num has been consumed: rp+8, store pointer for
//            the xmpy.hu (high) halves, advanced by 16 per store
//   r33      ap after ADDP; load pointer, advanced by 8 per load
//   r10      num-1, loaded into ar.lc
//   r2       caller's ar.pfs (written by alloc)
//   r3, r9   saved ar.lc and pr; both are restored before returning
// num is sign-extended from 32 bits (sxt4) and the routine returns at once
// when it is <= 0. r8 is zeroed even though the C prototype is void.
// Before returning, "rum 1<<5" clears um.mfh (see the Q&A in the file
// header).
463.global bn_sqr_words#
464.proc bn_sqr_words#
465.align 64
622d3d35 466.skip 32 // makes the loop body aligned at 64-byte boundary
4cb73bf8
AP
467bn_sqr_words:
468 .prologue
469 .fframe 0
470 .save ar.pfs,r2
622d3d35 471{ .mii; alloc r2=ar.pfs,3,0,0,0
4cb73bf8 472 sxt4 r34=r34 };;
622d3d35 473{ .mii; cmp.le p6,p0=r34,r0
4cb73bf8 474 mov r8=r0 } // return value
e2f2a9af
AP
475{ .mfb; ADDP r32=0,r32
476 nop.f 0x0
4cb73bf8
AP
477(p6) br.ret.spnt.many b0 };;
478
// Only reached with num > 0: r10 = num-1 is the value later put in ar.lc.
479 .save ar.lc,r3
622d3d35 480{ .mii; sub r10=r34,r0,1
4cb73bf8
AP
481 mov r3=ar.lc
482 mov r9=pr };;
483
484 .body
e2f2a9af 485{ .mib; ADDP r33=0,r33
4cb73bf8
AP
486 mov pr.rot=1<<16
487 brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
488 }
// r34 no longer holds num from here on; it becomes the second store pointer.
622d3d35 489{ .mii; add r34=8,r32
4cb73bf8 490 mov ar.lc=r10
622d3d35 491 mov ar.ec=18 };;
4cb73bf8
AP
492
493// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
494// possible to compress the epilogue (I'm getting tired to write this
495// comment over and over) and get down to 2*n+16 at the cost of
496// scalability. The decision will very likely be reconsidered after the
497// benchmark program is profiled. I.e. if performance gain on Itanium
498// will appear larger than loss on "wider" IA-64, then the loop should
499// be explicitly split and the epilogue compressed.
// Stages: (p16) loads a word, (p25) forms both product halves of the
// square, (p33) stores them (f42->f50 and f52->f60 after 8 rotations).
622d3d35
AP
500.L_bn_sqr_words_ctop:
501{ .mfi; (p16) ldf8 f32=[r33],8
4cb73bf8
AP
502 (p25) xmpy.lu f42=f41,f41
503 (p0) nop.i 0x0 }
622d3d35 504{ .mib; (p33) stf8 [r32]=f50,16
4cb73bf8
AP
505 (p0) nop.i 0x0
506 (p0) nop.b 0x0 }
622d3d35 507{ .mfi; (p0) nop.m 0x0
4cb73bf8
AP
508 (p25) xmpy.hu f52=f41,f41
509 (p0) nop.i 0x0 }
622d3d35 510{ .mib; (p33) stf8 [r34]=f60,16
4cb73bf8
AP
511 (p0) nop.i 0x0
512 br.ctop.sptk .L_bn_sqr_words_ctop };;
513.L_bn_sqr_words_cend:
514
// Exit path: restore pr and ar.lc from r9/r3, clear um.mfh, return.
622d3d35 515{ .mii; nop.m 0x0
46a0d4fb 516 mov pr=r9,0x1ffff
4cb73bf8 517 mov ar.lc=r3 }
622d3d35 518{ .mfb; rum 1<<5 // clear um.mfh
4cb73bf8
AP
519 nop.f 0x0
520 br.ret.sptk.many b0 };;
521.endp bn_sqr_words#
522#endif
523
524#if 1
a95541d6 525// Apparently we win nothing by implementing special bn_sqr_comba8.
4cb73bf8
AP
526// Yes, it is possible to reduce the number of multiplications by
527// almost factor of two, but then the amount of additions would
528// increase by factor of two (as we would have to perform those
529// otherwise performed by xma ourselves). Normally we would trade
530// anyway as multiplications are way more expensive, but not this
531// time... Multiplication kernel is fully pipelined and as we drain
532// one 128-bit multiplication result per clock cycle multiplications
533// are effectively as inexpensive as additions. Special implementation
534// might become of interest for "wider" IA-64 implementation as you'll
535// be able to get through the multiplication phase faster (there won't
536// be any stall issues as discussed in the commentary section below and
537// you therefore will be able to employ all 4 FP units)... But these
538// Itanium days it's simply too hard to justify the effort so I just
539// drop down to bn_mul_comba8 code:-)
540//
541// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
542//
// bn_sqr_comba8: thin entry stub. It performs no arithmetic itself: it
// duplicates the a pointer, precomputes a few word addresses and branches
// to .L_cheat_entry_point8. That label is not defined in this part of the
// file; per the commentary preceding this routine it presumably lies
// inside bn_mul_comba8 - confirm against the rest of the file.
//
// Registers set up before the branch:
//   r32      r (converted with addp4 on 32-bit HP-UX builds)
//   r33      a (converted with addp4 on 32-bit HP-UX builds)
//   r34      copy of a, held in the single local requested by alloc
//   r14/r15/r16  a+8, a+16, a+24
//   r17/r18  r34+8, r34+16
//   r2       caller's ar.pfs (written by alloc)
543.global bn_sqr_comba8#
544.proc bn_sqr_comba8#
545.align 64
546bn_sqr_comba8:
547 .prologue
548 .fframe 0
549 .save ar.pfs,r2
// On 32-bit HP-UX the two pointer arguments are widened in the alloc
// bundle and a fresh bundle is opened for the instructions shared with
// the #else branch.
e2f2a9af 550#if defined(_HPUX_SOURCE) && !defined(_LP64)
622d3d35 551{ .mii; alloc r2=ar.pfs,2,1,0,0
46a0d4fb
AP
552 addp4 r33=0,r33
553 addp4 r32=0,r32 };;
554{ .mii;
555#else
556{ .mii; alloc r2=ar.pfs,2,1,0,0
557#endif
4cb73bf8
AP
558 mov r34=r33
559 add r14=8,r33 };;
560 .body
622d3d35 561{ .mii; add r17=8,r34
4cb73bf8
AP
562 add r15=16,r33
563 add r18=16,r34 }
// The branch below leaves this .proc and does not come back here.
622d3d35 564{ .mfb; add r16=24,r33
4cb73bf8 565 br .L_cheat_entry_point8 };;
622d3d35 566.endp bn_sqr_comba8#
4cb73bf8
AP
567#endif
568
569#if 1
570// I've estimated this routine to run in ~120 ticks, but in reality
571// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
572// cycles consumed for instructions fetch? Or did I misinterpret some
573