.explicit
.text
.ident	"ia64.S, Version 2.1"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Version 2.x is an Itanium2 re-tune. A few words about how Itanium2
// differs from Itanium from this module's viewpoint. Most notably: is
// it "wider" than Itanium, i.e. does it deliver the loop scalability
// discussed in the commentary sections? Not really:-( Itanium2 has 6
// integer ALU ports, i.e. it's 2 ports wider, but that's not enough to
// spin twice as fast, as I would need 8 IALU ports. The number of
// floating point ports is the same, i.e. 2, while I would need 4. In
// other words, to this module Itanium2 remains effectively as "wide"
// as Itanium. Yet it's essentially different in respect to this module,
// and a re-tune was required, because some instruction latencies have
// changed. Most noticeable are those of the intensively used
// instructions:
//
//			Itanium	Itanium2
//	ldf8		9	6	L2 hit
//	ld8		2	1	L1 hit
//	getf		2	5
//	xma[->getf]	7[+1]	4[+0]
//	add[->st8]	1[+1]	1[+0]
//
// What does it mean? You might ratiocinate that the original code
// should run just faster... Because the sum of latencies is smaller...
// Wrong! Note that the getf latency increased. This means that if a
// loop is scheduled for the lower latency (as the original loops were),
// then it will suffer from a stall condition and the code will
// therefore turn anti-scalable, e.g. the original bn_mul_words spun at
// 5*n, or 2.5 times slower than expected, on Itanium2! What to do?
// Reschedule the loops for Itanium2? But then Itanium would exhibit
// anti-scalability. So I've chosen to reschedule for the worst latency
// of every instruction, aiming for the best *all-round* performance.

// Q.	How much faster does it get?
// A.	Here is the output from 'openssl speed rsa dsa' for vanilla
//	0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
//	Linux 7.1 2.96-81):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
//	rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
//	rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
//	rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0035s   0.0043s    288.3    234.8
//	dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
//	And here is similar output, but for this assembler
//	implementation:-)
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
//	rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
//	rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
//	rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0012s   0.0013s    891.9    756.6
//	dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//
//	Yes, you may argue that it's not a fair comparison, as it's
//	possible to craft the C implementation with the BN_UMULT_HIGH
//	inline assembler macro. But of course! Here is the output
//	with the macro:
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
//	rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
//	rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
//	rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0016s   0.0020s    613.1    510.5
//	dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
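//	For reference, such a BN_UMULT_HIGH macro for IA-64 under GNU C
//	can be written roughly as follows (a sketch, not necessarily the
//	exact definition shipped in the toolkit's bn headers):
//
//	#if defined(__GNUC__) && (defined(__ia64) || defined(__ia64__))
//	# define BN_UMULT_HIGH(a,b)	({		\
//		register BN_ULONG ret;			\
//		asm ("xmpy.hu %0 = %1, %2"		\
//		     : "=f" (ret)			\
//		     : "f" (a), "f" (b));		\
//		ret;  })
//	#endif
//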
//	My code is still way faster, huh:-) And I believe that even
//	higher performance can be achieved. Note that as keys get
//	longer, the performance gain gets larger. Why? According to the
//	profiler there is another player in the field, namely
//	BN_from_montgomery, consuming a larger and larger portion of CPU
//	time as the key size decreases. I therefore consider putting
//	effort into an assembler implementation of the following routine:
//
//	void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
//	{
//	int i,j;
//	BN_ULONG v;
//	BN_ULONG *nrp=rp+nl;	/* the carry out of bn_mul_add_words lands at rp[nl] */
//
//	for (i=0; i<nl; i++)
//		{
//		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//		nrp++;
//		rp++;
//		if (((nrp[-1]+=v)&BN_MASK2) < v)
//			for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
//		}
//	}
//
//	It might as well be beneficial to implement even the combaX
//	variants, as it appears they could literally unleash the
//	performance (see the comment section to bn_mul_comba8 below).
//
//	And finally, for your reference, the output for 0.9.6a compiled
//	with SGIcc version 0.01.0-12 (keep in mind that at the moment
//	of this writing it's not possible to convince SGIcc to use the
//	BN_UMULT_HIGH inline assembler macro, yet the code is fast,
//	i.e. for a compiler-generated one:-):
//
//	                  sign    verify    sign/s verify/s
//	rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
//	rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
//	rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
//	rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//	                  sign    verify    sign/s verify/s
//	dsa  512 bits   0.0018s   0.0022s    547.3    459.6
//	dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
//	Oh! Benchmarks were performed on a 733MHz Lion-class Itanium
//	system running Red Hat Linux 7.1 (very special thanks to Ray
//	McCaffity of Williams Communications for providing an account).
//
// Q.	What the heck is with 'rum 1<<5' at the end of every function?
// A.	Well, by clearing the "upper FP registers written" bit of the
//	User Mask I want to excuse the kernel from preserving the upper
//	(f32-f128) FP register bank over a process context switch, thus
//	minimizing bus bandwidth consumption during the switch (i.e.
//	after a PKI operation completes and the program is off doing
//	something else, like bulk symmetric encryption). Having said
//	this, I also want to point out that it might be a good idea
//	to compile the whole toolkit (as well as the majority of
//	programs, for that matter) with the -mfixed-range=f32-f127
//	command line option. No, it doesn't prevent the compiler from
//	writing to the upper bank, but it at least discourages it from
//	doing so. If you don't like the idea, you have the option to
//	compile the module with -Drum=nop.m on the command line.
//
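//	For example (hypothetical command lines; adapt the file names
//	and the rest of the flags to your own build setup):
//
//		gcc -mfixed-range=f32-f127 -c bn_exp.c	# discourage f32-f127 use
//		gcc -Drum=nop.m -c ia64.S		# turn every 'rum' into nop.m
//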

#if defined(_HPUX_SOURCE) && !defined(_LP64)
#define	ADDP	addp4
#else
#define	ADDP	add
#endif

#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider that the epilogue is short enough as it is to trade a
// tiny performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int num)
//
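// In C terms bn_add_words computes the following (a sketch of the
// generic word-by-word addition with carry, not the exact portable
// code from bn_asm.c):
//
//	BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int num)
//	{
//	BN_ULONG c=0;			/* carry, always 0 or 1	*/
//	int i;
//
//	for (i=0; i<num; i++)
//		{
//		BN_ULONG t=(ap[i]+c)&BN_MASK2;
//		c =(t<c);		/* carry out of a+carry	*/
//		t =(t+bp[i])&BN_MASK2;
//		c+=(t<bp[i]);		/* carry out of the sum	*/
//		rp[i]=t;
//		}
//	return c;
//	}
//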
.global	bn_add_words#
.proc	bn_add_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	sub		r10=r35,r0,1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
	}
	.body
{ .mib;	ADDP		r14=0,r32		// rp
	mov		r9=pr		};;
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;

.L_bn_add_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	add		r39=r37,r34
	(p19)	cmp.ltu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=-1,r41	  // (p20)
	(p58)	add		r41=1,r41	} // (p20)
{ .mfb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_add_words_ctop	};;
.L_bn_add_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_add_words#

//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int num)
//
.global	bn_sub_words#
.proc	bn_sub_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sub_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,4,12,0,16
	cmp4.le		p6,p0=r35,r0	};;
{ .mfb;	mov		r8=r0			// return value
(p6)	br.ret.spnt.many	b0	};;

{ .mib;	sub		r10=r35,r0,1
	.save	ar.lc,r3
	mov		r3=ar.lc
	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
	}
	.body
{ .mib;	ADDP		r14=0,r32		// rp
	mov		r9=pr		};;
{ .mii;	ADDP		r15=0,r33		// ap
	mov		ar.lc=r10
	mov		ar.ec=6		}
{ .mib;	ADDP		r16=0,r34		// bp
	mov		pr.rot=1<<16	};;

.L_bn_sub_words_ctop:
{ .mii;	(p16)	ld8		r32=[r16],8	  // b=*(bp++)
	(p18)	sub		r39=r37,r34
	(p19)	cmp.gtu.unc	p56,p0=r40,r38	}
{ .mfb;	(p0)	nop.m		0x0
	(p0)	nop.f		0x0
	(p0)	nop.b		0x0		}
{ .mii;	(p16)	ld8		r35=[r15],8	  // a=*(ap++)
	(p58)	cmp.eq.or	p57,p0=0,r41	  // (p20)
	(p58)	add		r41=-1,r41	} // (p20)
{ .mbb;	(p21)	st8		[r14]=r42,8	  // *(rp++)=r
	(p0)	nop.b		0x0
	br.ctop.sptk	.L_bn_sub_words_ctop	};;
.L_bn_sub_words_cend:

{ .mii;
(p59)	add		r8=1,r8		// return value
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mbb;	nop.b		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sub_words#
#endif

#if 0
#define XMA_TEMPTATION
#endif

#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
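// In C terms the routine computes the following (a sketch; BN_UMULT_HIGH
// stands for the 64x64->high-64-bit multiply discussed above):
//
//	BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//	{
//	BN_ULONG carry=0;
//	int i;
//
//	for (i=0; i<num; i++)
//		{
//		BN_ULONG lo=(ap[i]*w)&BN_MASK2;
//		BN_ULONG hi=BN_UMULT_HIGH(ap[i],w);
//		lo=(lo+carry)&BN_MASK2;
//		carry=hi+(lo<carry);	/* cannot overflow, hi<=2^64-2	*/
//		rp[i]=lo;
//		}
//	return carry;
//	}
//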
.global	bn_mul_words#
.proc	bn_mul_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_mul_words:
	.prologue
	.save	ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0
(p6)	br.ret.spnt.many	b0	};;

{ .mii;	sub	r10=r34,r0,1
	.save	ar.lc,r3
	mov	r3=ar.lc
	mov	r9=pr		};;

	.body
{ .mib;	setf.sig	f8=r35	// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
	}

#ifndef XMA_TEMPTATION

{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in the
// Itanium L2 cache (i.e. 9 ticks away), as floating point load/store
// instructions bypass the L1 cache and the L2 latency is actually the
// best-case scenario for ldf8. The loop is not scalable and shall run
// in 2*(n+12) even on "wider" IA-64 implementations. It's a trade-off
// here. An n+24 loop would give us ~5% *overall* performance
// improvement on "wider" IA-64, but would hurt Itanium by about the
// same amount because of the longer epilogue. As it's a matter of a
// few percent either way, I've chosen to trade the scalability for
// development time (you can see this very instruction sequence in the
// bn_mul_add_words loop, which in turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44			// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35		// (p27)
	(p54)	add		r40=r38,r35,1	}	// (p27)
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0
(p55)	add		r8=r36,r0,1	}
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0	}

#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is disabled? Indeed, it looks so neat
// that you find it hard to believe that there's something wrong with
// it, right? The catch is that every iteration depends on the result
// of the previous one, and the latter isn't available instantly. The
// loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above,
// which runs in 2*(n+11), where the latency problem is worked around
// by moving the dependency to the one-tick latent integer ALU. Note
// that the "distance" between ldf8 and xma is not the latency of ldf8,
// but the *difference* between the xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
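// In C terms the routine computes the following (again a sketch, with
// BN_UMULT_HIGH standing for the 64x64->high-64-bit multiply):
//
//	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//	{
//	BN_ULONG carry=0;
//	int i;
//
//	for (i=0; i<num; i++)
//		{
//		BN_ULONG lo=(ap[i]*w)&BN_MASK2;
//		BN_ULONG hi=BN_UMULT_HIGH(ap[i],w);
//		lo=(lo+carry)&BN_MASK2;
//		carry=hi+(lo<carry);
//		lo=(lo+rp[i])&BN_MASK2;
//		carry+=(lo<rp[i]);	/* carry never overflows	*/
//		rp[i]=lo;
//		}
//	return carry;
//	}
//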
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.save	ar.pfs,r2
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0
	.save	ar.lc,r3
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1
(p6)	br.ret.spnt.many	b0	};;

	.body
{ .mib;	setf.sig	f8=r35		// w
	.save	pr,r9
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
	}
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike the previous versions it scales:-) The
// previous version was performing *all* additions in the IALU and was
// starving for those even on Itanium 2. In this version one addition
// is moved to the FPU and is folded with the multiplication. This comes
// at the cost of propagating the result from the previous call to this
// subroutine via the L2 cache... In other words, negligible even for
// shorter keys. *Overall* performance improvement [over the previous
// version] varies from 11 to 22 percent depending on the key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45			// low
	(p20)	xma.lu		f42=f36,f8,f50		// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50		// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40			// high
	(p16)	ldf8		f46=[r16],8		// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35		// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:

{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0
(p42)	add		r8=r35,r0,1
	mov		pr=r9,0x1ffff	}
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
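// In C terms the routine squares each input word into a 128-bit result
// (a sketch; the two output words per input word match the 16-byte
// store stride in the loop below):
//
//	void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//	{
//	int i;
//
//	for (i=0; i<num; i++)
//		{
//		rp[2*i]  =(ap[i]*ap[i])&BN_MASK2;	/* low 64 bits	*/
//		rp[2*i+1]=BN_UMULT_HIGH(ap[i],ap[i]);	/* high 64 bits	*/
//		}
//	}
//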
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sqr_words:
	.prologue
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;
{ .mii;	cmp.le		p6,p0=r34,r0
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

{ .mii;	sub	r10=r34,r0,1
	.save	ar.lc,r3
	mov	r3=ar.lc
	.save	pr,r9
	mov	r9=pr		};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
	}
{ .mii;	add		r34=8,r32
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired of writing this
// comment over and over) and get down to 2*n+16, at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if the performance gain on
// Itanium turns out to be larger than the loss on "wider" IA-64, then
// the loop should be explicitly split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r32]=f50,16
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0	}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r34]=f60,16
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
// Apparently we win nothing by implementing a special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost a factor of two, but then the amount of additions would
// increase by a factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway, as multiplications are way more expensive, but not this
// time... The multiplication kernel is fully pipelined and, as we drain
// one 128-bit multiplication result per clock cycle, multiplications
// are effectively as inexpensive as additions. A special implementation
// might become of interest for "wider" IA-64 implementations, as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But in these
// Itanium days it's simply too hard to justify the effort, so I just
// drop down to the bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;
.endp	bn_sqr_comba8#
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instruction fetch? Or did I misinterpret some