]>
Commit | Line | Data |
---|---|---|
52c0d300 | 1 | .explicit |
4cb73bf8 AP |
2 | .text |
3 | .asciz "ia64.S, Version 1.0" | |
4 | .asciz "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" | |
5 | ||
6 | // | |
7 | // ==================================================================== | |
8 | // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
9 | // project. | |
10 | // | |
11 | // Rights for redistribution and usage in source and binary forms are | |
12 | // granted according to the OpenSSL license. Warranty of any kind is | |
13 | // disclaimed. | |
14 | // ==================================================================== | |
15 | // | |
16 | ||
17 | // Q. How much faster does it get? | |
18 | // A. Here is the output from 'openssl speed rsa dsa' for vanilla | |
19 | // 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat | |
20 | // Linux 7.1 2.96-81): | |
21 | // | |
22 | // sign verify sign/s verify/s | |
23 | // rsa 512 bits 0.0036s 0.0003s 275.3 2999.2 | |
24 | // rsa 1024 bits 0.0203s 0.0011s 49.3 894.1 | |
25 | // rsa 2048 bits 0.1331s 0.0040s 7.5 250.9 | |
26 | // rsa 4096 bits 0.9270s 0.0147s 1.1 68.1 | |
27 | // sign verify sign/s verify/s | |
28 | // dsa 512 bits 0.0035s 0.0043s 288.3 234.8 | |
29 | // dsa 1024 bits 0.0111s 0.0135s 90.0 74.2 | |
30 | // | |
31 | // And here is similar output but for this assembler | |
32 | // implementation:-) | |
33 | // | |
34 | // sign verify sign/s verify/s | |
35 | // rsa 512 bits 0.0021s 0.0001s 549.4 9638.5 | |
36 | // rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1 | |
37 | // rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3 | |
38 | // rsa 4096 bits 0.1295s 0.0018s 7.7 561.5 | |
39 | // sign verify sign/s verify/s | |
40 | // dsa 512 bits 0.0012s 0.0013s 891.9 756.6 | |
41 | // dsa 1024 bits 0.0023s 0.0028s 440.4 376.2 | |
42 | // | |
43 | // Yes, you may argue that it's not fair comparison as it's | |
44 | // possible to craft the C implementation with BN_UMULT_HIGH | |
45 | // inline assembler macro. But of course! Here is the output | |
46 | // with the macro: | |
47 | // | |
48 | // sign verify sign/s verify/s | |
49 | // rsa 512 bits 0.0020s 0.0002s 495.0 6561.0 | |
50 | // rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7 | |
51 | // rsa 2048 bits 0.0519s 0.0015s 19.3 667.3 | |
52 | // rsa 4096 bits 0.3464s 0.0053s 2.9 187.7 | |
53 | // sign verify sign/s verify/s | |
54 | // dsa 512 bits 0.0016s 0.0020s 613.1 510.5 | |
55 | // dsa 1024 bits 0.0045s 0.0054s 221.0 183.9 | |
56 | // | |
57 | // My code is still way faster, huh:-) And I believe that even | |
58 | // higher performance can be achieved. Note that as keys get | |
59 | // longer, performance gain is larger. Why? According to the | |
60 | // profiler there is another player in the field, namely | |
61 | // BN_from_montgomery consuming larger and larger portion of CPU | |
62 | // time as keysize decreases. I therefore consider putting effort | |
63 | // to assembler implementation of the following routine: | |
64 | // | |
65 | // void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0) | |
66 | // { | |
67 | // int i,j; | |
68 | // BN_ULONG v; | |
69 | // | |
70 | // for (i=0; i<nl; i++) | |
71 | // { | |
72 | // v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2); | |
73 | // nrp++; | |
74 | // rp++; | |
75 | // if (((nrp[-1]+=v)&BN_MASK2) < v) | |
76 | // for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ; | |
77 | // } | |
78 | // } | |
79 | // | |
80 | // It might as well be beneficial to implement even combaX | |
81 | // variants, as it appears as it can literally unleash the | |
82 | // performance (see comment section to bn_mul_comba8 below). | |
83 | // | |
84 | // And finally for your reference the output for 0.9.6a compiled | |
85 | // with SGIcc version 0.01.0-12 (keep in mind that for the moment | |
86 | // of this writing it's not possible to convince SGIcc to use | |
87 | // BN_UMULT_HIGH inline assembler macro, yet the code is fast, | |
88 | // i.e. for a compiler generated one:-): | |
89 | // | |
90 | // sign verify sign/s verify/s | |
91 | // rsa 512 bits 0.0022s 0.0002s 452.7 5894.3 | |
92 | // rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9 | |
93 | // rsa 2048 bits 0.0578s 0.0017s 17.3 600.2 | |
94 | // rsa 4096 bits 0.3838s 0.0061s 2.6 164.5 | |
95 | // sign verify sign/s verify/s | |
96 | // dsa 512 bits 0.0018s 0.0022s 547.3 459.6 | |
97 | // dsa 1024 bits 0.0051s 0.0062s 196.6 161.3 | |
98 | // | |
99 | // Oh! Benchmarks were performed on 733MHz Lion-class Itanium | |
100 | // system running Redhat Linux 7.1 (very special thanks to Ray | |
101 | // McCaffity of Williams Communications for providing an account). | |
102 | // | |
103 | // Q. What's the heck with 'rum 1<<5' at the end of every function? | |
104 | // A. Well, by clearing the "upper FP registers written" bit of the | |
105 | // User Mask I want to excuse the kernel from preserving upper | |
106 | // (f32-f128) FP register bank over process context switch, thus | |
107 | // minimizing bus bandwidth consumption during the switch (i.e. | |
108 | // after PKI opration completes and the program is off doing | |
109 | // something else like bulk symmetric encryption). Having said | |
110 | // this, I also want to point out that it might be good idea | |
111 | // to compile the whole toolkit (as well as majority of the | |
112 | // programs for that matter) with -mfixed-range=f32-f127 command | |
113 | // line option. No, it doesn't prevent the compiler from writing | |
114 | // to upper bank, but at least discourages to do so. If you don't | |
115 | // like the idea you have the option to compile the module with | |
116 | // -Drum=nop.m in command line. | |
117 | // | |
118 | ||
119 | #if 1 | |
120 | // | |
121 | // bn_[add|sub]_words routines. | |
122 | // | |
123 | // Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the | |
124 | // data reside in L1 cache, i.e. 2 ticks away). It's possible to | |
125 | // compress the epilogue and get down to 2*n+6, but at the cost of | |
126 | // scalability (the neat feature of this implementation is that it | |
127 | // shall automagically spin in n+5 on "wider" IA-64 implementations:-) | |
128 | // I consider that the epilogue is short enough as it is to trade tiny | |
129 | // performance loss on Itanium for scalability. | |
130 | // | |
131 | // BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) | |
132 | // | |
133 | .global bn_add_words# | |
134 | .proc bn_add_words# | |
135 | .align 64 | |
136 | .space 32 // makes the loop body aligned at 64-byte boundary | |
137 | bn_add_words: | |
138 | .prologue | |
139 | .fframe 0 | |
140 | .save ar.pfs,r2 | |
141 | { .mii alloc r2=ar.pfs,4,12,0,16 | |
142 | cmp4.le p6,p0=r35,r0 };; | |
143 | { .mfb mov r8=r0 // return value | |
144 | (p6) br.ret.spnt.many b0 };; | |
145 | ||
146 | .save ar.lc,r3 | |
147 | { .mib sub r10=r35,r0,1 | |
148 | mov r3=ar.lc | |
149 | brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16 | |
150 | } | |
151 | .body | |
152 | { .mib mov r14=r32 // rp | |
153 | mov r9=pr };; | |
154 | { .mii mov r15=r33 // ap | |
155 | mov ar.lc=r10 | |
156 | mov ar.ec=6 } | |
157 | { .mib mov r16=r34 // bp | |
158 | mov pr.rot=1<<16 } | |
159 | ||
160 | .L_bn_add_words_ctop: ;; | |
161 | { .mii (p16) ld8 r32=[r16],8 // b=*(bp++) | |
162 | (p18) add r39=r37,r34 | |
163 | (p19) cmp.ltu.unc p56,p0=r40,r38 } | |
164 | { .mfb (p0) nop.m 0x0 | |
165 | (p0) nop.f 0x0 | |
166 | (p0) nop.b 0x0 } | |
167 | { .mii (p16) ld8 r35=[r15],8 // a=*(ap++) | |
168 | (p58) cmp.eq.or p57,p0=-1,r41 // (p20) | |
169 | (p58) add r41=1,r41 } // (p20) | |
170 | { .mfb (p21) st8 [r14]=r42,8 // *(rp++)=r | |
171 | (p0) nop.f 0x0 | |
172 | br.ctop.sptk .L_bn_add_words_ctop };; | |
173 | .L_bn_add_words_cend: | |
174 | ||
175 | { .mii | |
176 | (p59) add r8=1,r8 // return value | |
177 | mov pr=r9,-1 | |
178 | mov ar.lc=r3 } | |
179 | { .mbb nop.b 0x0 | |
180 | br.ret.sptk.many b0 };; | |
181 | .endp bn_add_words# | |
182 | ||
183 | // | |
184 | // BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num) | |
185 | // | |
186 | .global bn_sub_words# | |
187 | .proc bn_sub_words# | |
188 | .align 64 | |
189 | .space 32 // makes the loop body aligned at 64-byte boundary | |
190 | bn_sub_words: | |
191 | .prologue | |
192 | .fframe 0 | |
193 | .save ar.pfs,r2 | |
194 | { .mii alloc r2=ar.pfs,4,12,0,16 | |
195 | cmp4.le p6,p0=r35,r0 };; | |
196 | { .mfb mov r8=r0 // return value | |
197 | (p6) br.ret.spnt.many b0 };; | |
198 | ||
199 | .save ar.lc,r3 | |
200 | { .mib sub r10=r35,r0,1 | |
201 | mov r3=ar.lc | |
202 | brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16 | |
203 | } | |
204 | .body | |
205 | { .mib mov r14=r32 // rp | |
206 | mov r9=pr };; | |
207 | { .mii mov r15=r33 // ap | |
208 | mov ar.lc=r10 | |
209 | mov ar.ec=6 } | |
210 | { .mib mov r16=r34 // bp | |
211 | mov pr.rot=1<<16 } | |
212 | ||
213 | .L_bn_sub_words_ctop: ;; | |
214 | { .mii (p16) ld8 r32=[r16],8 // b=*(bp++) | |
215 | (p18) sub r39=r37,r34 | |
216 | (p19) cmp.gtu.unc p56,p0=r40,r38 } | |
217 | { .mfb (p0) nop.m 0x0 | |
218 | (p0) nop.f 0x0 | |
219 | (p0) nop.b 0x0 } | |
220 | { .mii (p16) ld8 r35=[r15],8 // a=*(ap++) | |
221 | (p58) cmp.eq.or p57,p0=0,r41 // (p20) | |
222 | (p58) add r41=-1,r41 } // (p20) | |
223 | { .mbb (p21) st8 [r14]=r42,8 // *(rp++)=r | |
224 | (p0) nop.b 0x0 | |
225 | br.ctop.sptk .L_bn_sub_words_ctop };; | |
226 | .L_bn_sub_words_cend: | |
227 | ||
228 | { .mii | |
229 | (p59) add r8=1,r8 // return value | |
230 | mov pr=r9,-1 | |
231 | mov ar.lc=r3 } | |
232 | { .mbb nop.b 0x0 | |
233 | br.ret.sptk.many b0 };; | |
234 | .endp bn_sub_words# | |
235 | #endif | |
236 | ||
237 | #if 0 | |
238 | #define XMA_TEMPTATION | |
239 | #endif | |
240 | ||
241 | #if 1 | |
242 | // | |
243 | // BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
244 | // | |
245 | .global bn_mul_words# | |
246 | .proc bn_mul_words# | |
247 | .align 64 | |
248 | .space 32 // makes the loop body aligned at 64-byte boundary | |
249 | bn_mul_words: | |
250 | .prologue | |
251 | .fframe 0 | |
252 | .save ar.pfs,r2 | |
253 | #ifdef XMA_TEMPTATION | |
254 | { .mfi alloc r2=ar.pfs,4,0,0,0 };; | |
255 | #else | |
256 | { .mfi alloc r2=ar.pfs,4,4,0,8 };; | |
257 | #endif | |
258 | { .mib mov r8=r0 // return value | |
259 | cmp4.le p6,p0=r34,r0 | |
260 | (p6) br.ret.spnt.many b0 };; | |
261 | ||
262 | .save ar.lc,r3 | |
263 | { .mii sub r10=r34,r0,1 | |
264 | mov r3=ar.lc | |
265 | mov r9=pr };; | |
266 | ||
267 | .body | |
268 | { .mib setf.sig f8=r35 // w | |
269 | mov pr.rot=0x400001<<16 | |
270 | // ------^----- serves as (p48) at first (p26) | |
271 | brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16 | |
272 | } | |
273 | ||
274 | #ifndef XMA_TEMPTATION | |
275 | ||
276 | { .mii mov r14=r32 // rp | |
277 | mov r15=r33 // ap | |
278 | mov ar.lc=r10 } | |
279 | { .mii mov r39=0 // serves as r33 at first (p26) | |
280 | mov ar.ec=12 } | |
281 | ||
282 | // This loop spins in 2*(n+11) ticks. It's scheduled for data in L2 | |
283 | // cache (i.e. 9 ticks away) as floating point load/store instructions | |
284 | // bypass L1 cache and L2 latency is actually best-case scenario for | |
285 | // ldf8. The loop is not scalable and shall run in 2*(n+11) even on | |
286 | // "wider" IA-64 implementations. It's a trade-off here. n+22 loop | |
287 | // would give us ~5% in *overall* performance improvement on "wider" | |
288 | // IA-64, but would hurt Itanium for about same because of longer | |
289 | // epilogue. As it's a matter of few percents in either case I've | |
290 | // chosen to trade the scalability for development time (you can see | |
291 | // this very instruction sequence in bn_mul_add_words loop which in | |
292 | // turn is scalable). | |
293 | .L_bn_mul_words_ctop: ;; | |
294 | { .mfi (p25) getf.sig r36=f49 // low | |
295 | (p21) xmpy.lu f45=f37,f8 | |
296 | (p27) cmp.ltu p52,p48=r39,r38 } | |
297 | { .mfi (p16) ldf8 f32=[r15],8 | |
298 | (p21) xmpy.hu f38=f37,f8 | |
299 | (p0) nop.i 0x0 };; | |
300 | { .mii (p26) getf.sig r32=f43 // high | |
52c0d300 | 301 | .pred.rel "mutex",p48,p52 |
4cb73bf8 AP |
302 | (p48) add r38=r37,r33 // (p26) |
303 | (p52) add r38=r37,r33,1 } // (p26) | |
304 | { .mfb (p27) st8 [r14]=r39,8 | |
305 | (p0) nop.f 0x0 | |
306 | br.ctop.sptk .L_bn_mul_words_ctop };; | |
307 | .L_bn_mul_words_cend: | |
308 | ||
309 | { .mii nop.m 0x0 | |
52c0d300 | 310 | .pred.rel "mutex",p49,p53 |
4cb73bf8 AP |
311 | (p49) add r8=r34,r0 |
312 | (p53) add r8=r34,r0,1 } | |
313 | { .mfb nop.m 0x0 | |
314 | nop.f 0x0 | |
315 | nop.b 0x0 } | |
316 | ||
317 | #else // XMA_TEMPTATION | |
318 | ||
319 | setf.sig f37=r0 // serves as carry at (p18) tick | |
320 | mov ar.lc=r10 | |
321 | mov ar.ec=5 | |
322 | ||
323 | // Most of you examining this code very likely wonder why in the name | |
324 | // of Intel the following loop is commented out? Indeed, it looks so | |
325 | // neat that you find it hard to believe that it's something wrong | |
326 | // with it, right? The catch is that every iteration depends on the | |
327 | // result from previous one and the latter isn't available instantly. | |
328 | // The loop therefore spins at the latency of xma minus 1, or in other | |
329 | // words at 6*(n+4) ticks:-( Compare to the "production" loop above | |
330 | // that runs in 2*(n+11) where the low latency problem is worked around | |
331 | // by moving the dependency to one-tick latent interger ALU. Note that | |
332 | // "distance" between ldf8 and xma is not latency of ldf8, but the | |
333 | // *difference* between xma and ldf8 latencies. | |
334 | .L_bn_mul_words_ctop: ;; | |
335 | { .mfi (p16) ldf8 f32=[r33],8 | |
336 | (p18) xma.hu f38=f34,f8,f39 } | |
337 | { .mfb (p20) stf8 [r32]=f37,8 | |
338 | (p18) xma.lu f35=f34,f8,f39 | |
339 | br.ctop.sptk .L_bn_mul_words_ctop };; | |
340 | .L_bn_mul_words_cend: | |
341 | ||
342 | getf.sig r8=f41 // the return value | |
343 | ||
344 | #endif // XMA_TEMPTATION | |
345 | ||
346 | { .mii nop.m 0x0 | |
347 | mov pr=r9,-1 | |
348 | mov ar.lc=r3 } | |
349 | { .mfb rum 1<<5 // clear um.mfh | |
350 | nop.f 0x0 | |
351 | br.ret.sptk.many b0 };; | |
352 | .endp bn_mul_words# | |
353 | #endif | |
354 | ||
355 | #if 1 | |
356 | // | |
357 | // BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) | |
358 | // | |
359 | .global bn_mul_add_words# | |
360 | .proc bn_mul_add_words# | |
361 | .align 64 | |
362 | //.space 0 // makes the loop split at 64-byte boundary | |
363 | bn_mul_add_words: | |
364 | .prologue | |
365 | .fframe 0 | |
366 | .save ar.pfs,r2 | |
367 | { .mii alloc r2=ar.pfs,4,12,0,16 | |
368 | cmp4.le p6,p0=r34,r0 };; | |
369 | { .mfb mov r8=r0 // return value | |
370 | (p6) br.ret.spnt.many b0 };; | |
371 | ||
372 | .save ar.lc,r3 | |
373 | { .mii sub r10=r34,r0,1 | |
374 | mov r3=ar.lc | |
375 | mov r9=pr };; | |
376 | ||
377 | .body | |
378 | { .mib setf.sig f8=r35 // w | |
379 | mov pr.rot=0x400001<<16 | |
380 | // ------^----- serves as (p48) at first (p26) | |
381 | brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16 | |
382 | } | |
383 | { .mii mov r14=r32 // rp | |
384 | mov r15=r33 // ap | |
385 | mov ar.lc=r10 } | |
386 | { .mii mov r39=0 // serves as r33 at first (p26) | |
387 | mov r18=r32 // rp copy | |
388 | mov ar.ec=14 } | |
389 | ||
390 | // This loop spins in 3*(n+13) ticks on Itanium and should spin in | |
391 | // 2*(n+13) on "wider" IA-64 implementations (to be verified with new | |
392 |