]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/ia64-mont.pl
Also check for errors in x86_64-xlate.pl.
[thirdparty/openssl.git] / crypto / bn / asm / ia64-mont.pl
CommitLineData
6aa36e8e 1#! /usr/bin/env perl
83cf7abf 2# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
6aa36e8e 3#
367ace68 4# Licensed under the Apache License 2.0 (the "License"). You may not use
6aa36e8e
RS
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
dacdcf3c
AP
9#
10# ====================================================================
e3713c36 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
dacdcf3c
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# January 2010
18#
19# "Teaser" Montgomery multiplication module for IA-64. There are
20# several possibilities for improvement:
21#
22# - modulo-scheduling outer loop would eliminate quite a number of
23# stalls after ldf8, xma and getf.sig outside inner loop and
24# improve shorter key performance;
25# - shorter vector support [with input vectors being fetched only
26# once] should be added;
27# - 2x unroll with help of n0[1] would make the code scalable on
28# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
29# acute interest, because upcoming Tukwila's individual cores are
30# reportedly based on Itanium 2 design;
31# - dedicated squaring procedure(?);
32#
4407700c
AP
33# January 2010
34#
35# Shorter vector support is implemented by zero-padding ap and np
36# vectors up to 8 elements, or 512 bits. This means that 256-bit
37# inputs will be processed only 2 times faster than 512-bit inputs,
38# not 4 [as one would expect, because algorithm complexity is n^2].
39# The reason for padding is that inputs shorter than 512 bits won't
40# be processed faster anyway, because minimal critical path of the
41# core loop happens to match 512-bit timing. Either way, it resulted
42# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
43# 1024-bit one [in comparison to original version of *this* module].
44#
dacdcf3c
AP
45# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
46# this module is:
47# sign verify sign/s verify/s
a000759a
AP
48# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
49# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
dacdcf3c
AP
50# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
51# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
a000759a 52# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
dacdcf3c
AP
53# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
54# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
55#
a000759a 56# ... and *without* (but still with ia64.S):
dacdcf3c
AP
57#
58# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
59# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
60# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
61# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
62# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
63# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
64# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
65#
a000759a
AP
66# As can be seen, RSA sign performance improves by 130-30%
67# (less for longer keys), while verify improves by 74-13%.
68# DSA performance improves by 115-30%.
dacdcf3c 69
1aa89a7a
RL
70# $output is the last argument if it looks like a file (it has an extension)
71$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
6bd7a4d9 72
dacdcf3c
AP
# Pick the instruction used to form 64-bit addresses from pointer arguments.
# On HP-UX the code defaults to addp4 (32-bit pointer widening) and switches
# to a plain add only when a 64-bit data model flag is present on the command
# line: +DD64 or -mlp64 (presumably HP cc and gcc respectively - confirm).
# The pattern is an alternation; the former character class [\+DD|\-mlp]
# matched any single one of "+D|-mlp" before "64", so unrelated arguments
# such as "-m64" or a path containing "l64" wrongly selected "add".
73if ($^O eq "hpux") {
74 $ADDP="addp4";
75 for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
76} else { $ADDP="add"; }
77
78$code=<<___;
79.explicit
80.text
4407700c 81
dacdcf3c
AP
82// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
83// const BN_ULONG *bp,const BN_ULONG *np,
609b0852 84// const BN_ULONG *n0p,int num);
56c5f703 85.align 64
dacdcf3c
AP
86.global bn_mul_mont#
87.proc bn_mul_mont#
4407700c
AP
// Entry point: dispatch on num (r37, i.e. the sixth argument, in5).
//   num < 2       -> return with ret0 = 0
//   2 <= num <= 8 -> bn_mul_mont_8
//   num > 8       -> bn_mul_mont_general
// Both workers set ret0 = 1 ("handled") before returning, so 0 here
// presumably tells the caller the request was not handled - confirm.
88bn_mul_mont:
89 .prologue
90 .body
91{ .mmi; cmp4.le p6,p7=2,r37;; // p6 = (2 <= num), p7 = (num < 2)
92(p6) cmp4.lt.unc p8,p9=8,r37 // if p6: p8 = (num > 8), p9 = (num <= 8); else both cleared
93 mov ret0=r0 };; // preset return value to 0
94{ .bbb;
95(p9) br.cond.dptk.many bn_mul_mont_8
96(p8) br.cond.dpnt.many bn_mul_mont_general
97(p7) br.ret.spnt.many b0 };;
98.endp bn_mul_mont#
99\f
100prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
dacdcf3c 101
4407700c
AP
102rptr=r8; aptr=r9; bptr=r14; nptr=r15;
103tptr=r16; // &tp[0]
104tp_1=r17; // &tp[-1]
105num=r18; len=r19; lc=r20;
106topbit=r21; // carry bit from tmp[num]
dacdcf3c 107
4407700c
AP
108n0=f6;
109m0=f7;
110bi=f8;
dacdcf3c 111
56c5f703 112.align 64
4407700c
AP
113.local bn_mul_mont_general#
114.proc bn_mul_mont_general#
4407700c 115bn_mul_mont_general:
dacdcf3c
AP
116 .prologue
117{ .mmi; .save ar.pfs,prevfs
118 alloc prevfs=ar.pfs,6,2,0,8
119 $ADDP aptr=0,in1
120 .save ar.lc,prevlc
121 mov prevlc=ar.lc }
122{ .mmi; .vframe prevsp
123 mov prevsp=sp
124 $ADDP bptr=0,in2
dacdcf3c 125 .save pr,prevpr
4407700c 126 mov prevpr=pr };;
dacdcf3c
AP
127
128 .body
129 .rotf alo[6],nlo[4],ahi[8],nhi[6]
130 .rotr a[3],n[3],t[2]
131
132{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
133 ldf8 alo[4]=[aptr],16 // ap[0]
134 $ADDP r30=8,in1 };;
135{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
136 ldf8 alo[2]=[aptr],16 // ap[2]
137 $ADDP in4=0,in4 };;
138{ .mmi; ldf8 alo[1]=[r30] // ap[3]
139 ldf8 n0=[in4] // n0
140 $ADDP rptr=0,in0 }
141{ .mmi; $ADDP nptr=0,in3
142 mov r31=16
143 zxt4 num=in5 };;
144{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
145 shladd len=num,3,r0
146 shladd r31=num,3,r31 };;
147{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
148 add lc=-5,num
149 sub r31=sp,r31 };;
150{ .mfb; and sp=-16,r31 // alloca
151 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
152 nop.b 0 }
153{ .mfb; nop.m 0
154 xmpy.lu alo[4]=alo[4],bi
155 brp.loop.imp .L1st_ctop,.L1st_cend-16
156 };;
157{ .mfi; nop.m 0
158 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
74f22606 159 add tp_1=8,sp }
dacdcf3c
AP
160{ .mfi; nop.m 0
161 xma.lu alo[3]=alo[3],bi,ahi[2]
162 mov pr.rot=0x20001f<<16
163 // ------^----- (p40) at first (p23)
164 // ----------^^ p[16:20]=1
165 };;
166{ .mfi; nop.m 0
167 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
168 mov ar.lc=lc }
169{ .mfi; nop.m 0
170 fcvt.fxu.s1 nhi[1]=f0
171 mov ar.ec=8 };;
172
173.align 32
// First pass (bp[0]): software-pipelined loop that forms ap[j]*bp[0] and
// np[j]*m0 in the FP unit, moves the low words to general registers, adds
// them with carry and stores the sums through tp_1. p40/p42 select the
// "no carry-in" / "carry-in" form of the add; the matching compare writes
// the carry-out (p41) and its complement (p39) for the next rotation.
174.L1st_ctop:
175.pred.rel "mutex",p40,p42
176{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
177 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] // high word of ap[j]*bp[0]+ahi[1]
178 (p40) add n[2]=n[2],a[2] } // (p23)
179{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
180 (p18) xma.lu alo[2]=alo[2],bi,ahi[1] // low word of the same product
181 (p42) add n[2]=n[2],a[2],1 };; // (p23)
182{ .mfi; (p21) getf.sig a[0]=alo[5] // low word of ap product -> GPR
183 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] // high word of np[j]*m0+nhi[1]
184 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
185{ .mfi; (p23) st8 [tp_1]=n[2],8 // store the summed word, advance tp_1
186 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] // low word of the same product
187 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
188{ .mmb; (p21) getf.sig n[0]=nlo[3] // low word of np product -> GPR
189 (p16) nop.m 0
190 br.ctop.sptk .L1st_ctop };;
191.L1st_cend:
192
193{ .mmi; getf.sig a[0]=ahi[6] // (p24)
194 getf.sig n[0]=nhi[4]
195 add num=-1,num };; // num--
196{ .mmi; .pred.rel "mutex",p40,p42
197(p40) add n[0]=n[0],a[0]
198(p42) add n[0]=n[0],a[0],1
199 sub aptr=aptr,len };; // rewind
200{ .mmi; .pred.rel "mutex",p40,p42
201(p40) cmp.ltu p41,p39=n[0],a[0]
202(p42) cmp.leu p41,p39=n[0],a[0]
203 sub nptr=nptr,len };;
204{ .mmi; .pred.rel "mutex",p39,p41
205(p39) add topbit=r0,r0
206(p41) add topbit=r0,r0,1
609b0852 207 nop.i 0 }
dacdcf3c 208{ .mmi; st8 [tp_1]=n[0]
74f22606
AP
209 add tptr=16,sp
210 add tp_1=8,sp };;
211\f
dacdcf3c
AP
212.Louter:
213{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
214 ldf8 ahi[3]=[tptr] // tp[0]
215 add r30=8,aptr };;
216{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
217 ldf8 alo[3]=[r30],16 // ap[1]
218 add r31=8,nptr };;
219{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
220 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
221 brp.loop.imp .Linner_ctop,.Linner_cend-16
222 }
223{ .mfb; ldf8 alo[1]=[r30] // ap[3]
224 xma.lu alo[4]=alo[4],bi,ahi[3]
225 clrrrb.pr };;
226{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
227 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
228 nop.i 0 }
229{ .mfi; ldf8 nlo[1]=[r31] // np[1]
230 xma.lu alo[3]=alo[3],bi,ahi[2]
231 mov pr.rot=0x20101f<<16
232 // ------^----- (p40) at first (p23)
233 // --------^--- (p30) at first (p22)
234 // ----------^^ p[16:20]=1
235 };;
236{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
237 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
238 mov ar.lc=lc }
239{ .mfi;
240 fcvt.fxu.s1 nhi[1]=f0
241 mov ar.ec=8 };;
242
243// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
244// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
245// in latter case accounts for two-tick pipeline stall, which means
246// that its performance would be ~20% lower than optimal one. No
247// attempt was made to address this, because original Itanium is
248// hardly represented out in the wild...
249.align 32
250.Linner_ctop:
251.pred.rel "mutex",p40,p42
252.pred.rel "mutex",p30,p32
253{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
254 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
255 (p40) add n[2]=n[2],a[2] } // (p23)
256{ .mfi; (p16) nop.m 0
257 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
258 (p42) add n[2]=n[2],a[2],1 };; // (p23)
259{ .mfi; (p21) getf.sig a[0]=alo[5]
260 (p16) nop.f 0
261 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
262{ .mfi; (p21) ld8 t[0]=[tptr],8
263 (p16) nop.f 0
264 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
265{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
266 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
267 (p30) add a[1]=a[1],t[1] } // (p22)
268{ .mfi; (p16) nop.m 0
269 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
270 (p32) add a[1]=a[1],t[1],1 };; // (p22)
271{ .mmi; (p21) getf.sig n[0]=nlo[3]
272 (p16) nop.m 0
273 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
274{ .mmb; (p23) st8 [tp_1]=n[2],8
275 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
276 br.ctop.sptk .Linner_ctop };;
277.Linner_cend:
278
279{ .mmi; getf.sig a[0]=ahi[6] // (p24)
280 getf.sig n[0]=nhi[4]
281 nop.i 0 };;
282
283{ .mmi; .pred.rel "mutex",p31,p33
284(p31) add a[0]=a[0],topbit
285(p33) add a[0]=a[0],topbit,1
286 mov topbit=r0 };;
287{ .mfi; .pred.rel "mutex",p31,p33
288(p31) cmp.ltu p32,p30=a[0],topbit
289(p33) cmp.leu p32,p30=a[0],topbit
290 }
291{ .mfi; .pred.rel "mutex",p40,p42
292(p40) add n[0]=n[0],a[0]
293(p42) add n[0]=n[0],a[0],1
294 };;
// Carry-out of n[0]+=a[0] computed in the previous bundle: p40 is the
// "no carry-in" case (carry iff n[0] < a[0]), p42 the "carry-in" case
// (carry iff n[0] <= a[0]). Both compares write p41/p39 in the same
// instruction group, so the mutex annotation must name p40/p42 - the
// predicates actually used here (it previously named the unused p44/p46).
// p32 is the carry from adding the old topbit to a[0]; it sets topbit.
295{ .mmi; .pred.rel "mutex",p40,p42
296(p40) cmp.ltu p41,p39=n[0],a[0]
297(p42) cmp.leu p41,p39=n[0],a[0]
298(p32) add topbit=r0,r0,1 }
299
300{ .mmi; st8 [tp_1]=n[0],8
301 cmp4.ne p6,p0=1,num
302 sub aptr=aptr,len };; // rewind
303{ .mmi; sub nptr=nptr,len
304(p41) add topbit=r0,r0,1
74f22606
AP
305 add tptr=16,sp }
306{ .mmb; add tp_1=8,sp
dacdcf3c
AP
307 add num=-1,num // num--
308(p6) br.cond.sptk.many .Louter };;
309\f
310{ .mbb; add lc=4,lc
311 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
312 clrrrb.pr };;
313{ .mii; nop.m 0
314 mov pr.rot=0x10001<<16
315 // ------^---- (p33) at first (p17)
316 mov ar.lc=lc }
317{ .mii; nop.m 0
318 mov ar.ec=3
319 nop.i 0 };;
320
// Final subtraction: computes tp[j] - np[j] with borrow and stores the
// difference to rp[j]. p33 selects the "no borrow-in" subtract, p35 the
// "borrow-in" form (extra -1); the matching compare writes the borrow-out
// (p34) and its complement (p32) for the next rotation.
321.Lsub_ctop:
322.pred.rel "mutex",p33,p35
323{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
324 (p16) nop.f 0
325 (p33) sub n[1]=t[1],n[1] } // (p17)
326{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
327 (p16) nop.f 0
328 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
329{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
330 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
331 (p18) nop.b 0 }
332{ .mib; (p18) nop.m 0
333 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
334 br.ctop.sptk .Lsub_ctop };;
335.Lsub_cend:
336
337{ .mmb; .pred.rel "mutex",p34,p36
338(p34) sub topbit=topbit,r0 // (p19)
339(p36) sub topbit=topbit,r0,1
340 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
341 }
342{ .mmb; sub rptr=rptr,len // rewind
343 sub tptr=tptr,len
344 clrrrb.pr };;
774ff8fe
AP
345{ .mmi; mov aptr=rptr
346 mov bptr=tptr
dacdcf3c 347 mov pr.rot=1<<16 };;
774ff8fe 348{ .mii; cmp.eq p0,p6=topbit,r0
dacdcf3c 349 mov ar.lc=lc
774ff8fe 350 mov ar.ec=2 };;
dacdcf3c
AP
351
// Result selection and scratch wipe. aptr was set to rptr and bptr to tptr
// just before the loop, so a[] is the stored difference tp-np and t[] is tp.
// p6 was set by "cmp.eq p0,p6=topbit,r0", i.e. p6 = (topbit != 0) after the
// final borrow was subtracted from topbit; when p6 is set the tp word
// replaces the difference. Every tp word is overwritten with zero.
352.Lcopy_ctop:
774ff8fe
AP
353{ .mmi; (p16) ld8 a[0]=[aptr],8 // word of the difference already in rp[]
354 (p16) ld8 t[0]=[bptr],8 // corresponding word of tp[]
355 (p6) mov a[1]=t[1] };; // (p17)
356{ .mmb; (p17) st8 [rptr]=a[1],8 // write the selected word to rp[]
357 (p17) st8 [tptr]=r0,8 // zero the tp[] word
dacdcf3c
AP
358 br.ctop.sptk .Lcopy_ctop };;
359.Lcopy_cend:
360
361{ .mmi; mov ret0=1 // signal "handled"
362 rum 1<<5 // clear um.mfh
363 mov ar.lc=prevlc }
364{ .mib; .restore sp
365 mov sp=prevsp
56c5f703 366 mov pr=prevpr,0x1ffff
dacdcf3c 367 br.ret.sptk.many b0 };;
4407700c
AP
368.endp bn_mul_mont_general#
369\f
370a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
371n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
372t0=r15;
373
374ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
375ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
376
56c5f703
AP
377.align 64
378.skip 48 // aligns loop body
4407700c
AP
379.local bn_mul_mont_8#
380.proc bn_mul_mont_8#
4407700c
AP
381bn_mul_mont_8:
382 .prologue
383{ .mmi; .save ar.pfs,prevfs
384 alloc prevfs=ar.pfs,6,2,0,8
385 .vframe prevsp
386 mov prevsp=sp
387 .save ar.lc,prevlc
388 mov prevlc=ar.lc }
389{ .mmi; add r17=-6*16,sp
390 add sp=-7*16,sp
391 .save pr,prevpr
392 mov prevpr=pr };;
393
394{ .mmi; .save.gf 0,0x10
395 stf.spill [sp]=f16,-16
396 .save.gf 0,0x20
397 stf.spill [r17]=f17,32
398 add r16=-5*16,prevsp};;
399{ .mmi; .save.gf 0,0x40
400 stf.spill [r16]=f18,32
401 .save.gf 0,0x80
402 stf.spill [r17]=f19,32
403 $ADDP aptr=0,in1 };;
404{ .mmi; .save.gf 0,0x100
405 stf.spill [r16]=f20,32
406 .save.gf 0,0x200
407 stf.spill [r17]=f21,32
408 $ADDP r29=8,in1 };;
409{ .mmi; .save.gf 0,0x400
410 stf.spill [r16]=f22
411 .save.gf 0,0x800
412 stf.spill [r17]=f23
413 $ADDP rptr=0,in0 };;
414\f
415 .body
416 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
417 .rotr t[8]
418
419// load input vectors padding them to 8 elements
420{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
421 ldf8 ai1=[r29],16 // ap[1]
422 $ADDP bptr=0,in2 }
423{ .mmi; $ADDP r30=8,in2
424 $ADDP nptr=0,in3
425 $ADDP r31=8,in3 };;
426{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
427 ldf8 bj[6]=[r30],16 // bp[1]
428 cmp4.le p4,p5=3,in5 }
429{ .mmi; ldf8 ni0=[nptr],16 // np[0]
430 ldf8 ni1=[r31],16 // np[1]
431 cmp4.le p6,p7=4,in5 };;
432
433{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
434 (p5)fcvt.fxu ai2=f0
435 cmp4.le p8,p9=5,in5 }
436{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
437 (p7)fcvt.fxu ai3=f0
438 cmp4.le p10,p11=6,in5 }
439{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
440 (p5)fcvt.fxu bj[5]=f0
441 cmp4.le p12,p13=7,in5 }
442{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
443 (p7)fcvt.fxu bj[4]=f0
444 cmp4.le p14,p15=8,in5 }
445{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
446 (p5)fcvt.fxu ni2=f0
447 addp4 r28=-1,in5 }
448{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
449 (p7)fcvt.fxu ni3=f0
450 $ADDP in4=0,in4 };;
451
452{ .mfi; ldf8 n0=[in4]
453 fcvt.fxu tf[1]=f0
454 nop.i 0 }
455
456{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
457 (p9)fcvt.fxu ai4=f0
458 mov t[0]=r0 }
459{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
460 (p11)fcvt.fxu ai5=f0
461 mov t[1]=r0 }
462{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
463 (p9)fcvt.fxu bj[3]=f0
464 mov t[2]=r0 }
465{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
466 (p11)fcvt.fxu bj[2]=f0
467 mov t[3]=r0 }
468{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
469 (p9)fcvt.fxu ni4=f0
470 mov t[4]=r0 }
471{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
472 (p11)fcvt.fxu ni5=f0
473 mov t[5]=r0 };;
474
475{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
476 (p13)fcvt.fxu ai6=f0
477 mov t[6]=r0 }
478{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
479 (p15)fcvt.fxu ai7=f0
480 mov t[7]=r0 }
481{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
482 (p13)fcvt.fxu bj[1]=f0
483 mov ar.lc=r28 }
484{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
485 (p15)fcvt.fxu bj[0]=f0
a000759a 486 mov ar.ec=1 }
4407700c
AP
487{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
488 (p13)fcvt.fxu ni6=f0
489 mov pr.rot=1<<16 }
490{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
491 (p15)fcvt.fxu ni7=f0
492 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
493 };;
494\f
a000759a
AP
495// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
496// to measure with help of Interval Time Counter indicated that the
4407700c
AP
497// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
498// addressing the issue is problematic, because I don't have access
499// to platform-specific instruction-level profiler. On Itanium it
a000759a 500// should run in 56*n ticks, because of higher xma latency...
4407700c
AP
501.Louter_8_ctop:
502 .pred.rel "mutex",p40,p42
503 .pred.rel "mutex",p48,p50
504{ .mfi; (p16) nop.m 0 // 0:
505 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
506 (p40) add a3=a3,n3 } // (p17) a3+=n3
507{ .mfi; (p42) add a3=a3,n3,1
508 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
509 (p16) nop.i 0 };;
510{ .mii; (p17) getf.sig a7=alo[8] // 1:
511 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
512 (p50) add t[6]=t[6],a3,1 };;
513{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
514 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
515 (p40) cmp.ltu p43,p41=a3,n3 }
516{ .mfi; (p42) cmp.leu p43,p41=a3,n3
517 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
518 (p16) nop.i 0 };;
519{ .mii; (p17) getf.sig n5=nlo[6] // 3:
520 (p48) cmp.ltu p51,p49=t[6],a3
521 (p50) cmp.leu p51,p49=t[6],a3 };;
522 .pred.rel "mutex",p41,p43
523 .pred.rel "mutex",p49,p51
524{ .mfi; (p16) nop.m 0 // 4:
525 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
526 (p41) add a4=a4,n4 } // (p17) a4+=n4
527{ .mfi; (p43) add a4=a4,n4,1
528 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
529 (p16) nop.i 0 };;
530{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
531 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
532 (p51) add t[5]=t[5],a4,1 };;
533{ .mfi; (p16) nop.m 0 // 6:
534 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
535 (p41) cmp.ltu p42,p40=a4,n4 }
536{ .mfi; (p43) cmp.leu p42,p40=a4,n4
537 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
538 (p16) nop.i 0 };;
539{ .mii; (p17) getf.sig n6=nlo[7] // 7:
540 (p49) cmp.ltu p50,p48=t[5],a4
541 (p51) cmp.leu p50,p48=t[5],a4 };;
542 .pred.rel "mutex",p40,p42
543 .pred.rel "mutex",p48,p50
544{ .mfi; (p16) nop.m 0 // 8:
545 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
546 (p40) add a5=a5,n5 } // (p17) a5+=n5
547{ .mfi; (p42) add a5=a5,n5,1
548 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
549 (p16) nop.i 0 };;
550{ .mii; (p16) getf.sig a1=alo[1] // 9:
551 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
552 (p50) add t[4]=t[4],a5,1 };;
553{ .mfi; (p16) nop.m 0 // 10:
554 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
555 (p40) cmp.ltu p43,p41=a5,n5 }
556{ .mfi; (p42) cmp.leu p43,p41=a5,n5
557 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
558 (p16) nop.i 0 };;
559{ .mii; (p17) getf.sig n7=nlo[8] // 11:
560 (p48) cmp.ltu p51,p49=t[4],a5
561 (p50) cmp.leu p51,p49=t[4],a5 };;
562 .pred.rel "mutex",p41,p43
563 .pred.rel "mutex",p49,p51
564{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
565 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
566 (p41) add a6=a6,n6 } // (p17) a6+=n6
567{ .mfi; (p43) add a6=a6,n6,1
568 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
569 (p16) nop.i 0 };;
570{ .mii; (p16) getf.sig a2=alo[2] // 13:
571 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
572 (p51) add t[3]=t[3],a6,1 };;
573{ .mfi; (p16) nop.m 0 // 14:
574 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
575 (p41) cmp.ltu p42,p40=a6,n6 }
576{ .mfi; (p43) cmp.leu p42,p40=a6,n6
577 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
578 (p16) nop.i 0 };;
579{ .mii; (p16) nop.m 0 // 15:
580 (p49) cmp.ltu p50,p48=t[3],a6
581 (p51) cmp.leu p50,p48=t[3],a6 };;
582 .pred.rel "mutex",p40,p42
583 .pred.rel "mutex",p48,p50
584{ .mfi; (p16) nop.m 0 // 16:
585 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
586 (p40) add a7=a7,n7 } // (p17) a7+=n7
587{ .mfi; (p42) add a7=a7,n7,1
588 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
589 (p16) nop.i 0 };;
590{ .mii; (p16) getf.sig a3=alo[3] // 17:
591 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
592 (p50) add t[2]=t[2],a7,1 };;
593{ .mfi; (p16) nop.m 0 // 18:
594 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
595 (p40) cmp.ltu p43,p41=a7,n7 }
596{ .mfi; (p42) cmp.leu p43,p41=a7,n7
597 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
598 (p16) nop.i 0 };;
599{ .mii; (p16) getf.sig n1=nlo[1] // 19:
600 (p48) cmp.ltu p51,p49=t[2],a7
601 (p50) cmp.leu p51,p49=t[2],a7 };;
602 .pred.rel "mutex",p41,p43
603 .pred.rel "mutex",p49,p51
604{ .mfi; (p16) nop.m 0 // 20:
605 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
606 (p41) add a8=a8,n8 } // (p17) a8+=n8
607{ .mfi; (p43) add a8=a8,n8,1
608 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
609 (p16) nop.i 0 };;
610{ .mii; (p16) getf.sig a4=alo[4] // 21:
611 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
612 (p51) add t[1]=t[1],a8,1 };;
613{ .mfi; (p16) nop.m 0 // 22:
614 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
615 (p41) cmp.ltu p42,p40=a8,n8 }
616{ .mfi; (p43) cmp.leu p42,p40=a8,n8
617 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
618 (p16) nop.i 0 };;
619{ .mii; (p16) getf.sig n2=nlo[2] // 23:
620 (p49) cmp.ltu p50,p48=t[1],a8
621 (p51) cmp.leu p50,p48=t[1],a8 };;
622{ .mfi; (p16) nop.m 0 // 24:
623 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
624 (p16) add a1=a1,n1 } // (p16) a1+=n1
625{ .mfi; (p16) nop.m 0
626 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
627 (p17) mov t[0]=r0 };;
628{ .mii; (p16) getf.sig a5=alo[5] // 25:
629 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
630 (p42) add t[0]=t[0],r0,1 };;
631{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
632 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
633 (p50) add t[0]=t[0],r0,1 }
634{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
635 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
636 (p16) nop.i 0 };;
637{ .mii; (p16) getf.sig n3=nlo[3] // 27:
638 (p16) cmp.ltu.unc p50,p48=t0,a1
639 (p16) nop.i 0 };;
640 .pred.rel "mutex",p40,p42
641 .pred.rel "mutex",p48,p50
642{ .mfi; (p16) nop.m 0 // 28:
643 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
644 (p40) add a2=a2,n2 } // (p16) a2+=n2
645{ .mfi; (p42) add a2=a2,n2,1
646 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
647 (p16) nop.i 0 };;
648{ .mii; (p16) getf.sig a6=alo[6] // 29:
649 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
650 (p50) add t[6]=t[6],a2,1 };;
651{ .mfi; (p16) nop.m 0 // 30:
652 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
653 (p40) cmp.ltu p41,p39=a2,n2 }
654{ .mfi; (p42) cmp.leu p41,p39=a2,n2
655 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
656 (p16) nop.i 0 };;
657{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
658 (p16) nop.f 0
659 (p48) cmp.ltu p49,p47=t[6],a2 }
660{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
661 (p16) nop.f 0
662 br.ctop.sptk.many .Louter_8_ctop };;
663.Louter_8_cend:
664\f
a000759a
AP
665// above loop has to execute one more time, without (p16), which is
666// replaced with merged move of np[8] to GPR bank
667 .pred.rel "mutex",p40,p42
668 .pred.rel "mutex",p48,p50
669{ .mmi; (p0) getf.sig n1=ni0 // 0:
670 (p40) add a3=a3,n3 // (p17) a3+=n3
671 (p42) add a3=a3,n3,1 };;
672{ .mii; (p17) getf.sig a7=alo[8] // 1:
673 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
674 (p50) add t[6]=t[6],a3,1 };;
675{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
676 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
677 (p40) cmp.ltu p43,p41=a3,n3 }
678{ .mfi; (p42) cmp.leu p43,p41=a3,n3
679 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
680 (p0) nop.i 0 };;
681{ .mii; (p17) getf.sig n5=nlo[6] // 3:
682 (p48) cmp.ltu p51,p49=t[6],a3
683 (p50) cmp.leu p51,p49=t[6],a3 };;
684 .pred.rel "mutex",p41,p43
685 .pred.rel "mutex",p49,p51
686{ .mmi; (p0) getf.sig n2=ni1 // 4:
687 (p41) add a4=a4,n4 // (p17) a4+=n4
688 (p43) add a4=a4,n4,1 };;
689{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
690 (p0) nop.f 0
691 (p51) add t[5]=t[5],a4,1 };;
692{ .mfi; (p0) getf.sig n3=ni2 // 6:
693 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
694 (p41) cmp.ltu p42,p40=a4,n4 }
695{ .mfi; (p43) cmp.leu p42,p40=a4,n4
696 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
697 (p0) nop.i 0 };;
698{ .mii; (p17) getf.sig n6=nlo[7] // 7:
699 (p49) cmp.ltu p50,p48=t[5],a4
700 (p51) cmp.leu p50,p48=t[5],a4 };;
701 .pred.rel "mutex",p40,p42
702 .pred.rel "mutex",p48,p50
703{ .mii; (p0) getf.sig n4=ni3 // 8:
704 (p40) add a5=a5,n5 // (p17) a5+=n5
705 (p42) add a5=a5,n5,1 };;
706{ .mii; (p0) nop.m 0 // 9:
707 (p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
708 (p50) add t[4]=t[4],a5,1 };;
709{ .mii; (p0) nop.m 0 // 10:
710 (p40) cmp.ltu p43,p41=a5,n5
711 (p42) cmp.leu p43,p41=a5,n5 };;
712{ .mii; (p17) getf.sig n7=nlo[8] // 11:
713 (p48) cmp.ltu p51,p49=t[4],a5
714 (p50) cmp.leu p51,p49=t[4],a5 };;
715 .pred.rel "mutex",p41,p43
716 .pred.rel "mutex",p49,p51
717{ .mii; (p17) getf.sig n8=nhi[8] // 12:
718 (p41) add a6=a6,n6 // (p17) a6+=n6
719 (p43) add a6=a6,n6,1 };;
720{ .mii; (p0) getf.sig n5=ni4 // 13:
721 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
722 (p51) add t[3]=t[3],a6,1 };;
723{ .mii; (p0) nop.m 0 // 14:
724 (p41) cmp.ltu p42,p40=a6,n6
725 (p43) cmp.leu p42,p40=a6,n6 };;
726{ .mii; (p0) getf.sig n6=ni5 // 15:
727 (p49) cmp.ltu p50,p48=t[3],a6
728 (p51) cmp.leu p50,p48=t[3],a6 };;
729 .pred.rel "mutex",p40,p42
730 .pred.rel "mutex",p48,p50
731{ .mii; (p0) nop.m 0 // 16:
732 (p40) add a7=a7,n7 // (p17) a7+=n7
733 (p42) add a7=a7,n7,1 };;
734{ .mii; (p0) nop.m 0 // 17:
735 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
736 (p50) add t[2]=t[2],a7,1 };;
737{ .mii; (p0) nop.m 0 // 18:
738 (p40) cmp.ltu p43,p41=a7,n7
739 (p42) cmp.leu p43,p41=a7,n7 };;
740{ .mii; (p0) getf.sig n7=ni6 // 19:
741 (p48) cmp.ltu p51,p49=t[2],a7
742 (p50) cmp.leu p51,p49=t[2],a7 };;
743 .pred.rel "mutex",p41,p43
744 .pred.rel "mutex",p49,p51
745{ .mii; (p0) nop.m 0 // 20:
746 (p41) add a8=a8,n8 // (p17) a8+=n8
747 (p43) add a8=a8,n8,1 };;
748{ .mmi; (p0) nop.m 0 // 21:
749 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
750 (p51) add t[1]=t[1],a8,1 }
751{ .mmi; (p17) mov t[0]=r0
752 (p41) cmp.ltu p42,p40=a8,n8
753 (p43) cmp.leu p42,p40=a8,n8 };;
754{ .mmi; (p0) getf.sig n8=ni7 // 22:
755 (p49) cmp.ltu p50,p48=t[1],a8
756 (p51) cmp.leu p50,p48=t[1],a8 }
757{ .mmi; (p42) add t[0]=t[0],r0,1
758 (p0) add r16=-7*16,prevsp
759 (p0) add r17=-6*16,prevsp };;
760\f
761// subtract np[8] from carrybit|tmp[8]
4407700c 762// carrybit|tmp[8] layout upon exit from above loop is:
a000759a
AP
763// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
764{ .mmi; (p50)add t[0]=t[0],r0,1
765 add r18=-5*16,prevsp
4407700c
AP
766 sub n1=t0,n1 };;
767{ .mmi; cmp.gtu p34,p32=n1,t0;;
768 .pred.rel "mutex",p32,p34
a000759a
AP
769 (p32)sub n2=t[7],n2
770 (p34)sub n2=t[7],n2,1 };;
771{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
772 (p34)cmp.geu p35,p33=n2,t[7];;
4407700c 773 .pred.rel "mutex",p33,p35
a000759a
AP
774 (p33)sub n3=t[6],n3 }
775{ .mmi; (p35)sub n3=t[6],n3,1;;
776 (p33)cmp.gtu p34,p32=n3,t[6]
777 (p35)cmp.geu p34,p32=n3,t[6] };;
4407700c 778 .pred.rel "mutex",p32,p34
a000759a
AP
779{ .mii; (p32)sub n4=t[5],n4
780 (p34)sub n4=t[5],n4,1;;
781 (p32)cmp.gtu p35,p33=n4,t[5] }
782{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
4407700c 783 .pred.rel "mutex",p33,p35
a000759a
AP
784 (p33)sub n5=t[4],n5
785 (p35)sub n5=t[4],n5,1 };;
786{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
787 (p35)cmp.geu p34,p32=n5,t[4];;
4407700c 788 .pred.rel "mutex",p32,p34
a000759a
AP
789 (p32)sub n6=t[3],n6 }
790{ .mmi; (p34)sub n6=t[3],n6,1;;
791 (p32)cmp.gtu p35,p33=n6,t[3]
792 (p34)cmp.geu p35,p33=n6,t[3] };;
4407700c 793 .pred.rel "mutex",p33,p35
a000759a
AP
794{ .mii; (p33)sub n7=t[2],n7
795 (p35)sub n7=t[2],n7,1;;
796 (p33)cmp.gtu p34,p32=n7,t[2] }
797{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
4407700c 798 .pred.rel "mutex",p32,p34
a000759a
AP
799 (p32)sub n8=t[1],n8
800 (p34)sub n8=t[1],n8,1 };;
801{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
802 (p34)cmp.geu p35,p33=n8,t[1];;
4407700c 803 .pred.rel "mutex",p33,p35
a000759a
AP
804 (p33)sub a8=t[0],r0 }
805{ .mmi; (p35)sub a8=t[0],r0,1;;
806 (p33)cmp.gtu p34,p32=a8,t[0]
807 (p35)cmp.geu p34,p32=a8,t[0] };;
4407700c
AP
808\f
809// save the result, either tmp[num] or tmp[num]-np[num]
810 .pred.rel "mutex",p32,p34
811{ .mmi; (p32)st8 [rptr]=n1,8
812 (p34)st8 [rptr]=t0,8
813 add r19=-4*16,prevsp};;
814{ .mmb; (p32)st8 [rptr]=n2,8
a000759a 815 (p34)st8 [rptr]=t[7],8
4407700c
AP
816 (p5)br.cond.dpnt.few .Ldone };;
817{ .mmb; (p32)st8 [rptr]=n3,8
a000759a 818 (p34)st8 [rptr]=t[6],8
4407700c
AP
819 (p7)br.cond.dpnt.few .Ldone };;
820{ .mmb; (p32)st8 [rptr]=n4,8
a000759a 821 (p34)st8 [rptr]=t[5],8
4407700c
AP
822 (p9)br.cond.dpnt.few .Ldone };;
823{ .mmb; (p32)st8 [rptr]=n5,8
a000759a 824 (p34)st8 [rptr]=t[4],8
4407700c
AP
825 (p11)br.cond.dpnt.few .Ldone };;
826{ .mmb; (p32)st8 [rptr]=n6,8
a000759a 827 (p34)st8 [rptr]=t[3],8
4407700c
AP
828 (p13)br.cond.dpnt.few .Ldone };;
829{ .mmb; (p32)st8 [rptr]=n7,8
a000759a 830 (p34)st8 [rptr]=t[2],8
4407700c
AP
831 (p15)br.cond.dpnt.few .Ldone };;
832{ .mmb; (p32)st8 [rptr]=n8,8
a000759a 833 (p34)st8 [rptr]=t[1],8
4407700c
AP
834 nop.b 0 };;
835.Ldone: // epilogue
836{ .mmi; ldf.fill f16=[r16],64
837 ldf.fill f17=[r17],64
838 nop.i 0 }
839{ .mmi; ldf.fill f18=[r18],64
840 ldf.fill f19=[r19],64
56c5f703 841 mov pr=prevpr,0x1ffff };;
4407700c
AP
842{ .mmi; ldf.fill f20=[r16]
843 ldf.fill f21=[r17]
844 mov ar.lc=prevlc }
845{ .mmi; ldf.fill f22=[r18]
846 ldf.fill f23=[r19]
847 mov ret0=1 } // signal "handled"
848{ .mib; rum 1<<5
849 .restore sp
850 mov sp=prevsp
851 br.ret.sptk.many b0 };;
852.endp bn_mul_mont_8#
853
dacdcf3c
AP
854.type copyright#,\@object
855copyright:
856stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
857___
858
# Emit the generated assembly. When an output file was given, STDOUT is
# redirected to it first; a failed open now aborts instead of silently
# writing the code to the inherited STDOUT. The three-argument open keeps
# the file name from being interpreted as part of the mode.
6bd7a4d9 859open STDOUT,">",$output or die "can't open $output: $!" if $output;
dacdcf3c 860print $code;
a21314db 861close STDOUT or die "error closing STDOUT: $!";