]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/asm/ghash-ia64.pl
sparcv9cap.c: disengange Solaris-specific CPU detection routine in favour
[thirdparty/openssl.git] / crypto / modes / asm / ghash-ia64.pl
CommitLineData
480cd6ab
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
c3473126
AP
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [plus a 128-byte shared table]. Streamed
# GHASH performance was measured to be 6.35 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler-generated
# code. Well, the number should have been ~6.5. The deviation has
# everything to do with the way performance is measured: as the difference
# between GCM and straightforward 128-bit counter mode. As a point of
# reference, the sha1-ia64.pl module processes one byte in 6.0 cycles.
# On Itanium GHASH should run at ~8.5 cycles per byte.
480cd6ab 22
85e28dfa
AP
# Note about the "528B" variant. In the Itanium 2 case it makes less sense
# to implement it, for the following reason. Because the number of
# functional units is naturally limited, it's impossible to implement the
# "528B" loop in 4 cycles, only in 5. This means that theoretically the
# performance improvement can't be more than 20%; ~15% is more realistic.
# This is considered below the justification level for implementing new
# code. Not to mention that on the original Itanium it would actually run
# slower, spending >9 cycles per byte.
31
480cd6ab
AP
# The first command-line argument, when present, names the file that
# receives the generated assembly; the remaining arguments are scanned
# below as compiler flags.  The three-argument form of open keeps any
# character in the file name from being interpreted as part of the mode.
$output=shift and (open STDOUT,">",$output or die "can't open $output: $!");

# Returns 1 when $flag contains one of the two literal spellings "+DD64"
# or "-mlp64", 0 otherwise.  The pattern is an alternation, not a
# character class, so look-alikes such as "-m64" or "-DFOO_p64" do not
# count.
sub _is_lp64_flag {
    my ($flag) = @_;
    return ($flag =~ /(?:\+DD|-mlp)64/) ? 1 : 0;
}

# $ADDP is the instruction the generated prologues use to form addresses
# from the incoming arguments.  On HP-UX it is "addp4" unless a 64-bit
# flag is present on the command line; everywhere else it is plain "add".
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (_is_lp64_flag($_)); }
} else {
    $ADDP="add";
}

# An explicit -DB_ENDIAN or -DL_ENDIAN flag selects the target byte order;
# when several are given, the last one examined wins.
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}

# With no explicit flag, fall back to the byte order of the machine that
# runs this script: a 32-bit value packed in network (big-endian) order
# reads back as 1 through the native 'L' format only on a big-endian host.
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}
42
# Append one copy of the software-pipelined GHASH inner loop to $code.
#
# Arguments (positional):
#   1. label  - assembler label emitted at the top of the loop and used as
#               the target of the closing br.ctop.
#   2. mask   - optional.  When true, the two instructions that consume the
#               input stream ("ld1 in[0]=[inp],-1" and
#               "xor xi[1]=xi[1],in[1]") are predicated on p63 instead of
#               p16/p17.  NOTE(review): presumably p63 is never set while
#               the loop runs, which disables those instructions for the
#               multiplication-only entry point - confirm.
#
# The sub has no prototype: it takes arguments, so an empty "()" prototype
# would be wrong for any caller that does not use the "&loop(...)" form.
#
# Returns the value of the append expression; callers ignore it.
sub loop {
    my $label=shift;
    my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17");	# mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
 ($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
 (p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
 (p18) xor Zlo=Zlo,Hlo
 (p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
 (p18) add Hi[1]=Htbl,Hi[1] };;

{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
 (p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
 (p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
 (p18) xor Zlo=Zlo,Hlo
 (p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
 (p17) add Hi[0]=Htbl,Hi[0]
 br.ctop.sptk $label };;
___
}
85
# Start the assembly text: assembler mode directives, symbolic names for
# the fixed registers used throughout, and the entry sequence of
# gcm_gmult_4bit up to (not including) its inner loop.  $ADDP is
# interpolated so the argument pointers are formed with the instruction
# chosen for the target's pointer width.  The heredoc holds assembly only;
# nothing but assembler source may appear between the markers.
$code=<<___;
.explicit
.text

prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;

.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
.align 128
.skip 16;; // aligns loop body
gcm_gmult_4bit:
 .prologue
{ .mmi; .save ar.pfs,prevfs
 alloc prevfs=ar.pfs,2,6,0,8
 $ADDP Xi=15,in0 // &Xi[15]
 mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
 .save ar.lc,prevlc
 mov prevlc=ar.lc
 .save pr,prevpr
 mov prevpr=pr };;

 .body
 .rotr in[3],xi[3],Hi[2]

{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
 mov mask0xf0=0xf0
 brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
 };;
{ .mii; shladd Hi[1]=xi[2],4,r0
 mov pr.rot=0x7<<16
 mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
 mov ar.ec=3
 xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
 xor Zhi=Zhi,Zhi };;
___
# Inner loop of gcm_gmult_4bit.  The true second argument selects the
# variant whose input-stream instructions are predicated on p63.
&loop (".Loop1",1);

# Tail of gcm_gmult_4bit (final fold, byte reversal via "mux1 ...,@rev",
# store of the 128-bit result, restore of pr/ar.lc, return), followed by
# the entry sequence of gcm_ghash_4bit up to its inner loop.  The heredoc
# holds assembly only; "\@" keeps Perl from interpolating an array.
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
 add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
 mov pr=prevpr,-2 };;
{ .mib; st8 [Hhi]=Zhi
 mov ar.lc=prevlc
 br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#

.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
.align 32;;
gcm_ghash_4bit:
 .prologue
{ .mmi; .save ar.pfs,prevfs
 alloc prevfs=ar.pfs,4,4,0,8
 $ADDP inp=15,in2 // &inp[15]
 mov rem_4bitp=ip }
{ .mmi; $ADDP end=in3,in2 // &inp[len]
 $ADDP Xi=15,in0 // &Xi[15]
 .save ar.lc,prevlc
 mov prevlc=ar.lc };;
{ .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
 mov mask0xf0=0xf0
 .save pr,prevpr
 mov prevpr=pr }

 .body
 .rotr in[3],xi[3],Hi[2]

{ .mmi; ld1 in[2]=[inp],-1 // inp[15]
 ld1 xi[2]=[Xi],-1 // Xi[15]
 add end=-17,end };;
{ .mmi; ld1 in[1]=[inp],-1 // inp[14]
 ld1 xi[1]=[Xi],-1 // Xi[14]
 xor xi[2]=xi[2],in[2] };;
{ .mii; shladd Hi[1]=xi[2],4,r0
 mov pr.rot=0x7<<16
 mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
 mov ar.ec=3
 xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
 add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
 xor Zhi=Zhi,Zhi };;
___
# Inner loop of gcm_ghash_4bit; no second argument, so the input-stream
# instructions keep their regular p16/p17 predicates.
&loop (".LoopN");

# Tail of gcm_ghash_4bit: finish the current block, test whether more
# input remains (p6) and, if so, set up and branch back to .LoopN;
# otherwise restore pr/ar.lc and return.  The shared 128-byte rem_4bit
# table and the identification string follow.  The heredoc holds assembly
# only; "\@" keeps Perl from interpolating an array.
$code.=<<___;
{ .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact
 extr.u xi[2]=Zlo,0,8 } // Xi[15]
{ .mib; cmp.ltu p6,p0=inp,end // are we done?
 add inp=32,inp // advance inp
 clrrrb.pr };;
{ .mii;
(p6) ld1 in[2]=[inp],-1 // inp[15]
(p6) extr.u xi[1]=Zlo,8,8 // Xi[14]
(p6) mov ar.lc=13 };;
{ .mii;
(p6) ld1 in[1]=[inp],-1 // inp[14]
(p6) mov ar.ec=3
 mux1 Zlo=Zlo,\@rev };;
{ .mii;
(p6) xor xi[2]=xi[2],in[2]
 mux1 Zhi=Zhi,\@rev };;
{ .mii;
(p6) shladd Hi[1]=xi[2],4,r0
 add Hlo=9,Xi // Xi is &Xi[-1]
 add Hhi=1,Xi };;
{ .mii;
(p6) and Hi[1]=mask0xf0,Hi[1]
(p6) add Xi=14,Xi // &Xi[13]
(p6) mov pr.rot=0x7<<16 };;

{ .mii; st8 [Hlo]=Zlo
(p6) xor Zlo=Zlo,Zlo
(p6) add Hi[1]=Htbl,Hi[1] };;
{ .mib; st8 [Hhi]=Zhi
(p6) xor Zhi=Zhi,Zhi
(p6) br.cond.dptk.many .LoopN };;

{ .mib; mov pr=prevpr,-2 }
{ .mib; mov ar.lc=prevlc
 br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#

.align 128;;
.type rem_4bit#,\@object
rem_4bit:
 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
233
# For big-endian targets, turn every "mux1 <operands>,@rev" into
# "nop.i 0x0", keeping the original whitespace after the mnemonic.
# NOTE(review): presumably the byte reversal is only needed when the
# 64-bit stores are little-endian - confirm.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);

# Emit the finished assembly.  Output is buffered, so a write failure
# (full disk, closed pipe) may only surface when the handle is closed;
# checking close keeps a truncated file from passing as success.
print $code;
close STDOUT or die "error closing STDOUT: $!";