]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/asm/ghash-ia64.pl
e_capi.c: fix typo.
[thirdparty/openssl.git] / crypto / modes / asm / ghash-ia64.pl
CommitLineData
480cd6ab
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" Galois field multiplication and
13# streamed GHASH function. "4-bit" means that it uses 256 bytes
14# per-key table [+128 bytes shared table]. Streamed GHASH performance
15# was measured to be 6.35 cycles per processed byte on Itanium 2,
16# which is >90% better than Microsoft compiler generated code. Well,
17# the number should have been ~6.5. The deviation has everything to do
18# with the way performance is measured, as difference between GCM and
19# straightforward 128-bit counter mode. To anchor to something else
20# sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium
21# GHASH should run at ~8.5 cycles per byte.
22
# Command line: the first argument, when present and true, names the output
# file and STDOUT is redirected to it. The remaining arguments are scanned
# below as compiler flags. Three-argument open keeps characters in the file
# name from being interpreted as an open mode.
$output=shift and (open STDOUT,">",$output or die "can't open $output: $!");

# Returns 1 when FLAG contains one of the HP-UX compiler options that select
# 64-bit pointers, "+DD64" or "-mlp64", and 0 otherwise. The pattern is an
# alternation on purpose: written as a bracket expression it would accept any
# single one of the characters + D | - m l p followed by "64" (for example
# "-m64" or a path containing "lp64").
sub is_lp64_flag {
    my ($flag)=@_;
    return ($flag =~ /(?:\+DD|-mlp)64/) ? 1 : 0;
}

# $ADDP is the instruction used for pointer arithmetic on the routines'
# arguments in the generated code: "addp4" on HP-UX unless a 64-bit flag was
# passed, plain "add" everywhere else.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (is_lp64_flag($_)); }
} else { $ADDP="add"; }

# Endianness: an explicit -DB_ENDIAN / -DL_ENDIAN flag wins (the last one seen
# takes effect, and -DL_ENDIAN is tested second within the same argument).
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}
# Without a flag, fall back to the byte order of the machine running this
# script: a big-endian 32-bit 1 reads back as native 1 only on a big-endian
# host.
if (!defined($big_endian)) {
    $big_endian=(unpack('L',pack('N',1))==1);
}
33
# loop(LABEL [, MASK_INP])
#
# Appends one copy of the software-pipelined inner loop to the global $code.
#   LABEL    - assembler label placed at the top of the loop and used as the
#              target of the closing br.ctop.
#   MASK_INP - when true, the two instructions that touch the input stream
#              (the ld1 from [inp] and the xor with in[1]) are predicated on
#              p63 instead of p16/p17, masking every reference to inp.
#              NOTE(review): this relies on p63 never becoming true with the
#              pr.rot/ar.ec values the callers set up - confirm against the
#              Itanium rotating-predicate rules.
# Returns nothing useful; the result is the text appended to $code.
#
# The sub takes arguments, so it is declared without a prototype. Existing
# "&loop(...)" call sites are unaffected.
sub loop {
    my ($label,$mask_inp)=@_;
    my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
 ($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
 (p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
 (p18) xor Zlo=Zlo,Hlo
 (p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
 (p18) add Hi[1]=Htbl,Hi[1] };;

{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
 (p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
 (p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
 (p18) xor Zlo=Zlo,Hlo
 (p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
 (p17) add Hi[0]=Htbl,Hi[0]
 br.ctop.sptk $label };;
___
}
76
# First chunk of the generated IA-64 assembly; $code is printed at the end of
# the script. It emits:
#  - symbolic names for the registers shared by both routines;
#  - gcm_gmult_4bit up to the entry of its inner loop. The routine declares
#    two inputs (see the alloc): in0 is used as the Xi pointer and in1 as the
#    Htbl pointer, both formed with $ADDP, which is interpolated from the Perl
#    variable chosen earlier in the script.
# rem_4bitp is built ip-relative: "mov rem_4bitp=ip" is in the routine's first
# bundle, and the later add applies the offset rem_4bit-gcm_gmult_4bit.
# Before the loop the code presets ar.lc=13, ar.ec=3 and pr.rot=0x7<<16 and
# clears Zlo/Zhi.
# The loop body itself comes from loop(); the true second argument asks it to
# mask its references to inp, a register this routine never initializes.
77$code=<<___;
78.explicit
79.text
80
81prevfs=r2; prevlc=r3; prevpr=r8;
82mask0xf0=r21;
83rem=r22; rem_4bitp=r23;
84Xi=r24; Htbl=r25;
85inp=r26; end=r27;
86Hhi=r28; Hlo=r29;
87Zhi=r30; Zlo=r31;
88
89.global gcm_gmult_4bit#
90.proc gcm_gmult_4bit#
91.align 128
92.skip 16;; // aligns loop body
93gcm_gmult_4bit:
94 .prologue
95{ .mmi; .save ar.pfs,prevfs
96 alloc prevfs=ar.pfs,2,6,0,8
97 $ADDP Xi=15,in0 // &Xi[15]
98 mov rem_4bitp=ip }
99{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
100 .save ar.lc,prevlc
101 mov prevlc=ar.lc
102 .save pr,prevpr
103 mov prevpr=pr };;
104
105 .body
106 .rotr in[3],xi[3],Hi[2]
107
108{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
109 mov mask0xf0=0xf0
110 brp.loop.imp .Loop1,.Lend1-16};;
111{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
112 };;
113{ .mii; shladd Hi[1]=xi[2],4,r0
114 mov pr.rot=0x7<<16
115 mov ar.lc=13 };;
116{ .mii; and Hi[1]=mask0xf0,Hi[1]
117 mov ar.ec=3
118 xor Zlo=Zlo,Zlo };;
119{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
120 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
121 xor Zhi=Zhi,Zhi };;
122___
123 &loop (".Loop1",1);
# Second chunk: the tail of gcm_gmult_4bit followed by the head of
# gcm_ghash_4bit.
#  - .Lend1: one last xor left over from the pipelined loop, then
#    "mux1 ...,@rev" on Zlo and Zhi (the substitution at the end of this
#    script turns both into nop.i for big-endian builds), two 8-byte stores
#    through Xi+9 and Xi+1, restore of pr and ar.lc, and return.
#  - gcm_ghash_4bit declares four inputs (see the alloc). Per the assembler
#    comments in0 is inp, in1 the length, in2 Xi and in3 Htbl; the pointers
#    are formed with the interpolated $ADDP instruction. "end" is reduced by
#    17 here and is compared against inp after every block by the code that
#    follows the loop.
#  - The first two bytes of inp and Xi are loaded, Xi[15] is xor-ed with
#    inp[15], and the same ar.lc=13 / ar.ec=3 / pr.rot=0x7<<16 presets as in
#    gcm_gmult_4bit are applied before the loop.
# loop() is called without its second argument, so the loop body keeps its
# references to inp.
124$code.=<<___;
125.Lend1:
126{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
127{ .mib; mux1 Zlo=Zlo,\@rev };;
128{ .mib; mux1 Zhi=Zhi,\@rev };;
129{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
130 add Hhi=1,Xi };; // pipeline flush on Itanium
131{ .mib; st8 [Hlo]=Zlo
132 mov pr=prevpr,-2 };;
133{ .mib; st8 [Hhi]=Zhi
134 mov ar.lc=prevlc
135 br.ret.sptk.many b0 };;
136.endp gcm_gmult_4bit#
137
138.global gcm_ghash_4bit#
139.proc gcm_ghash_4bit#
140.align 32;;
141gcm_ghash_4bit:
142 .prologue
143{ .mmi; .save ar.pfs,prevfs
144 alloc prevfs=ar.pfs,4,4,0,8
145 $ADDP inp=15,in0 // &inp[15]
146 mov rem_4bitp=ip }
147{ .mmi; $ADDP end=in1,in0 // &inp[len]
148 $ADDP Xi=15,in2 // &Xi[15]
149 .save ar.lc,prevlc
150 mov prevlc=ar.lc };;
151{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
152 mov mask0xf0=0xf0
153 .save pr,prevpr
154 mov prevpr=pr }
155
156 .body
157 .rotr in[3],xi[3],Hi[2]
158
159{ .mmi; ld1 in[2]=[inp],-1 // inp[15]
160 ld1 xi[2]=[Xi],-1 // Xi[15]
161 add end=-17,end };;
162{ .mmi; ld1 in[1]=[inp],-1 // inp[14]
163 ld1 xi[1]=[Xi],-1 // Xi[14]
164 xor xi[2]=xi[2],in[2] };;
165{ .mii; shladd Hi[1]=xi[2],4,r0
166 mov pr.rot=0x7<<16
167 mov ar.lc=13 };;
168{ .mii; and Hi[1]=mask0xf0,Hi[1]
169 mov ar.ec=3
170 xor Zlo=Zlo,Zlo };;
171{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
172 add rem_4bitp=rem_4bit#-gcm_ghash_4bit#,rem_4bitp
173 xor Zhi=Zhi,Zhi };;
174___
175 &loop (".LoopN");
# Third chunk: the code that follows .LoopN in gcm_ghash_4bit, then the
# shared data.
#  - p6 is set while inp is still below end. Every (p6) instruction re-primes
#    the pipeline for the next 16-byte block: the next two input bytes are
#    loaded, xi[2]/xi[1] are taken from Zlo with extr.u rather than reloaded,
#    ar.lc/ar.ec/pr.rot are preset again, Zlo/Zhi are cleared, and the final
#    (p6) branch re-enters .LoopN.
#  - Unconditionally, Zlo and Zhi go through "mux1 ...,@rev" (replaced by
#    nop.i for big-endian builds at the end of this script) and are stored
#    through Xi+9 and Xi+1.
#  - When p6 is clear, pr and ar.lc are restored and the routine returns.
#  - rem_4bit: sixteen 8-byte constants (128 bytes) aligned to 128 bytes;
#    loop() depends on that alignment when it forms table addresses with dep.
#  - A stringz identification string closes the output.
176$code.=<<___;
177{ .mib; xor Zhi=Zhi,Hhi // modulo-scheduling artefact
178 extr.u xi[2]=Zlo,0,8 } // Xi[15]
179{ .mib; cmp.ltu p6,p0=inp,end // are we done?
180 add inp=32,inp // advance inp
181 clrrrb.pr };;
182{ .mii;
183(p6) ld1 in[2]=[inp],-1 // inp[15]
184(p6) extr.u xi[1]=Zlo,8,8 // Xi[14]
185(p6) mov ar.lc=13 };;
186{ .mii;
187(p6) ld1 in[1]=[inp],-1 // inp[14]
188(p6) mov ar.ec=3
189 mux1 Zlo=Zlo,\@rev };;
190{ .mii;
191(p6) xor xi[2]=xi[2],in[2]
192 mux1 Zhi=Zhi,\@rev };;
193{ .mii;
194(p6) shladd Hi[1]=xi[2],4,r0
195 add Hlo=9,Xi // Xi is &Xi[-1]
196 add Hhi=1,Xi };;
197{ .mii;
198(p6) and Hi[1]=mask0xf0,Hi[1]
199(p6) add Xi=14,Xi // &Xi[13]
200(p6) mov pr.rot=0x7<<16 };;
201
202{ .mii; st8 [Hlo]=Zlo
203(p6) xor Zlo=Zlo,Zlo
204(p6) add Hi[1]=Htbl,Hi[1] };;
205{ .mib; st8 [Hhi]=Zhi
206(p6) xor Zhi=Zhi,Zhi
207(p6) br.cond.dptk.many .LoopN };;
208
209{ .mib; mov pr=prevpr,-2 }
210{ .mib; mov ar.lc=prevlc
211 br.ret.sptk.many b0 };;
212.endp gcm_ghash_4bit#
213
214.align 128;;
215.type rem_4bit#,\@object
216rem_4bit:
217 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
218 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
219 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
220 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
221.size rem_4bit#,128
222stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
223___
224
# For big-endian builds every "mux1 <operands>,@rev" instruction in the
# generated text is replaced by "nop.i 0x0", keeping the original whitespace
# after the mnemonic so the bundle layout is unchanged.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);

# Emit the assembly. STDOUT is buffered and may be a file opened at the top of
# the script, so a write error can surface as late as close; both steps are
# checked so that a truncated output file makes the build fail instead of
# being assembled.
print($code) or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";