]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/modes/asm/ghash-ia64.pl
Update copyright year
[thirdparty/openssl.git] / crypto / modes / asm / ghash-ia64.pl
CommitLineData
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else, the sha1-ia64.pl module processes
# one byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes less sense to implement the
# "528B" variant on Itanium 2 for the following reason. Because the
# number of functional units is naturally limited, it appeared impossible
# to implement the "528B" loop in 4 cycles, only in 5. This would mean
# that theoretically the performance improvement couldn't be more than
# 20%. But occasionally you prove yourself wrong:-) I figured out a way
# to fold a couple of instructions and freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than the "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

# The last command-line argument, if present, names the output file;
# STDOUT is redirected to it so that the final "print $code" lands there.
# Three-argument open keeps the file name from being parsed as a mode.
$output=pop and (open STDOUT,">",$output or die "can't open $output: $!");

# $ADDP is the mnemonic used below for pointer arithmetic on incoming
# arguments.  On HP-UX it is addp4 unless one of the remaining arguments
# contains "+DD64" or "-mlp64" (presumably the compiler switches that
# select the 64-bit data model -- confirm against the build system), in
# which case, as on every other OS, plain add is used.  The alternation
# is grouped explicitly: the former /[\+DD|\-mlp]64/ was a character
# class and matched any single one of "+D|-mlp" followed by "64".
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Endianness: an explicit -DB_ENDIAN/-DL_ENDIAN among the arguments wins
# (when both appear, -DL_ENDIAN takes precedence within one argument and
# the last argument wins across arguments); otherwise fall back to the
# byte order of the perl running this script.
for (@ARGV) {
    $big_endian=1 if (/\-DB_ENDIAN/);
    $big_endian=0 if (/\-DL_ENDIAN/);
}
if (!defined($big_endian))
{   $big_endian=(unpack('L',pack('N',1))==1);   }

# loop(LABEL, MASK_INP)
#
# Appends one copy of the software-pipelined "4-bit" multiplication loop
# to $code.  LABEL is emitted as the loop's entry label and is the target
# of the closing br.ctop.  When MASK_INP is true the two instructions
# that reference inp/in[] are predicated on p63 instead of p16/p17; this
# is how the caller masks references to inp (p63 is presumably never set
# while the loop runs -- confirm).
#
# The sub deliberately carries no prototype: it takes two arguments, and
# the former empty prototype "()" only went unnoticed because the caller
# uses the prototype-bypassing &loop(...) form.
sub loop {
    my ($label,$mask_inp)=@_;
    my ($p16,$p17)=$mask_inp?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
 ($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
 (p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
 (p18) xor Zlo=Zlo,Hlo
 (p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
 (p18) add Hi[1]=Htbl,Hi[1] };;

{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
 (p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
 (p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
 (p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
 (p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
 (p18) xor Zlo=Zlo,Hlo
 (p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
 (p17) add Hi[0]=Htbl,Hi[0]
 br.ctop.sptk $label };;
___
}

# Start the output: assembler register aliases, then gcm_gmult_4bit.
# The routine consists of the prologue emitted here, the pipelined loop
# body produced by loop() with inp references masked, and an epilogue
# that applies mux1 @rev to Zlo/Zhi (turned into a nop for big-endian
# builds by the post-processing at the end of the script) and stores
# them through pointers derived from Xi.
# Note that this is a plain assignment, i.e. it (re)initializes $code.
$code=<<___;
.explicit
.text

prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;

.align 128
.skip 16 // aligns loop body
.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
gcm_gmult_4bit:
 .prologue
{ .mmi; .save ar.pfs,prevfs
 alloc prevfs=ar.pfs,2,6,0,8
 $ADDP Xi=15,in0 // &Xi[15]
 mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
 .save ar.lc,prevlc
 mov prevlc=ar.lc
 .save pr,prevpr
 mov prevpr=pr };;

 .body
 .rotr in[3],xi[3],Hi[2]

{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
 mov mask0xf0=0xf0
 brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
 };;
{ .mii; shladd Hi[1]=xi[2],4,r0
 mov pr.rot=0x7<<16
 mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
 mov ar.ec=3
 xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
 xor Zhi=Zhi,Zhi };;
___
    # Second argument true: there is no input stream here, so the loop's
    # references to inp are masked.
    &loop (".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
 add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
 mov pr=prevpr,0x1ffff };;
{ .mib; st8 [Hhi]=Zhi
 mov ar.lc=prevlc
 br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Symbolic names for the registers gcm_ghash_4bit works with: its four
# input arguments and two locals.
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
# Instructions used to enter/leave big-endian mode around the routine;
# on big-endian builds both degenerate to nop.m.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

# load_htable(INSN, ...)
#
# Appends eight instruction groups to $code.  Group $i loads one Htable
# entry through r8/r9 into general registers r(16+2*$i)/r(17+2*$i) and
# one through r10/r11 into floating-point registers f(32+2*$i)/
# f(33+2*$i).  The optional INSN strings are folded into the second
# bundle of the groups, one per group, aligned to the END of the
# sequence: with eight strings every group gets one, with fewer only the
# last ones do.  With more than eight strings none is ever emitted.
#
# No prototype on purpose: the sub takes a list, and the former empty
# prototype "()" was only harmless because the caller uses &load_htable.
sub load_htable {
    my @extra=@_;
    for (my $i=0;$i<8;$i++) {
 $code.=<<___;
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
___
 # consume one extra instruction as soon as the number left equals
 # the number of groups left
 $code.=shift(@extra) if (($i+$#extra)==7);
 $code.="\t};;\n"
    }
}

# gcm_ghash_4bit, part 1: prologue and table set-up.  The code emitted
# here loads Htable into registers (load_htable), carves an aligned
# 512-byte frame out of the stack, stores Htable at sp+256 and a copy
# with every entry shifted right by 4 bits ("Hshr4") at sp+0.
$code.=<<___;
prevsp=r3;

.align 32
.skip 16 // aligns loop body
.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
gcm_ghash_4bit:
 .prologue
{ .mmi; .save ar.pfs,prevfs
 alloc prevfs=ar.pfs,4,2,0,0
 .vframe prevsp
 mov prevsp=sp
 mov $rem_8bit=ip };;
 .body
{ .mfi; $ADDP r8=0+0,$Htbl
 $ADDP r9=0+8,$Htbl }
{ .mfi; $ADDP r10=128+0,$Htbl
 $ADDP r11=128+8,$Htbl };;
___
 # The eight instructions below are interleaved with the table loads,
 # one per load group.
 &load_htable(
 " $ADDP $Xip=15,$Xip", # &Xi[15]
 " $ADDP $len=$len,$inp", # &inp[len]
 " $ADDP $inp=15,$inp", # &inp[15]
 " mov $mask0xff=0xff",
 " add sp=-512,sp",
 " andcm sp=sp,$mask0xff", # align stack frame
 " add r14=0,sp",
 " add r15=8,sp");
$code.=<<___;
{ .mmi; $sum 1<<1 // go big-endian
 add r8=256+0,sp
 add r9=256+8,sp }
{ .mmi; add r10=256+128+0,sp
 add r11=256+128+8,sp
 add $len=-17,$len };;
___
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
# plain arithmetic; the former string eval() of a number was redundant
my ($rlo,$rhi)=("r".(16+2*$i),"r".(16+2*$i+1));
$code.=<<___;
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
 st8 [r9]=$rhi,16 // Htable[$i].hi
 shrp $rlo=$rhi,$rlo,4 }//;;
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
 shr.u $rhi=$rhi,4 };;
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
 ld8 r17=[r9],16 };; // Htable[8].hi
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
 ld8 r19=[r9],16 } // Htable[9].hi
{ .mmi; rum 1<<5 // clear um.mfh
 shrp r16=r17,r16,4 };;
___
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
___
}
# NB: $i is deliberately NOT lexical -- the tail below relies on the
# value (6) it is left with by the loop above.
$code.=<<___;
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
{ .mmi; add $Htbl=256,sp // &Htable[0]
 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
___

# gcm_ghash_4bit, part 2: register assignment and pipeline fill.
# The main loop is unrolled by hand; the (pNN) tags in the assembler
# comments name the pipeline stage each instruction belongs to.  @xi and
# @rem are two-element register "rings": after every emitted stage they
# are rotated so that index [0] and [1] swap roles.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___; # (p16)
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
 cmp.eq p0,p6=r0,r0 };; // clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___; # (p16),(p17)
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
.align 32
.LOOP:
{ .mmi;
(p6) st8 [$Xip]=$Zhi,13
 xor $Zlo=$Zlo,$Zlo
 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___; # (p16),(p17),(p18)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
 ld1 $in=[$inp],-1 } //(p16) *inp--
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

# gcm_ghash_4bit, part 3: thirteen copies of the full pipeline stage
# (all of p16..p19 active), with the @xi/@rem rings rotated after each.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___; # (p16),(p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
 ld1 $in=[$inp],-1 //(p16) *inp--
 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}

# gcm_ghash_4bit, part 4: pipeline drain.  Each fragment has one more
# leading stage removed; the final one also decides via p6 whether more
# input remains, restarts the pipeline for the next 16-byte block and
# branches back to .LOOP, or else falls through to the epilogue.
$code.=<<___; # (p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___; # (p18),(p19)
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers

$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
 add $inp=32,$inp
 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
 add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;

{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
 .restore sp
 mov sp=prevsp
 br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___
# Constant tables shared by both routines: rem_4bit (16 8-byte entries,
# 128-byte aligned -- the 'dep' trick in loop() depends on that
# alignment) and rem_8bit (256 2-byte entries, emitted byte by byte).
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated assembler text and write it out.

# On big-endian builds every "mux1 <operands>,@rev" is replaced by a nop
# occupying the same slot; the whitespace after the mnemonic is kept.
if ($big_endian) {
    $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm;
}

# Replace every `expr` with the value perl computes for expr (the
# register numbers and table indices written as arithmetic above).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close time, hence the check.
close STDOUT or die "error closing STDOUT: $!";