]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/modes/asm/ghash-ia64.pl
Update copyright year
[thirdparty/openssl.git] / crypto / modes / asm / ghash-ia64.pl
1 #! /usr/bin/env perl
2 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # March 2010
18 #
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
21 # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
22 # GHASH performance was measured to be 6.67 cycles per processed byte
23 # on Itanium 2, which is >90% better than Microsoft compiler generated
24 # code. To anchor to something else sha1-ia64.pl module processes one
25 # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
26 # byte.
27
28 # September 2010
29 #
30 # It was originally thought that it makes lesser sense to implement
31 # "528B" variant on Itanium 2 for following reason. Because number of
32 # functional units is naturally limited, it appeared impossible to
33 # implement "528B" loop in 4 cycles, only in 5. This would mean that
34 # theoretically performance improvement couldn't be more than 20%.
35 # But occasionally you prove yourself wrong:-) I figured out a way to
36 # fold couple of instructions and having freed yet another instruction
37 # slot by unrolling the loop... Resulting performance is 4.45 cycles
38 # per processed byte and 50% better than "256B" version. On original
39 # Itanium performance should remain the same as the "256B" version,
40 # i.e. ~8.5 cycles.
41
42 $output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
43
44 if ($^O eq "hpux") {
45 $ADDP="addp4";
46 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
47 } else { $ADDP="add"; }
48 for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
49 $big_endian=0 if (/\-DL_ENDIAN/); }
50 if (!defined($big_endian))
51 { $big_endian=(unpack('L',pack('N',1))==1); }
52
53 sub loop() {
54 my $label=shift;
55 my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
56
57 # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
58 # in scalable manner;-) Naturally assuming data in L1 cache...
59 # Special note about 'dep' instruction, which is used to construct
60 # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
61 # bytes boundary and lower 7 bits of its address are guaranteed to
62 # be zero.
63 $code.=<<___;
64 $label:
65 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
66 (p19) dep rem=Zlo,rem_4bitp,3,4 }
67 { .mfi; (p19) xor Zhi=Zhi,Hhi
68 ($p17) xor xi[1]=xi[1],in[1] };;
69 { .mfi; (p18) ld8 Hhi=[Hi[1]]
70 (p19) shrp Zlo=Zhi,Zlo,4 }
71 { .mfi; (p19) ld8 rem=[rem]
72 (p18) and Hi[1]=mask0xf0,xi[2] };;
73 { .mmi; ($p16) ld1 in[0]=[inp],-1
74 (p18) xor Zlo=Zlo,Hlo
75 (p19) shr.u Zhi=Zhi,4 }
76 { .mib; (p19) xor Hhi=Hhi,rem
77 (p18) add Hi[1]=Htbl,Hi[1] };;
78
79 { .mfi; (p18) ld8 Hlo=[Hi[1]],-8
80 (p18) dep rem=Zlo,rem_4bitp,3,4 }
81 { .mfi; (p17) shladd Hi[0]=xi[1],4,r0
82 (p18) xor Zhi=Zhi,Hhi };;
83 { .mfi; (p18) ld8 Hhi=[Hi[1]]
84 (p18) shrp Zlo=Zhi,Zlo,4 }
85 { .mfi; (p18) ld8 rem=[rem]
86 (p17) and Hi[0]=mask0xf0,Hi[0] };;
87 { .mmi; (p16) ld1 xi[0]=[Xi],-1
88 (p18) xor Zlo=Zlo,Hlo
89 (p18) shr.u Zhi=Zhi,4 }
90 { .mib; (p18) xor Hhi=Hhi,rem
91 (p17) add Hi[0]=Htbl,Hi[0]
92 br.ctop.sptk $label };;
93 ___
94 }
95
96 $code=<<___;
97 .explicit
98 .text
99
100 prevfs=r2; prevlc=r3; prevpr=r8;
101 mask0xf0=r21;
102 rem=r22; rem_4bitp=r23;
103 Xi=r24; Htbl=r25;
104 inp=r26; end=r27;
105 Hhi=r28; Hlo=r29;
106 Zhi=r30; Zlo=r31;
107
108 .align 128
109 .skip 16 // aligns loop body
110 .global gcm_gmult_4bit#
111 .proc gcm_gmult_4bit#
112 gcm_gmult_4bit:
113 .prologue
114 { .mmi; .save ar.pfs,prevfs
115 alloc prevfs=ar.pfs,2,6,0,8
116 $ADDP Xi=15,in0 // &Xi[15]
117 mov rem_4bitp=ip }
118 { .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
119 .save ar.lc,prevlc
120 mov prevlc=ar.lc
121 .save pr,prevpr
122 mov prevpr=pr };;
123
124 .body
125 .rotr in[3],xi[3],Hi[2]
126
127 { .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
128 mov mask0xf0=0xf0
129 brp.loop.imp .Loop1,.Lend1-16};;
130 { .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
131 };;
132 { .mii; shladd Hi[1]=xi[2],4,r0
133 mov pr.rot=0x7<<16
134 mov ar.lc=13 };;
135 { .mii; and Hi[1]=mask0xf0,Hi[1]
136 mov ar.ec=3
137 xor Zlo=Zlo,Zlo };;
138 { .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
139 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
140 xor Zhi=Zhi,Zhi };;
141 ___
142 &loop (".Loop1",1);
143 $code.=<<___;
144 .Lend1:
145 { .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
146 { .mib; mux1 Zlo=Zlo,\@rev };;
147 { .mib; mux1 Zhi=Zhi,\@rev };;
148 { .mmi; add Hlo=9,Xi;; // ;; is here to prevent
149 add Hhi=1,Xi };; // pipeline flush on Itanium
150 { .mib; st8 [Hlo]=Zlo
151 mov pr=prevpr,0x1ffff };;
152 { .mib; st8 [Hhi]=Zhi
153 mov ar.lc=prevlc
154 br.ret.sptk.many b0 };;
155 .endp gcm_gmult_4bit#
156 ___
157
158 ######################################################################
159 # "528B" (well, "512B" actually) streamed GHASH
160 #
161 $Xip="in0";
162 $Htbl="in1";
163 $inp="in2";
164 $len="in3";
165 $rem_8bit="loc0";
166 $mask0xff="loc1";
167 ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
168
169 sub load_htable() {
170 for (my $i=0;$i<8;$i++) {
171 $code.=<<___;
172 { .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
173 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
174 { .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
175 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
176 ___
177 $code.=shift if (($i+$#_)==7);
178 $code.="\t};;\n"
179 }
180 }
181
182 $code.=<<___;
183 prevsp=r3;
184
185 .align 32
186 .skip 16 // aligns loop body
187 .global gcm_ghash_4bit#
188 .proc gcm_ghash_4bit#
189 gcm_ghash_4bit:
190 .prologue
191 { .mmi; .save ar.pfs,prevfs
192 alloc prevfs=ar.pfs,4,2,0,0
193 .vframe prevsp
194 mov prevsp=sp
195 mov $rem_8bit=ip };;
196 .body
197 { .mfi; $ADDP r8=0+0,$Htbl
198 $ADDP r9=0+8,$Htbl }
199 { .mfi; $ADDP r10=128+0,$Htbl
200 $ADDP r11=128+8,$Htbl };;
201 ___
202 &load_htable(
203 " $ADDP $Xip=15,$Xip", # &Xi[15]
204 " $ADDP $len=$len,$inp", # &inp[len]
205 " $ADDP $inp=15,$inp", # &inp[15]
206 " mov $mask0xff=0xff",
207 " add sp=-512,sp",
208 " andcm sp=sp,$mask0xff", # align stack frame
209 " add r14=0,sp",
210 " add r15=8,sp");
211 $code.=<<___;
212 { .mmi; $sum 1<<1 // go big-endian
213 add r8=256+0,sp
214 add r9=256+8,sp }
215 { .mmi; add r10=256+128+0,sp
216 add r11=256+128+8,sp
217 add $len=-17,$len };;
218 ___
219 for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
220 my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
221 $code.=<<___;
222 { .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
223 st8 [r9]=$rhi,16 // Htable[$i].hi
224 shrp $rlo=$rhi,$rlo,4 }//;;
225 { .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
226 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
227 shr.u $rhi=$rhi,4 };;
228 { .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
229 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
230 ___
231 }
232 $code.=<<___;
233 { .mmi; ld8 r16=[r8],16 // Htable[8].lo
234 ld8 r17=[r9],16 };; // Htable[8].hi
235 { .mmi; ld8 r18=[r8],16 // Htable[9].lo
236 ld8 r19=[r9],16 } // Htable[9].hi
237 { .mmi; rum 1<<5 // clear um.mfh
238 shrp r16=r17,r16,4 };;
239 ___
240 for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
241 $code.=<<___;
242 { .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
243 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
244 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248 ___
249 }
250 $code.=<<___;
251 { .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
252 { .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
253 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
254 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
255 { .mmi; add $Htbl=256,sp // &Htable[0]
256 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
257 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
258 { .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
259 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
260 ___
261
262 $in="r15";
263 @xi=("r16","r17");
264 @rem=("r18","r19");
265 ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
266 ($Atbl,$Btbl)=("r26","r27");
267
268 $code.=<<___; # (p16)
269 { .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
270 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
271 cmp.eq p0,p6=r0,r0 };; // clear p6
272 ___
273 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
274
275 $code.=<<___; # (p16),(p17)
276 { .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
277 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
278 { .mii; ld1 $in=[$inp],-1 //(p16) *inp--
279 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
280 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
281 .align 32
282 .LOOP:
283 { .mmi;
284 (p6) st8 [$Xip]=$Zhi,13
285 xor $Zlo=$Zlo,$Zlo
286 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
287 ___
288 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
289
290 $code.=<<___; # (p16),(p17),(p18)
291 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
292 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
293 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
294 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
295 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
296 { .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
297 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
298 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
299 ld1 $in=[$inp],-1 } //(p16) *inp--
300 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
301 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
302 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
303 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
304 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
305 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
306 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
307 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
308 ___
309 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
310
311 for ($i=1;$i<14;$i++) {
312 # Above and below fragments are derived from this one by removing
313 # unsuitable (p??) instructions.
314 $code.=<<___; # (p16),(p17),(p18),(p19)
315 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
316 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
317 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
318 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
319 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
320 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
321 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
322 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
323 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
324 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
325 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
326 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
327 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
328 ld1 $in=[$inp],-1 //(p16) *inp--
329 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
330 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
331 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
332 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
333 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
334 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
335 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
336 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
337 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
338 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
339 ___
340 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
341 }
342
343 $code.=<<___; # (p17),(p18),(p19)
344 { .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
345 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
346 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
347 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
348 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
349 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
350 { .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
351 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
352 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
353 { .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
354 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
355 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
356 { .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
357 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
358 { .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
359 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
360 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
361 { .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
362 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
363 { .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
364 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
365 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
366 ___
367 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
368
369 $code.=<<___; # (p18),(p19)
370 { .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
371 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
372 { .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
373 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
374 { .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
375 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
376 { .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
377 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
378 { .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
379 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
380 { .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
381 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
382 { .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
383 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
384 { .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
385 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
386 ___
387 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
388
389 $code.=<<___; # (p19)
390 { .mmi; cmp.ltu p6,p0=$inp,$len
391 add $inp=32,$inp
392 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
393 { .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
394 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
395 add $Xip=9,$Xip };; // &Xi.lo
396 { .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
397 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
398 (p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
399 { .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
400 (p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
401 { .mmi; st8 [$Xip]=$Zlo,-8
402 (p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
403 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
404 { .mmi;
405 (p6) ld1 $in=[$inp],-1 //[p16] *inp--
406 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
407 (p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
408 { .mib;
409 (p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
410 (p6) br.cond.dptk.many .LOOP };;
411
412 { .mib; st8 [$Xip]=$Zhi };;
413 { .mib; $rum 1<<1 // return to little-endian
414 .restore sp
415 mov sp=prevsp
416 br.ret.sptk.many b0 };;
417 .endp gcm_ghash_4bit#
418 ___
419 $code.=<<___;
420 .align 128
421 .type rem_4bit#,\@object
422 rem_4bit:
423 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
424 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
425 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
426 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
427 .size rem_4bit#,128
428 .type rem_8bit#,\@object
429 rem_8bit:
430 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
431 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
432 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
433 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
434 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
435 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
436 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
437 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
438 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
439 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
440 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
441 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
442 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
443 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
444 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
445 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
446 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
447 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
448 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
449 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
450 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
451 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
452 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
453 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
454 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
455 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
456 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
457 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
458 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
459 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
460 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
461 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
462 .size rem_8bit#,512
463 stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
464 ___
465
466 $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
467 $code =~ s/\`([^\`]*)\`/eval $1/gem;
468
469 print $code;
470 close STDOUT or die "error closing STDOUT: $!";