#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized in quadruples are:
#
# [4][4] [3][3] [2][2] [1][1]<-+
# [0][4] [0][3] [0][2] [0][1]<-+
# [3][0] [1][0] [4][0] [2][0] |
# [4][3] [3][1] [2][4] [1][2] |
# [3][4] [1][3] [4][2] [2][1] |
# [2][3] [4][1] [1][4] [3][2] |
# [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
# [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen layout.
# Note that the first step is permutation-free.] A[0][0] is loaded into a
# register of its own, to all lanes. [A[0][0] is not part of the Pi
# permutation or Rho.] Digits in variables' names denote the right-most
# coordinates:

my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
    $A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
    $A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
    $A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
    $A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
    $A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
    $A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
@A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
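
# For example, A[1][2] sits in $A31 (%ymm3) lane 1, so its slot in the
# 7x32-byte transfer area is @A_jagged[1*5+2] = 8*(3*4+1) = 104 bytes past
# the base. A quick sanity check that the mapping gives all 25 lanes
# distinct slots (an editorial cross-check only; the generator does not
# rely on it):

{ my %seen; $seen{$_}++ for @A_jagged; die "A_jagged collision" if keys(%seen) != 25; }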

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
# [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
# [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
# [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
# [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#               r=1088(*)
#
# Haswell       8.7/+10%
# Skylake       7.8/+20%
# Ryzen         17(**)
#
# (*)  Corresponds to SHA3-256. The percentage after the slash is the
#      improvement coefficient relative to the scalar keccak1600-x86_64.pl.
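#      (For reference, r=1088 is the SHA3-256 rate: capacity 2*256 = 512
#      bits, so 1600 - 512 = 1088 bits, i.e. 136-byte blocks per call to
#      the permutation.)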
# (**) Ryzen is expected to perform poorly, because its instruction issue
#      rate is limited to two AVX2 instructions per cycle and, in addition,
#      vpblendd is reportedly bound to a specific port. Obviously this code
#      path should not be executed on Ryzen.

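# The composed vpermq immediates quoted in the comments above can be
# cross-checked mechanically: vpermq writes source lane ((imm >> 2*j) & 3)
# into destination lane j, so applying selector $a and then selector $b is
# a single selector whose field j is field b_j of $a. The helper below is
# an editorial sketch added for illustration; the generated code does not
# depend on it.

sub compose_vpermq {
    my ($a, $b) = @_;                   # $a applied first, then $b
    my $c = 0;
    for my $j (0 .. 3) {
        my $bj = ($b >> 2*$j) & 3;      # lane of the intermediate result
        $c |= (($a >> 2*$bj) & 3) << 2*$j;
    }
    return $c;
}
die "vpermq composition mismatch"
    if compose_vpermq(0b01110010, 0b00011011) != 0b10001101
    or compose_vpermq(0b10001101, 0b11100100) != 0b10001101
    or compose_vpermq(0b01110010, 0b01110010) != 0b00011011
    or compose_vpermq(0b00011011, 0b10001101) != 0b01110010;
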
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea rhotates_left+96(%rip),%r8
lea rhotates_right+96(%rip),%r9
lea iotas(%rip),%r10
mov \$24,%eax
jmp .Loop_avx2

.align 32
.Loop_avx2:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
vpxor $A31,$A41,$C14
vpxor $A11,$A21,@T[2]
vpxor $A01,$C14,$C14
vpxor @T[2],$C14,$C14 # C[1..4]

vpermq \$0b10010011,$C14,@T[4]
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]

vpsrlq \$63,$C14,@T[1]
vpaddq $C14,$C14,@T[2]
vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)

vpermq \$0b00111001,@T[1],$D14
vpxor @T[4],@T[1],$D00
vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]

vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]

vpsrlq \$63,$C00,@T[0]
vpaddq $C00,$C00,@T[1]
vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)

vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D00,$A00,$A00 # ^= D[0..0]

vpblendd \$0b11000000,@T[1],$D14,$D14
vpblendd \$0b00000011,$C00,@T[4],@T[4]
vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

######################################### Rho + Pi + pre-Chi shuffle
vpsllvq 0*32-96(%r8),$A20,@T[3]
vpsrlvq 0*32-96(%r9),$A20,$A20
vpor @T[3],$A20,$A20

vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
vpsllvq 2*32-96(%r8),$A31,@T[4]
vpsrlvq 2*32-96(%r9),$A31,$A31
vpor @T[4],$A31,$A31

vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
vpsllvq 3*32-96(%r8),$A21,@T[5]
vpsrlvq 3*32-96(%r9),$A21,$A21
vpor @T[5],$A21,$A21

vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
vpsllvq 4*32-96(%r8),$A41,@T[6]
vpsrlvq 4*32-96(%r9),$A41,$A41
vpor @T[6],$A41,$A41

vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
vpsllvq 5*32-96(%r8),$A11,@T[7]
vpsrlvq 5*32-96(%r9),$A11,@T[1]
vpor @T[7],@T[1],@T[1] # $A11 -> future $A01

vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
vpsllvq 1*32-96(%r8),$A01,@T[8]
vpsrlvq 1*32-96(%r9),$A01,@T[2]
vpor @T[8],@T[2],@T[2] # $A01 -> future $A20

######################################### Chi
vpsrldq \$8,@T[1],@T[7]
vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]

vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]

vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
vpxor @T[3],$A31,$A31
vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
vpxor @T[5],$A41,$A41
vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
vpxor @T[6],$A11,$A11

vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]

vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
vpxor @T[2],$A20,$A20

vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
vpermq \$0b10001101,$A41,$A41
vpermq \$0b01110010,$A11,$A11

vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]

vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor @T[4],$A21,$A21

######################################### Iota
vpxor (%r10),$A00,$A00
lea 32(%r10),%r10

dec %eax
jnz .Loop_avx2

ret
.size __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp; # in squeeze

$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11

lea -240(%rsp),%rsp
and \$-32,%rsp

lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 96(%rsp),%r10

vzeroupper

vpbroadcastq -96($A_flat),$A00 # load A[0][0] to all lanes
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11

vpxor @T[0],@T[0],@T[0]
vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
vmovdqa @T[0],32*3-96(%r10)
vmovdqa @T[0],32*4-96(%r10)
vmovdqa @T[0],32*5-96(%r10)
vmovdqa @T[0],32*6-96(%r10)

.Loop_absorb_avx2:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx2

shr \$3,%eax
vpbroadcastq 0-96($inp),@T[0]
vmovdqu 8-96($inp),@T[1]
sub \$4,%eax
___
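# The unrolled sequence generated below scatters block lanes 5 and up (i.e.
# everything past A[0][0..4], which the vpbroadcastq/vmovdqu above already
# picked up) from the input into the transfer area at their jagged offsets.
# %eax counts the lanes left in the block, so only lanes 5..$bsz/8-1 are
# actually stored; slots past the block length keep the zeros written above.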
for(my $i=5; $i<25; $i++) {
$code.=<<___
dec %eax
jz .Labsorved_avx2
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx2:
lea ($inp,$bsz),$inp

vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor 32*2-96(%r10),$A20,$A20
vpxor 32*3-96(%r10),$A31,$A31
vpxor 32*4-96(%r10),$A21,$A21
vpxor 32*5-96(%r10),$A41,$A41
vpxor 32*6-96(%r10),$A11,$A11

call __KeccakF1600

lea 96(%rsp),%r10
jmp .Loop_absorb_avx2

.Ldone_absorb_avx2:
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)

vzeroupper

lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb

.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11

lea 96($A_flat),$A_flat
shr \$3,$bsz

vzeroupper

vpbroadcastq -96($A_flat),$A00
vpxor @T[0],@T[0],@T[0]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11

mov $bsz,%rax

.Loop_squeeze_avx2:
mov @A_jagged[0]-96($A_flat),%r8
___
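# Between calls to __KeccakF1600 the state sits in $A_flat in exactly the
# layout stored by .Ldone_absorb_avx2 above: A[0][0] at -96($A_flat),
# followed by the six 32-byte register images packed back to back. For the
# packed registers a lane with jagged offset 32*reg+8*lane therefore lives
# at 8+32*(reg-1)+8*lane-96($A_flat) = jagged-120, which is why the loads
# generated below use @A_jagged[...]-120 while A[0][0] above is fetched at
# plain -96.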
for (my $i=0; $i<25; $i++) {
$code.=<<___;
sub \$8,$len
jc .Ltail_squeeze_avx2
mov %r8,($out)
lea 8($out),$out
je .Ldone_squeeze_avx2
dec %eax
je .Lextend_output_avx2
mov @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
call __KeccakF1600

vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)

mov $bsz,%rax
jmp .Loop_squeeze_avx2


.Ltail_squeeze_avx2:
add \$8,$len
.Loop_tail_avx2:
mov %r8b,($out)
lea 1($out),$out
shr \$8,%r8
dec $len
jnz .Loop_tail_avx2

.Ldone_squeeze_avx2:
vzeroupper

lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze

.align 64
rhotates_left:
.quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
.quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
.quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
.quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
.quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
.quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
.quad 64-3, 64-18, 64-36, 64-41
.quad 64-1, 64-62, 64-28, 64-27
.quad 64-45, 64-6, 64-56, 64-39
.quad 64-10, 64-61, 64-55, 64-8
.quad 64-2, 64-15, 64-25, 64-20
.quad 64-44, 64-43, 64-21, 64-14
iotas:
.quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
.quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
.quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
.quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
.quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
.quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
.quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
.quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
.quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
.quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
.quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
.quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
.quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
.quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
.quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";