]>
Commit | Line | Data |
---|---|---|
313fa47f | 1 | #!/usr/bin/env perl |
b0edda11 | 2 | # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved. |
313fa47f | 3 | # |
a598ed0d | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
313fa47f AP |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | # | |
9 | # ==================================================================== | |
10 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
11 | # project. The module is, however, dual licensed under OpenSSL and | |
12 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
13 | # details see http://www.openssl.org/~appro/cryptogams/. | |
14 | # ==================================================================== | |
15 | # | |
16 | # Keccak-1600 for AVX-512F. | |
17 | # | |
18 | # July 2017. | |
19 | # | |
20 | # The code below is the KECCAK_1X_ALT implementation (see sha/keccak1600.c). | |
21 | # It is pretty straightforward; the only "magic" is the data layout in registers. | |
22 | # It's impossible to have one layout that is optimal for every step, hence | |
e3c79f0f AP |
23 | # it's changing as algorithm progresses. Data is saved in linear order, |
24 | # but in-register order morphs between rounds. Even rounds take in | |
25 | # linear layout, and odd rounds - transposed, or "vertically-shaped"... | |
313fa47f AP |
26 | # |
27 | ######################################################################## | |
28 | # Numbers are cycles per processed byte out of large message. | |
29 | # | |
30 | # r=1088(*) | |
31 | # | |
e3c79f0f AP |
32 | # Knights Landing 7.6 |
33 | # Skylake-X 5.7 | |
313fa47f AP |
34 | # |
35 | # (*) Corresponds to SHA3-256. | |
36 | ||
37 | ######################################################################## | |
e3c79f0f AP |
38 | # The code below combines two ideas. One is taken from the Keccak Code |
39 | # Package, hereafter KCP, and the other from the initial version of this | |
40 | # module. What they share is the observation that Pi's input and output are | |
41 | # "mostly transposed", i.e. if input is aligned by x coordinate, then | |
42 | # output is [mostly] aligned by y. Both versions, KCP and predecessor, | |
43 | # were trying to use one of them from round to round, which resulted in | |
44 | # some kind of transposition in each round. This version still does | |
45 | # transpose data, but only every second round. Another essential factor | |
46 | # is that the KCP transposition has to be performed with instructions that | |
47 | # turned out to be rather expensive on Knights Landing, both latency- and | |
48 | # throughput-wise. Not to mention that some of them have to depend on | |
49 | # each other. On the other hand, the initial version of this module | |
50 | # relied heavily on blend instructions. There were lots of them, | |
51 | # resulting in higher instruction count, yet it performed better on | |
52 | # Knights Landing, because the processor can execute a pair of them each | |
53 | # cycle and they have minimal latency. This module is an attempt to | |
54 | # bring the best parts together:-) | |
55 | # | |
56 | # Coordinates below correspond to those in sha/keccak1600.c. Input | |
57 | # layout is straight linear: | |
58 | # | |
59 | # [0][4] [0][3] [0][2] [0][1] [0][0] | |
60 | # [1][4] [1][3] [1][2] [1][1] [1][0] | |
61 | # [2][4] [2][3] [2][2] [2][1] [2][0] | |
62 | # [3][4] [3][3] [3][2] [3][1] [3][0] | |
63 | # [4][4] [4][3] [4][2] [4][1] [4][0] | |
64 | # | |
65 | # It's perfect for Theta, while Pi is reduced to intra-register | |
66 | # permutations which yield layout perfect for Chi: | |
67 | # | |
68 | # [4][0] [3][0] [2][0] [1][0] [0][0] | |
69 | # [4][1] [3][1] [2][1] [1][1] [0][1] | |
70 | # [4][2] [3][2] [2][2] [1][2] [0][2] | |
71 | # [4][3] [3][3] [2][3] [1][3] [0][3] | |
72 | # [4][4] [3][4] [2][4] [1][4] [0][4] | |
73 | # | |
74 | # Now instead of performing full transposition and feeding it to next | |
75 | # identical round, we perform kind of diagonal transposition to layout | |
76 | # from initial version of this module, and make it suitable for Theta: | |
313fa47f AP |
77 | # |
78 | # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0] | |
79 | # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0] | |
80 | # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0] | |
81 | # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0] | |
82 | # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0] | |
83 | # | |
e3c79f0f AP |
84 | # Now intra-register permutations yield initial [almost] straight |
85 | # linear layout: | |
313fa47f | 86 | # |
e3c79f0f | 87 | # [4][4] [3][3] [2][2] [1][1] [0][0] |
313fa47f | 88 | ##[0][4] [0][3] [0][2] [0][1] [0][0] |
e3c79f0f | 89 | # [3][4] [2][3] [1][2] [0][1] [4][0] |
313fa47f | 90 | ##[2][3] [2][2] [2][1] [2][0] [2][4] |
e3c79f0f | 91 | # [2][4] [1][3] [0][2] [4][1] [3][0] |
313fa47f | 92 | ##[4][2] [4][1] [4][0] [4][4] [4][3] |
e3c79f0f | 93 | # [1][4] [0][3] [4][2] [3][1] [2][0] |
313fa47f | 94 | ##[1][1] [1][0] [1][4] [1][3] [1][2] |
e3c79f0f | 95 | # [0][4] [4][3] [3][2] [2][1] [1][0] |
313fa47f AP |
96 | ##[3][0] [3][4] [3][3] [3][2] [3][1] |
97 | # | |
e3c79f0f AP |
98 | # This means that the odd round Chi is performed in a less suitable layout, |
99 | # with a number of additional permutations. But overall it turned out to be | |
100 | # a win. Permutations are the fastest possible on Knights Landing and they | |
101 | # are laid out to be independent of each other. In essence I traded | |
102 | # 20 blend instructions for 3 permutations. The result is 13% faster | |
103 | # than KCP on Skylake-X, and >40% on Knights Landing. | |
313fa47f | 104 | # |
e3c79f0f AP |
105 | # As implied, data is loaded in straight linear order. Digits in |
106 | # variables' names represent coordinates of right-most element of | |
107 | # loaded data chunk: | |
108 | ||
109 | my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0] | |
110 | $A10, # [1][4] [1][3] [1][2] [1][1] [1][0] | |
111 | $A20, # [2][4] [2][3] [2][2] [2][1] [2][0] | |
112 | $A30, # [3][4] [3][3] [3][2] [3][1] [3][0] | |
113 | $A40) = # [4][4] [4][3] [4][2] [4][1] [4][0] | |
313fa47f AP |
114 | map("%zmm$_",(0..4)); |
115 | ||
116 | # We also need to map the magic order into offsets within structure: | |
117 | ||
e3c79f0f AP |
118 | my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4], |
119 | [1,0], [1,1], [1,2], [1,3], [1,4], | |
120 | [2,0], [2,1], [2,2], [2,3], [2,4], | |
121 | [3,0], [3,1], [3,2], [3,3], [3,4], | |
122 | [4,0], [4,1], [4,2], [4,3], [4,4]); | |
123 | @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear | |
313fa47f | 124 | |
e3c79f0f AP |
125 | my @T = map("%zmm$_",(5..12)); |
126 | my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo | |
127 | my @Pi0 = map("%zmm$_",(17..21)); | |
128 | my @Rhotate0 = map("%zmm$_",(22..26)); | |
129 | my @Rhotate1 = map("%zmm$_",(27..31)); | |
313fa47f AP |
130 | |
131 | my ($C00,$D00) = @T[0..1]; | |
132 | my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6)); | |
133 | ||
134 | $code.=<<___; | |
135 | .text | |
136 | ||
137 | .type __KeccakF1600,\@function | |
138 | .align 32 | |
139 | __KeccakF1600: | |
140 | lea iotas(%rip),%r10 | |
e3c79f0f | 141 | mov \$12,%eax |
313fa47f AP |
142 | jmp .Loop_avx512 |
143 | ||
144 | .align 32 | |
145 | .Loop_avx512: | |
e3c79f0f | 146 | ######################################### Theta, even round |
0d7903f8 | 147 | vmovdqa64 $A00,@T[0] # put aside original A00 |
e3c79f0f AP |
148 | vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00" |
149 | vpternlogq \$0x96,$A40,$A30,$A00 | |
313fa47f | 150 | |
0d7903f8 AP |
151 | vprolq \$1,$A00,$D00 |
152 | vpermq $A00,@Theta[1],$A00 | |
313fa47f AP |
153 | vpermq $D00,@Theta[4],$D00 |
154 | ||
0d7903f8 | 155 | vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00 |
e3c79f0f AP |
156 | vpternlogq \$0x96,$A00,$D00,$A10 |
157 | vpternlogq \$0x96,$A00,$D00,$A20 | |
158 | vpternlogq \$0x96,$A00,$D00,$A30 | |
159 | vpternlogq \$0x96,$A00,$D00,$A40 | |
313fa47f AP |
160 | |
161 | ######################################### Rho | |
e3c79f0f AP |
162 | vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00 |
163 | vprolvq @Rhotate0[1],$A10,$A10 | |
164 | vprolvq @Rhotate0[2],$A20,$A20 | |
165 | vprolvq @Rhotate0[3],$A30,$A30 | |
166 | vprolvq @Rhotate0[4],$A40,$A40 | |
313fa47f AP |
167 | |
168 | ######################################### Pi | |
e3c79f0f AP |
169 | vpermq $A00,@Pi0[0],$A00 |
170 | vpermq $A10,@Pi0[1],$A10 | |
171 | vpermq $A20,@Pi0[2],$A20 | |
172 | vpermq $A30,@Pi0[3],$A30 | |
173 | vpermq $A40,@Pi0[4],$A40 | |
313fa47f AP |
174 | |
175 | ######################################### Chi | |
176 | vmovdqa64 $A00,@T[0] | |
e3c79f0f AP |
177 | vmovdqa64 $A10,@T[1] |
178 | vpternlogq \$0xD2,$A20,$A10,$A00 | |
179 | vpternlogq \$0xD2,$A30,$A20,$A10 | |
180 | vpternlogq \$0xD2,$A40,$A30,$A20 | |
181 | vpternlogq \$0xD2,@T[0],$A40,$A30 | |
182 | vpternlogq \$0xD2,@T[1],@T[0],$A40 | |
313fa47f AP |
183 | |
184 | ######################################### Iota | |
185 | vpxorq (%r10),$A00,${A00}{$k00001} | |
e3c79f0f AP |
186 | lea 16(%r10),%r10 |
187 | ||
188 | ######################################### Harmonize rounds | |
189 | vpblendmq $A20,$A10,@{T[1]}{$k00010} | |
190 | vpblendmq $A30,$A20,@{T[2]}{$k00010} | |
191 | vpblendmq $A40,$A30,@{T[3]}{$k00010} | |
192 | vpblendmq $A10,$A00,@{T[0]}{$k00010} | |
193 | vpblendmq $A00,$A40,@{T[4]}{$k00010} | |
194 | ||
195 | vpblendmq $A30,@T[1],@{T[1]}{$k00100} | |
196 | vpblendmq $A40,@T[2],@{T[2]}{$k00100} | |
197 | vpblendmq $A20,@T[0],@{T[0]}{$k00100} | |
198 | vpblendmq $A00,@T[3],@{T[3]}{$k00100} | |
199 | vpblendmq $A10,@T[4],@{T[4]}{$k00100} | |
200 | ||
201 | vpblendmq $A40,@T[1],@{T[1]}{$k01000} | |
202 | vpblendmq $A30,@T[0],@{T[0]}{$k01000} | |
203 | vpblendmq $A00,@T[2],@{T[2]}{$k01000} | |
204 | vpblendmq $A10,@T[3],@{T[3]}{$k01000} | |
205 | vpblendmq $A20,@T[4],@{T[4]}{$k01000} | |
206 | ||
207 | vpblendmq $A40,@T[0],@{T[0]}{$k10000} | |
208 | vpblendmq $A00,@T[1],@{T[1]}{$k10000} | |
209 | vpblendmq $A10,@T[2],@{T[2]}{$k10000} | |
210 | vpblendmq $A20,@T[3],@{T[3]}{$k10000} | |
211 | vpblendmq $A30,@T[4],@{T[4]}{$k10000} | |
212 | ||
213 | #vpermq @T[0],@Theta[0],$A00 # doesn't actually change order | |
214 | vpermq @T[1],@Theta[1],$A10 | |
215 | vpermq @T[2],@Theta[2],$A20 | |
216 | vpermq @T[3],@Theta[3],$A30 | |
217 | vpermq @T[4],@Theta[4],$A40 | |
218 | ||
219 | ######################################### Theta, odd round | |
220 | vmovdqa64 $T[0],$A00 # real A00 | |
221 | vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias | |
222 | vpternlogq \$0x96,$A40,$A30,$C00 | |
223 | ||
224 | vprolq \$1,$C00,$D00 | |
225 | vpermq $C00,@Theta[1],$C00 | |
226 | vpermq $D00,@Theta[4],$D00 | |
227 | ||
228 | vpternlogq \$0x96,$C00,$D00,$A00 | |
229 | vpternlogq \$0x96,$C00,$D00,$A30 | |
230 | vpternlogq \$0x96,$C00,$D00,$A10 | |
231 | vpternlogq \$0x96,$C00,$D00,$A40 | |
232 | vpternlogq \$0x96,$C00,$D00,$A20 | |
233 | ||
234 | ######################################### Rho | |
235 | vprolvq @Rhotate1[0],$A00,$A00 | |
236 | vprolvq @Rhotate1[3],$A30,@T[1] | |
237 | vprolvq @Rhotate1[1],$A10,@T[2] | |
238 | vprolvq @Rhotate1[4],$A40,@T[3] | |
239 | vprolvq @Rhotate1[2],$A20,@T[4] | |
240 | ||
241 | vpermq $A00,@Theta[4],@T[5] | |
242 | vpermq $A00,@Theta[3],@T[6] | |
243 | ||
244 | ######################################### Iota | |
245 | vpxorq -8(%r10),$A00,${A00}{$k00001} | |
246 | ||
247 | ######################################### Pi | |
248 | vpermq @T[1],@Theta[2],$A10 | |
249 | vpermq @T[2],@Theta[4],$A20 | |
250 | vpermq @T[3],@Theta[1],$A30 | |
251 | vpermq @T[4],@Theta[3],$A40 | |
252 | ||
253 | ######################################### Chi | |
254 | vpternlogq \$0xD2,@T[6],@T[5],$A00 | |
255 | ||
256 | vpermq @T[1],@Theta[1],@T[7] | |
257 | #vpermq @T[1],@Theta[0],@T[1] | |
258 | vpternlogq \$0xD2,@T[1],@T[7],$A10 | |
259 | ||
260 | vpermq @T[2],@Theta[3],@T[0] | |
261 | vpermq @T[2],@Theta[2],@T[2] | |
262 | vpternlogq \$0xD2,@T[2],@T[0],$A20 | |
263 | ||
264 | #vpermq @T[3],@Theta[0],@T[3] | |
265 | vpermq @T[3],@Theta[4],@T[1] | |
266 | vpternlogq \$0xD2,@T[1],@T[3],$A30 | |
267 | ||
268 | vpermq @T[4],@Theta[2],@T[0] | |
269 | vpermq @T[4],@Theta[1],@T[4] | |
270 | vpternlogq \$0xD2,@T[4],@T[0],$A40 | |
313fa47f AP |
271 | |
272 | dec %eax | |
273 | jnz .Loop_avx512 | |
274 | ||
275 | ret | |
276 | .size __KeccakF1600,.-__KeccakF1600 | |
277 | ___ | |
278 | ||
279 | my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx"); | |
280 | my $out = $inp; # in squeeze | |
281 | ||
282 | $code.=<<___; | |
283 | .globl SHA3_absorb | |
284 | .type SHA3_absorb,\@function | |
285 | .align 32 | |
286 | SHA3_absorb: | |
287 | mov %rsp,%r11 | |
288 | ||
289 | lea -320(%rsp),%rsp | |
290 | and \$-64,%rsp | |
291 | ||
292 | lea 96($A_flat),$A_flat | |
293 | lea 96($inp),$inp | |
294 | lea 128(%rsp),%r9 | |
295 | ||
313fa47f AP |
296 | lea theta_perm(%rip),%r8 |
297 | ||
298 | kxnorw $k11111,$k11111,$k11111 | |
299 | kshiftrw \$15,$k11111,$k00001 | |
300 | kshiftrw \$11,$k11111,$k11111 | |
301 | kshiftlw \$1,$k00001,$k00010 | |
302 | kshiftlw \$2,$k00001,$k00100 | |
303 | kshiftlw \$3,$k00001,$k01000 | |
304 | kshiftlw \$4,$k00001,$k10000 | |
305 | ||
306 | #vmovdqa64 64*0(%r8),@Theta[0] | |
307 | vmovdqa64 64*1(%r8),@Theta[1] | |
308 | vmovdqa64 64*2(%r8),@Theta[2] | |
309 | vmovdqa64 64*3(%r8),@Theta[3] | |
310 | vmovdqa64 64*4(%r8),@Theta[4] | |
311 | ||
e3c79f0f AP |
312 | vmovdqa64 64*5(%r8),@Rhotate1[0] |
313 | vmovdqa64 64*6(%r8),@Rhotate1[1] | |
314 | vmovdqa64 64*7(%r8),@Rhotate1[2] | |
315 | vmovdqa64 64*8(%r8),@Rhotate1[3] | |
316 | vmovdqa64 64*9(%r8),@Rhotate1[4] | |
317 | ||
318 | vmovdqa64 64*10(%r8),@Rhotate0[0] | |
319 | vmovdqa64 64*11(%r8),@Rhotate0[1] | |
320 | vmovdqa64 64*12(%r8),@Rhotate0[2] | |
321 | vmovdqa64 64*13(%r8),@Rhotate0[3] | |
322 | vmovdqa64 64*14(%r8),@Rhotate0[4] | |
313fa47f | 323 | |
e3c79f0f AP |
324 | vmovdqa64 64*15(%r8),@Pi0[0] |
325 | vmovdqa64 64*16(%r8),@Pi0[1] | |
326 | vmovdqa64 64*17(%r8),@Pi0[2] | |
327 | vmovdqa64 64*18(%r8),@Pi0[3] | |
328 | vmovdqa64 64*19(%r8),@Pi0[4] | |
313fa47f AP |
329 | |
330 | vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} | |
331 | vpxorq @T[0],@T[0],@T[0] | |
e3c79f0f AP |
332 | vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} |
333 | vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} | |
334 | vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} | |
335 | vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} | |
313fa47f AP |
336 | |
337 | vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack | |
338 | vmovdqa64 @T[0],1*64-128(%r9) | |
339 | vmovdqa64 @T[0],2*64-128(%r9) | |
340 | vmovdqa64 @T[0],3*64-128(%r9) | |
341 | vmovdqa64 @T[0],4*64-128(%r9) | |
342 | jmp .Loop_absorb_avx512 | |
343 | ||
344 | .align 32 | |
345 | .Loop_absorb_avx512: | |
346 | mov $bsz,%rax | |
347 | sub $bsz,$len | |
348 | jc .Ldone_absorb_avx512 | |
349 | ||
350 | shr \$3,%eax | |
313fa47f | 351 | ___ |
0d7903f8 | 352 | for(my $i=0; $i<25; $i++) { |
313fa47f | 353 | $code.=<<___ |
313fa47f | 354 | mov 8*$i-96($inp),%r8 |
e3c79f0f | 355 | mov %r8,$A_jagged[$i]-128(%r9) |
0d7903f8 AP |
356 | dec %eax |
357 | jz .Labsorved_avx512 | |
313fa47f AP |
358 | ___ |
359 | } | |
360 | $code.=<<___; | |
361 | .Labsorved_avx512: | |
362 | lea ($inp,$bsz),$inp | |
363 | ||
0d7903f8 | 364 | vpxorq 64*0-128(%r9),$A00,$A00 |
e3c79f0f AP |
365 | vpxorq 64*1-128(%r9),$A10,$A10 |
366 | vpxorq 64*2-128(%r9),$A20,$A20 | |
367 | vpxorq 64*3-128(%r9),$A30,$A30 | |
368 | vpxorq 64*4-128(%r9),$A40,$A40 | |
313fa47f AP |
369 | |
370 | call __KeccakF1600 | |
371 | ||
372 | jmp .Loop_absorb_avx512 | |
373 | ||
374 | .align 32 | |
375 | .Ldone_absorb_avx512: | |
376 | vmovdqu64 $A00,40*0-96($A_flat){$k11111} | |
e3c79f0f AP |
377 | vmovdqu64 $A10,40*1-96($A_flat){$k11111} |
378 | vmovdqu64 $A20,40*2-96($A_flat){$k11111} | |
379 | vmovdqu64 $A30,40*3-96($A_flat){$k11111} | |
380 | vmovdqu64 $A40,40*4-96($A_flat){$k11111} | |
313fa47f AP |
381 | |
382 | vzeroupper | |
383 | ||
384 | lea (%r11),%rsp | |
385 | lea ($len,$bsz),%rax # return value | |
386 | ret | |
387 | .size SHA3_absorb,.-SHA3_absorb | |
388 | ||
389 | .globl SHA3_squeeze | |
390 | .type SHA3_squeeze,\@function | |
391 | .align 32 | |
392 | SHA3_squeeze: | |
393 | mov %rsp,%r11 | |
394 | ||
395 | lea 96($A_flat),$A_flat | |
396 | cmp $bsz,$len | |
397 | jbe .Lno_output_extension_avx512 | |
398 | ||
313fa47f AP |
399 | lea theta_perm(%rip),%r8 |
400 | ||
401 | kxnorw $k11111,$k11111,$k11111 | |
402 | kshiftrw \$15,$k11111,$k00001 | |
403 | kshiftrw \$11,$k11111,$k11111 | |
404 | kshiftlw \$1,$k00001,$k00010 | |
405 | kshiftlw \$2,$k00001,$k00100 | |
406 | kshiftlw \$3,$k00001,$k01000 | |
407 | kshiftlw \$4,$k00001,$k10000 | |
408 | ||
409 | #vmovdqa64 64*0(%r8),@Theta[0] | |
410 | vmovdqa64 64*1(%r8),@Theta[1] | |
411 | vmovdqa64 64*2(%r8),@Theta[2] | |
412 | vmovdqa64 64*3(%r8),@Theta[3] | |
413 | vmovdqa64 64*4(%r8),@Theta[4] | |
414 | ||
e3c79f0f AP |
415 | vmovdqa64 64*5(%r8),@Rhotate1[0] |
416 | vmovdqa64 64*6(%r8),@Rhotate1[1] | |
417 | vmovdqa64 64*7(%r8),@Rhotate1[2] | |
418 | vmovdqa64 64*8(%r8),@Rhotate1[3] | |
419 | vmovdqa64 64*9(%r8),@Rhotate1[4] | |
420 | ||
421 | vmovdqa64 64*10(%r8),@Rhotate0[0] | |
422 | vmovdqa64 64*11(%r8),@Rhotate0[1] | |
423 | vmovdqa64 64*12(%r8),@Rhotate0[2] | |
424 | vmovdqa64 64*13(%r8),@Rhotate0[3] | |
425 | vmovdqa64 64*14(%r8),@Rhotate0[4] | |
313fa47f | 426 | |
e3c79f0f AP |
427 | vmovdqa64 64*15(%r8),@Pi0[0] |
428 | vmovdqa64 64*16(%r8),@Pi0[1] | |
429 | vmovdqa64 64*17(%r8),@Pi0[2] | |
430 | vmovdqa64 64*18(%r8),@Pi0[3] | |
431 | vmovdqa64 64*19(%r8),@Pi0[4] | |
313fa47f AP |
432 | |
433 | vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z} | |
e3c79f0f AP |
434 | vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z} |
435 | vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z} | |
436 | vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z} | |
437 | vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z} | |
313fa47f AP |
438 | |
439 | .Lno_output_extension_avx512: | |
440 | shr \$3,$bsz | |
e3c79f0f | 441 | lea -96($A_flat),%r9 |
313fa47f | 442 | mov $bsz,%rax |
e3c79f0f | 443 | jmp .Loop_squeeze_avx512 |
313fa47f | 444 | |
e3c79f0f | 445 | .align 32 |
313fa47f | 446 | .Loop_squeeze_avx512: |
e3c79f0f AP |
447 | cmp \$8,$len |
448 | jb .Ltail_squeeze_avx512 | |
449 | ||
450 | mov (%r9),%r8 | |
451 | lea 8(%r9),%r9 | |
313fa47f AP |
452 | mov %r8,($out) |
453 | lea 8($out),$out | |
e3c79f0f AP |
454 | sub \$8,$len # len -= 8 |
455 | jz .Ldone_squeeze_avx512 | |
456 | ||
457 | sub \$1,%rax # bsz-- | |
458 | jnz .Loop_squeeze_avx512 | |
459 | ||
460 | #vpermq @Theta[4],@Theta[4],@Theta[3] | |
461 | #vpermq @Theta[3],@Theta[4],@Theta[2] | |
462 | #vpermq @Theta[3],@Theta[3],@Theta[1] | |
463 | ||
464 | call __KeccakF1600 | |
313fa47f AP |
465 | |
466 | vmovdqu64 $A00,40*0-96($A_flat){$k11111} | |
e3c79f0f AP |
467 | vmovdqu64 $A10,40*1-96($A_flat){$k11111} |
468 | vmovdqu64 $A20,40*2-96($A_flat){$k11111} | |
469 | vmovdqu64 $A30,40*3-96($A_flat){$k11111} | |
470 | vmovdqu64 $A40,40*4-96($A_flat){$k11111} | |
313fa47f | 471 | |
e3c79f0f | 472 | lea -96($A_flat),%r9 |
313fa47f AP |
473 | mov $bsz,%rax |
474 | jmp .Loop_squeeze_avx512 | |
475 | ||
313fa47f | 476 | .Ltail_squeeze_avx512: |
e3c79f0f | 477 | mov $out,%rdi |
3c1a60e5 | 478 | mov %r9,%rsi |
e3c79f0f AP |
479 | mov $len,%rcx |
480 | .byte 0xf3,0xa4 # rep movsb | |
313fa47f AP |
481 | |
482 | .Ldone_squeeze_avx512: | |
483 | vzeroupper | |
484 | ||
485 | lea (%r11),%rsp | |
486 | ret | |
487 | .size SHA3_squeeze,.-SHA3_squeeze | |
488 | ||
489 | .align 64 | |
490 | theta_perm: | |
491 | .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] | |
492 | .quad 4, 0, 1, 2, 3, 5, 6, 7 | |
493 | .quad 3, 4, 0, 1, 2, 5, 6, 7 | |
494 | .quad 2, 3, 4, 0, 1, 5, 6, 7 | |
495 | .quad 1, 2, 3, 4, 0, 5, 6, 7 | |
496 | ||
e3c79f0f | 497 | rhotates1: |
313fa47f AP |
498 | .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] |
499 | .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] | |
500 | .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] | |
501 | .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] | |
502 | .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] | |
503 | ||
e3c79f0f AP |
504 | rhotates0: |
505 | .quad 0, 1, 62, 28, 27, 0, 0, 0 | |
506 | .quad 36, 44, 6, 55, 20, 0, 0, 0 | |
507 | .quad 3, 10, 43, 25, 39, 0, 0, 0 | |
508 | .quad 41, 45, 15, 21, 8, 0, 0, 0 | |
509 | .quad 18, 2, 61, 56, 14, 0, 0, 0 | |
510 | ||
511 | pi0_perm: | |
512 | .quad 0, 3, 1, 4, 2, 5, 6, 7 | |
513 | .quad 1, 4, 2, 0, 3, 5, 6, 7 | |
514 | .quad 2, 0, 3, 1, 4, 5, 6, 7 | |
515 | .quad 3, 1, 4, 2, 0, 5, 6, 7 | |
516 | .quad 4, 2, 0, 3, 1, 5, 6, 7 | |
517 | ||
313fa47f AP |
518 | |
519 | iotas: | |
520 | .quad 0x0000000000000001 | |
521 | .quad 0x0000000000008082 | |
522 | .quad 0x800000000000808a | |
523 | .quad 0x8000000080008000 | |
524 | .quad 0x000000000000808b | |
525 | .quad 0x0000000080000001 | |
526 | .quad 0x8000000080008081 | |
527 | .quad 0x8000000000008009 | |
528 | .quad 0x000000000000008a | |
529 | .quad 0x0000000000000088 | |
530 | .quad 0x0000000080008009 | |
531 | .quad 0x000000008000000a | |
532 | .quad 0x000000008000808b | |
533 | .quad 0x800000000000008b | |
534 | .quad 0x8000000000008089 | |
535 | .quad 0x8000000000008003 | |
536 | .quad 0x8000000000008002 | |
537 | .quad 0x8000000000000080 | |
538 | .quad 0x000000000000800a | |
539 | .quad 0x800000008000000a | |
540 | .quad 0x8000000080008081 | |
541 | .quad 0x8000000000008080 | |
542 | .quad 0x0000000080000001 | |
543 | .quad 0x8000000080008008 | |
544 | ||
545 | .asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>" | |
546 | ___ | |
547 | ||
2bd3b626 RL |
548 | $output=pop; |
549 | open STDOUT,">$output"; | |
313fa47f AP |
550 | print $code; |
551 | close STDOUT; |