#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout, but utilizes new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Skylake-X		6.4/+47%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.

# Digits in variables' names denote right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));
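# Each comment above lists the four 64-bit lanes of the corresponding
# %ymm register as [row][column] coordinates of the 5x5 state, most
# significant lane on the left: $A00 keeps A[0][0] replicated across all
# four lanes, while the remaining six registers hold the other 24 lanes
# in the jagged order inherited from the AVX2 module.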

# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
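# For example, A[1][0] maps to pair [2,2], i.e. lane 2 of %ymm2 ($A20),
# which the map() above turns into the linear byte offset 8*(2*4+2) = 80
# within the 7*32-byte vector image of the state.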

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));
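# The per-lane rotation counts occupy %ymm16-%ymm21, registers reachable
# only with AVX512VL; they are loaded once from rhotates_left below and
# consumed by vprolvq, which rotates every 64-bit lane by its own count.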

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea	iotas(%rip),%r10
	mov	\$24,%eax
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
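	# vpternlogq with immediate 0x96 computes the three-way XOR of its
	# destination and both source operands, folding pairs of vpxor into
	# a single instruction throughout this routine.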
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpternlogq	\$0x96,$A01,@T[2],$C14	# C[1..4]

	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpermq		\$0b10010011,$C14,@T[4]
	vprolq		\$1,$C14,@T[1]		# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	\$0x96,@T[0],$A00,$C00	# C[0..0]
	vprolq		\$1,$C00,@T[1]		# ROL64(C[0..0],1)

	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[0]

	######################################### Rho + Pi + pre-Chi shuffle
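	# Rho is a single vprolvq per register: the per-lane counts in
	# $R20..$R11 match this data layout, so every lane gets its own
	# ROL64. The vpermq shuffles implement Pi by moving lanes into the
	# order the Chi step below expects.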
	vpxor		$D00,$A20,$A20		# ^= D[0..0] from Theta
	vprolvq		$R20,$A20,$A20

	vpternlogq	\$0x96,@T[0],$D14,$A31	# ^= D[1..4] from Theta
	vprolvq		$R31,$A31,$A31

	vpternlogq	\$0x96,@T[0],$D14,$A21	# ^= D[1..4] from Theta
	vprolvq		$R21,$A21,$A21

	vpternlogq	\$0x96,@T[0],$D14,$A41	# ^= D[1..4] from Theta
	vprolvq		$R41,$A41,$A41

	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpternlogq	\$0x96,@T[0],$D14,$A11	# ^= D[1..4] from Theta
	vprolvq		$R11,$A11,@T[1]		# $A11 -> future $A01

	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpternlogq	\$0x96,@T[0],$D14,$A01	# ^= D[1..4] from Theta
	vprolvq		$R01,$A01,@T[2]		# $A01 -> future $A20

	######################################### Chi
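	# Chi, A[x] ^= ~A[x+1] & A[x+2] along each row, is assembled by the
	# vpblendd gathers below and finished with vpternlogq 0xC6, which
	# merges the XOR and ANDN into one instruction per result register.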
	vpblendd	\$0b00001100,@T[6],@T[2],$A31	# [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	# [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	# [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	# [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	# [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	# [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	# [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	# [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	\$0xC6,@T[8],@T[3],$A31		# [3][1] [1][2] [4][3] [2][4]
	vpternlogq	\$0xC6,@T[7],@T[5],$A41		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		\$8,@T[1],@T[0]
	vpandn		@T[0],@T[1],@T[0]	# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	# [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	# [4][1] [2][4]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	# [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	# [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	\$0xC6,@T[8],@T[6],$A11		# [3][3] [1][1] [4][4] [2][2]

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	# [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	# [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	# [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	# [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	\$0xC6,@T[7],@T[2],$A20		# [3][0] [1][0] [4][0] [2][0]

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	# [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	# [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	# [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	# [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	\$0xC6,@T[8],@T[1],$A01		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	\$0xC6,@T[7],@T[4],$A21		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
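	# Iota folds in the round constant from the iotas table (one
	# broadcast quad, 32 bytes, per round) together with the pending
	# Chi term for A[0][0] in a single three-way XOR.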
	vpternlogq	\$0x96,(%r10),@T[0],$A00
	lea		32(%r10),%r10

	dec	%eax
	jnz	.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;	# in squeeze
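# Argument mapping follows the usual SHA3_absorb/SHA3_squeeze contract:
# $A_flat points at the 25-word state, $inp/$out at the data, $len is the
# byte count and $bsz the sponge rate in bytes; SHA3_absorb returns the
# number of trailing input bytes (< $bsz) it did not consume.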

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov	$bsz,%rax
	sub	$bsz,$len
	jc	.Ldone_absorb_avx512vl

	shr	\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub	\$4,%eax
___
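# Words 0-4 of the block were consumed above (word 0 broadcast into
# @T[0], words 1-4 into @T[1]); the loop below emits an unrolled copy of
# the remaining up-to-20 words into their jagged slots in the stack
# transfer area at %r10.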
for(my $i=5; $i<25; $i++) {
$code.=<<___;
	dec	%eax
	jz	.Labsorbed_avx512vl
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx512vl:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	lea	rhotates_left(%rip),%r8
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	mov	$bsz,%rax

.Loop_squeeze_avx512vl:
	mov	@A_jagged[0]-96($A_flat),%r8
___
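# The squeeze loop is likewise fully unrolled: each step emits code to
# flush one 64-bit state word to $out, taking the byte-tail path when
# fewer than 8 output bytes remain and refreshing the state with
# __KeccakF1600 once a whole block has been emitted.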
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
	mov	@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
	add	\$8,$len
.Loop_tail_avx512vl:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";
print $code;
close STDOUT;