[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-avx512.pl

#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward, the only "magic" is data layout in registers.
# It's impossible to have one that is optimal for every step, hence
# it's changing as algorithm progresses. Data is saved in linear order,
# but in-register order morphs between rounds. Even rounds take in
# linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# Below code is combination of two ideas. One is taken from Keccak Code
# Package, hereafter KCP, and another one from initial version of this
# module. What is common is observation that Pi's input and output are
# "mostly transposed", i.e. if input is aligned by x coordinate, then
# output is [mostly] aligned by y. Both versions, KCP and predecessor,
# were trying to use one of them from round to round, which resulted in
# some kind of transposition in each round. This version still does
# transpose data, but only every second round. Another essential factor
# is that KCP transposition has to be performed with instructions that
# turned out to be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand initial version of this module was
# relying heavily on blend instructions. There were lots of them,
# resulting in higher instruction count, yet it performed better on
# Knights Landing, because processor can execute pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now instead of performing full transposition and feeding it to next
# identical round, we perform kind of diagonal transposition to layout
# from initial version of this module, and make it suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that the odd-round Chi is performed in a less suitable layout,
# with a number of additional permutations. But overall it turned out to be
# a win. Permutations are the fastest possible on Knights Landing and they
# are laid out to be independent of each other. In essence I traded
# 20 blend instructions for 3 permutations. The result is 13% faster
# than KCP on Skylake-X, and >40% on Knights Landing.
#
# As implied, data is loaded in straight linear order. Digits in
# variables' names represent coordinates of right-most element of
# loaded data chunk:

# The five state rows are kept in %zmm0..%zmm4.  Each register carries one
# row of five 64-bit lanes; in the pictures below the right-most entry is
# the first element loaded from memory:
#
#   $A00 = %zmm0	[0][4] [0][3] [0][2] [0][1] [0][0]
#   $A10 = %zmm1	[1][4] [1][3] [1][2] [1][1] [1][0]
#   $A20 = %zmm2	[2][4] [2][3] [2][2] [2][1] [2][0]
#   $A30 = %zmm3	[3][4] [3][3] [3][2] [3][1] [3][0]
#   $A40 = %zmm4	[4][4] [4][3] [4][2] [4][1] [4][0]
my ($A00, $A10, $A20, $A30, $A40) = map { "%zmm$_" } 0 .. 4;

# We also need to map the magic order into offsets within structure:

# Byte offset of lane [row][col] inside the on-stack transfer area.  Every
# row is padded out to a full 64-byte slot (8 quadwords) although only five
# lanes per row are used, hence 8*(row*8+col).  Entries are produced in
# row-major order, so $A_jagged[5*row+col] is the offset of [row][col].
my @A_jagged;
for my $row (0 .. 4) {
    push @A_jagged, map(8 * ($row * 8 + $_), 0 .. 4);
}

# Scratch registers used inside the round function.
my @T = map { "%zmm$_" } 5 .. 12;

# Lane-permutation index vectors.  $Theta[0] would be the identity
# permutation and is never loaded, so it is deliberately given the
# non-existent name %zmm33: any accidental use shows up in the output
# instead of silently picking a live register.  This is not a typo.
my @Theta = map { "%zmm$_" } 33, 13 .. 16;
my @Pi0   = map { "%zmm$_" } 17 .. 21;

# Per-lane rotation counts for even (0) and odd (1) rounds.
my @Rhotate0 = map { "%zmm$_" } 22 .. 26;
my @Rhotate1 = map { "%zmm$_" } 27 .. 31;

# $C00 and $D00 are aliases for the first two scratch registers.
my $C00 = $T[0];
my $D00 = $T[1];

# Opmask registers; the variable name spells out the five-lane bit pattern.
my ($k00001, $k00010, $k00100, $k01000, $k10000, $k11111)
    = map { "%k$_" } 1 .. 6;

# Emit __KeccakF1600, the internal permutation routine.  It has no .globl
# and is reached via `call` from SHA3_absorb and SHA3_squeeze.
#
# Register contract, as visible from the emitted code:
#   in/out : $A00..$A40 hold the state and are updated in place.
#   in     : @Theta[1..4], @Pi0, @Rhotate0, @Rhotate1 and the %k mask
#            registers are only read here; the callers load them.
#   scratch: @T[0..7], %eax (loop counter) and %r10 (round-constant pointer).
#
# The loop runs 12 times.  Each pass performs an "even" round on the linear
# layout, the "Harmonize rounds" blend/permute sequence that rearranges the
# lanes, and then an "odd" round.  The even Iota reads (%r10), the pointer
# is then advanced by 16, and the odd Iota reads -8(%r10); so two
# consecutive entries of the iotas table are consumed per pass.
#
# Perl notes for the heredoc: `\$` and `\@` emit literal `$`/`@`; the
# `${A00}{$k00001}` spelling stops Perl from interpolating a hash element
# and yields e.g. `%zmm0{%k1}`; `$T[0]` and `@T[0]` interpolate to the same
# register name.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___

# Registers carrying the four arguments of SHA3_absorb / SHA3_squeeze:
# state pointer, data pointer, byte length and block size.
my ($A_flat, $inp, $len, $bsz) = qw(%rdi %rsi %rdx %rcx);

# SHA3_squeeze writes through the same register that SHA3_absorb reads from.
my $out = $inp;

# Emit the first part of SHA3_absorb: the prologue plus the head of the
# absorb loop.  The heredoc deliberately stops in the middle of the
# function; the Perl loop that follows appends the unrolled per-lane copy.
#
# What the emitted code does, in order:
#   * saves %rsp in %r11, then reserves 320 bytes (5 rows x 64 bytes) and
#     aligns the new %rsp down to 64 bytes for the transfer area;
#   * biases $A_flat and $inp by +96 and points %r9 at transfer area +128
#     (later displacements subtract these biases again);
#   * builds the opmasks: kxnorw sets all 16 bits, shifting right by 15
#     leaves 0x0001 and by 11 leaves 0x001f, and the single-bit masks are
#     0x0001 shifted left by 1..4;
#   * loads the permutation/rotation tables as consecutive 64-byte entries
#     addressed from theta_perm (entry 0, the identity, is skipped);
#   * loads the five state rows with a zeroing five-lane masked load and
#     zeroes the transfer area;
#   * at .Loop_absorb_avx512 subtracts $bsz from $len, leaving the loop on
#     borrow, and puts the lane count ($bsz / 8) into %eax.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
# Unrolled copy of one input block into the transfer area: for each of the
# 25 possible lanes, load 8 bytes from the input, store them at the lane's
# padded offset, and leave the sequence as soon as the lane counter in %eax
# reaches zero.
foreach my $lane (0 .. 24) {
    $code .= <<___;
	mov	8*$lane-96($inp),%r8
	mov	%r8,$A_jagged[$lane]-128(%r9)
	dec	%eax
	jz	.Labsorved_avx512
___
}
# Emit the remainder of SHA3_absorb, all of SHA3_squeeze, and the constant
# tables.
#
# SHA3_absorb (continued): advance $inp by one block, XOR the transfer area
# into the five state rows, call __KeccakF1600 and loop.  At
# .Ldone_absorb_avx512 the rows are stored back with five-lane masked
# stores, %rsp is restored from %r11, and %rax = $len + $bsz is returned;
# $len has had one $bsz too many subtracted by then, so this is the number
# of input bytes that were not absorbed.
#
# SHA3_squeeze: when $len <= $bsz the table and state loading is skipped;
# on that path the copy loop leaves through the tail or done label before
# its block counter reaches zero, so __KeccakF1600 is not called (this
# assumes $bsz is a multiple of 8, since the counter is $bsz >> 3).
# Output is copied 8 bytes at a time straight from the state in memory;
# after a full block the permutation is run, the rows are stored back and
# copying restarts from the beginning of the state.  A final remainder of
# fewer than 8 bytes is copied with `rep movsb`.
#
# Tables: rhotates1, rhotates0 and pi0_perm are never referenced by label;
# they are reached as 64-byte entries 5..19 counted from theta_perm, so the
# order and size of the tables up to pi0_perm must not change.  Only the
# first five quadwords of each entry correspond to state lanes.  iotas
# holds the 24 round constants.
$code.=<<___;
.Labsorved_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,	62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7


iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___

# Write the generated assembly out.
#
# The last command-line argument, if present, names the output file.  With
# no (or an empty) argument the text goes to the inherited STDOUT, which is
# what the previous two-argument open effectively did when it failed on an
# empty file name.
#
# The three-argument open keeps characters in the file name from being read
# as part of the open mode.  A failed open is now fatal: previously it went
# unnoticed and the assembly was sent to the inherited STDOUT, leaving the
# requested file unwritten.
my $output = pop;
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output for writing: $!";
}
print $code;
# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
Commit	Line	Data
313fa47f	1	#!/usr/bin/env perl
e7ff223a	2	# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
313fa47f AP	3	#
	4	# Licensed under the OpenSSL license (the "License"). You may not use
	5	# this file except in compliance with the License. You can obtain a copy
	6	# in the file LICENSE in the source distribution or at
	7	# https://www.openssl.org/source/license.html
	8	#
	9	# ====================================================================
	10	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	11	# project. The module is, however, dual licensed under OpenSSL and
	12	# CRYPTOGAMS licenses depending on where you obtain it. For further
	13	# details see http://www.openssl.org/~appro/cryptogams/.
	14	# ====================================================================
	15	#
	16	# Keccak-1600 for AVX-512F.
	17	#
	18	# July 2017.
	19	#
	20	# Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
	21	# Pretty straightforward, the only "magic" is data layout in registers.
	22	# It's impossible to have one that is optimal for every step, hence
e3c79f0f AP	23	# it's changing as algorithm progresses. Data is saved in linear order,
	24	# but in-register order morphs between rounds. Even rounds take in
	25	# linear layout, and odd rounds - transposed, or "verticaly-shaped"...
313fa47f AP	26	#
	27	########################################################################
	28	# Numbers are cycles per processed byte out of large message.
	29	#
	30	# r=1088(*)
	31	#
e3c79f0f AP	32	# Knights Landing 7.6
e3c79f0f AP	33	# Skylake-X 5.7
313fa47f AP	34	#
	35	# (*) Corresponds to SHA3-256.
	36
	37	########################################################################
e3c79f0f AP	38	# Below code is combination of two ideas. One is taken from Keccak Code
	39	# Package, hereafter KCP, and another one from initial version of this
	40	# module. What is common is observation that Pi's input and output are
	41	# "mostly transposed", i.e. if input is aligned by x coordinate, then
	42	# output is [mostly] aligned by y. Both versions, KCP and predecessor,
	43	# were trying to use one of them from round to round, which resulted in
	44	# some kind of transposition in each round. This version still does
	45	# transpose data, but only every second round. Another essential factor
	46	# is that KCP transposition has to be performed with instructions that
	47	# turned to be rather expensive on Knights Landing, both latency- and
	48	# throughput-wise. Not to mention that some of them have to depend on
	49	# each other. On the other hand initial version of this module was
	50	# relying heavily on blend instructions. There were lots of them,
	51	# resulting in higher instruction count, yet it performed better on
	52	# Knights Landing, because processor can execute pair of them each
	53	# cycle and they have minimal latency. This module is an attempt to
	54	# bring best parts together:-)
	55	#
	56	# Coordinates below correspond to those in sha/keccak1600.c. Input
	57	# layout is straight linear:
	58	#
	59	# [0][4] [0][3] [0][2] [0][1] [0][0]
	60	# [1][4] [1][3] [1][2] [1][1] [1][0]
	61	# [2][4] [2][3] [2][2] [2][1] [2][0]
	62	# [3][4] [3][3] [3][2] [3][1] [3][0]
	63	# [4][4] [4][3] [4][2] [4][1] [4][0]
	64	#
	65	# It's perfect for Theta, while Pi is reduced to intra-register
	66	# permutations which yield layout perfect for Chi:
	67	#
	68	# [4][0] [3][0] [2][0] [1][0] [0][0]
	69	# [4][1] [3][1] [2][1] [1][1] [0][1]
	70	# [4][2] [3][2] [2][2] [1][2] [0][2]
	71	# [4][3] [3][3] [2][3] [1][3] [0][3]
	72	# [4][4] [3][4] [2][4] [1][4] [0][4]
	73	#
	74	# Now instead of performing full transposition and feeding it to next
	75	# identical round, we perform kind of diagonal transposition to layout
	76	# from initial version of this module, and make it suitable for Theta:
313fa47f AP	77	#
	78	# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
	79	# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
	80	# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
	81	# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
	82	# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
	83	#
e3c79f0f AP	84	# Now intra-register permutations yield initial [almost] straight
e3c79f0f AP	85	# linear layout:
313fa47f	86	#
e3c79f0f	87	# [4][4] [3][3] [2][2] [1][1] [0][0]
313fa47f	88	##[0][4] [0][3] [0][2] [0][1] [0][0]
e3c79f0f	89	# [3][4] [2][3] [1][2] [0][1] [4][0]
313fa47f	90	##[2][3] [2][2] [2][1] [2][0] [2][4]
e3c79f0f	91	# [2][4] [1][3] [0][2] [4][1] [3][0]
313fa47f	92	##[4][2] [4][1] [4][0] [4][4] [4][3]
e3c79f0f	93	# [1][4] [0][3] [4][2] [3][1] [2][0]
313fa47f	94	##[1][1] [1][0] [1][4] [1][3] [1][2]
e3c79f0f	95	# [0][4] [4][3] [3][2] [2][1] [1][0]
313fa47f AP	96	##[3][0] [3][4] [3][3] [3][2] [3][1]
313fa47f AP	97	#
e3c79f0f AP	98	# This means that odd round Chi is performed in less suitable layout,
	99	# with a number of additional permutations. But overall it turned to be
	100	# a win. Permutations are fastest possible on Knights Landing and they
	101	# are laid down to be independent of each other. In the essence I traded
	102	# 20 blend instructions for 3 permutations. The result is 13% faster
	103	# than KCP on Skylake-X, and >40% on Knights Landing.
313fa47f	104	#
e3c79f0f AP	105	# As implied, data is loaded in straight linear order. Digits in
	106	# variables' names represent coordinates of right-most element of
	107	# loaded data chunk:
	108
	109	my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0]
	110	$A10, # [1][4] [1][3] [1][2] [1][1] [1][0]
	111	$A20, # [2][4] [2][3] [2][2] [2][1] [2][0]
	112	$A30, # [3][4] [3][3] [3][2] [3][1] [3][0]
	113	$A40) = # [4][4] [4][3] [4][2] [4][1] [4][0]
313fa47f AP	114	map("%zmm$_",(0..4));
	115
	116	# We also need to map the magic order into offsets within structure:
	117
e3c79f0f AP	118	my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
	119	[1,0], [1,1], [1,2], [1,3], [1,4],
	120	[2,0], [2,1], [2,2], [2,3], [2,4],
	121	[3,0], [3,1], [3,2], [3,3], [3,4],
	122	[4,0], [4,1], [4,2], [4,3], [4,4]);
	123	@A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear
313fa47f	124
e3c79f0f AP	125	my @T = map("%zmm$_",(5..12));
	126	my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo
	127	my @Pi0 = map("%zmm$_",(17..21));
	128	my @Rhotate0 = map("%zmm$_",(22..26));
	129	my @Rhotate1 = map("%zmm$_",(27..31));
313fa47f AP	130
	131	my ($C00,$D00) = @T[0..1];
	132	my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
	133
	134	$code.=<<___;
	135	.text
	136
	137	.type __KeccakF1600,\@function
	138	.align 32
	139	__KeccakF1600:
	140	lea iotas(%rip),%r10
e3c79f0f	141	mov \$12,%eax
313fa47f AP	142	jmp .Loop_avx512
	143
	144	.align 32
	145	.Loop_avx512:
e3c79f0f	146	######################################### Theta, even round
0d7903f8	147	vmovdqa64 $A00,@T[0] # put aside original A00
e3c79f0f AP	148	vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
e3c79f0f AP	149	vpternlogq \$0x96,$A40,$A30,$A00
313fa47f	150
0d7903f8 AP	151	vprolq \$1,$A00,$D00
0d7903f8 AP	152	vpermq $A00,@Theta[1],$A00
313fa47f AP	153	vpermq $D00,@Theta[4],$D00
313fa47f AP	154
0d7903f8	155	vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
e3c79f0f AP	156	vpternlogq \$0x96,$A00,$D00,$A10
	157	vpternlogq \$0x96,$A00,$D00,$A20
	158	vpternlogq \$0x96,$A00,$D00,$A30
	159	vpternlogq \$0x96,$A00,$D00,$A40
313fa47f AP	160
313fa47f AP	161	######################################### Rho
e3c79f0f AP	162	vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
	163	vprolvq @Rhotate0[1],$A10,$A10
	164	vprolvq @Rhotate0[2],$A20,$A20
	165	vprolvq @Rhotate0[3],$A30,$A30
	166	vprolvq @Rhotate0[4],$A40,$A40
313fa47f AP	167
313fa47f AP	168	######################################### Pi
e3c79f0f AP	169	vpermq $A00,@Pi0[0],$A00
	170	vpermq $A10,@Pi0[1],$A10
	171	vpermq $A20,@Pi0[2],$A20
	172	vpermq $A30,@Pi0[3],$A30
	173	vpermq $A40,@Pi0[4],$A40
313fa47f AP	174
	175	######################################### Chi
	176	vmovdqa64 $A00,@T[0]
e3c79f0f AP	177	vmovdqa64 $A10,@T[1]
	178	vpternlogq \$0xD2,$A20,$A10,$A00
	179	vpternlogq \$0xD2,$A30,$A20,$A10
	180	vpternlogq \$0xD2,$A40,$A30,$A20
	181	vpternlogq \$0xD2,@T[0],$A40,$A30
	182	vpternlogq \$0xD2,@T[1],@T[0],$A40
313fa47f AP	183
	184	######################################### Iota
	185	vpxorq (%r10),$A00,${A00}{$k00001}
e3c79f0f AP	186	lea 16(%r10),%r10
	187
	188	######################################### Harmonize rounds
	189	vpblendmq $A20,$A10,@{T[1]}{$k00010}
	190	vpblendmq $A30,$A20,@{T[2]}{$k00010}
	191	vpblendmq $A40,$A30,@{T[3]}{$k00010}
	192	vpblendmq $A10,$A00,@{T[0]}{$k00010}
	193	vpblendmq $A00,$A40,@{T[4]}{$k00010}
	194
	195	vpblendmq $A30,@T[1],@{T[1]}{$k00100}
	196	vpblendmq $A40,@T[2],@{T[2]}{$k00100}
	197	vpblendmq $A20,@T[0],@{T[0]}{$k00100}
	198	vpblendmq $A00,@T[3],@{T[3]}{$k00100}
	199	vpblendmq $A10,@T[4],@{T[4]}{$k00100}
	200
	201	vpblendmq $A40,@T[1],@{T[1]}{$k01000}
	202	vpblendmq $A30,@T[0],@{T[0]}{$k01000}
	203	vpblendmq $A00,@T[2],@{T[2]}{$k01000}
	204	vpblendmq $A10,@T[3],@{T[3]}{$k01000}
	205	vpblendmq $A20,@T[4],@{T[4]}{$k01000}
	206
	207	vpblendmq $A40,@T[0],@{T[0]}{$k10000}
	208	vpblendmq $A00,@T[1],@{T[1]}{$k10000}
	209	vpblendmq $A10,@T[2],@{T[2]}{$k10000}
	210	vpblendmq $A20,@T[3],@{T[3]}{$k10000}
	211	vpblendmq $A30,@T[4],@{T[4]}{$k10000}
	212
	213	#vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
	214	vpermq @T[1],@Theta[1],$A10
	215	vpermq @T[2],@Theta[2],$A20
	216	vpermq @T[3],@Theta[3],$A30
	217	vpermq @T[4],@Theta[4],$A40
	218
	219	######################################### Theta, odd round
	220	vmovdqa64 $T[0],$A00 # real A00
	221	vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
	222	vpternlogq \$0x96,$A40,$A30,$C00
	223
	224	vprolq \$1,$C00,$D00
	225	vpermq $C00,@Theta[1],$C00
	226	vpermq $D00,@Theta[4],$D00
	227
	228	vpternlogq \$0x96,$C00,$D00,$A00
	229	vpternlogq \$0x96,$C00,$D00,$A30
	230	vpternlogq \$0x96,$C00,$D00,$A10
	231	vpternlogq \$0x96,$C00,$D00,$A40
	232	vpternlogq \$0x96,$C00,$D00,$A20
	233
	234	######################################### Rho
	235	vprolvq @Rhotate1[0],$A00,$A00
	236	vprolvq @Rhotate1[3],$A30,@T[1]
	237	vprolvq @Rhotate1[1],$A10,@T[2]
	238	vprolvq @Rhotate1[4],$A40,@T[3]
	239	vprolvq @Rhotate1[2],$A20,@T[4]
	240
	241	vpermq $A00,@Theta[4],@T[5]
	242	vpermq $A00,@Theta[3],@T[6]
	243
	244	######################################### Iota
	245	vpxorq -8(%r10),$A00,${A00}{$k00001}
	246
	247	######################################### Pi
	248	vpermq @T[1],@Theta[2],$A10
	249	vpermq @T[2],@Theta[4],$A20
250	vpermq @T[3],@Theta[1],$A30
251	vpermq @T[4],@Theta[3],$A40
252
253	######################################### Chi
254	vpternlogq \$0xD2,@T[6],@T[5],$A00
255
256	vpermq @T[1],@Theta[1],@T[7]
257	#vpermq @T[1],@Theta[0],@T[1]
258	vpternlogq \$0xD2,@T[1],@T[7],$A10
259
260	vpermq @T[2],@Theta[3],@T[0]
261	vpermq @T[2],@Theta[2],@T[2]
262	vpternlogq \$0xD2,@T[2],@T[0],$A20
263
264	#vpermq @T[3],@Theta[0],@T[3]
265	vpermq @T[3],@Theta[4],@T[1]
266	vpternlogq \$0xD2,@T[1],@T[3],$A30
267
268	vpermq @T[4],@Theta[2],@T[0]
269	vpermq @T[4],@Theta[1],@T[4]
270	vpternlogq \$0xD2,@T[4],@T[0],$A40
313fa47f AP	271
	272	dec %eax
	273	jnz .Loop_avx512
	274
	275	ret
	276	.size __KeccakF1600,.-__KeccakF1600
	277	___
	278
	279	my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
	280	my $out = $inp; # in squeeze
	281
	282	$code.=<<___;
	283	.globl SHA3_absorb
	284	.type SHA3_absorb,\@function
	285	.align 32
	286	SHA3_absorb:
	287	mov %rsp,%r11
	288
	289	lea -320(%rsp),%rsp
	290	and \$-64,%rsp
	291
	292	lea 96($A_flat),$A_flat
	293	lea 96($inp),$inp
	294	lea 128(%rsp),%r9
	295
313fa47f AP	296	lea theta_perm(%rip),%r8
	297
	298	kxnorw $k11111,$k11111,$k11111
	299	kshiftrw \$15,$k11111,$k00001
	300	kshiftrw \$11,$k11111,$k11111
	301	kshiftlw \$1,$k00001,$k00010
	302	kshiftlw \$2,$k00001,$k00100
	303	kshiftlw \$3,$k00001,$k01000
	304	kshiftlw \$4,$k00001,$k10000
	305
	306	#vmovdqa64 64*0(%r8),@Theta[0]
	307	vmovdqa64 64*1(%r8),@Theta[1]
	308	vmovdqa64 64*2(%r8),@Theta[2]
	309	vmovdqa64 64*3(%r8),@Theta[3]
	310	vmovdqa64 64*4(%r8),@Theta[4]
	311
e3c79f0f AP	312	vmovdqa64 64*5(%r8),@Rhotate1[0]
	313	vmovdqa64 64*6(%r8),@Rhotate1[1]
	314	vmovdqa64 64*7(%r8),@Rhotate1[2]
	315	vmovdqa64 64*8(%r8),@Rhotate1[3]
	316	vmovdqa64 64*9(%r8),@Rhotate1[4]
	317
	318	vmovdqa64 64*10(%r8),@Rhotate0[0]
	319	vmovdqa64 64*11(%r8),@Rhotate0[1]
	320	vmovdqa64 64*12(%r8),@Rhotate0[2]
	321	vmovdqa64 64*13(%r8),@Rhotate0[3]
	322	vmovdqa64 64*14(%r8),@Rhotate0[4]
313fa47f	323
e3c79f0f AP	324	vmovdqa64 64*15(%r8),@Pi0[0]
	325	vmovdqa64 64*16(%r8),@Pi0[1]
	326	vmovdqa64 64*17(%r8),@Pi0[2]
	327	vmovdqa64 64*18(%r8),@Pi0[3]
	328	vmovdqa64 64*19(%r8),@Pi0[4]
313fa47f AP	329
	330	vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
	331	vpxorq @T[0],@T[0],@T[0]
e3c79f0f AP	332	vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
	333	vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
	334	vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
	335	vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
313fa47f AP	336
	337	vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
	338	vmovdqa64 @T[0],1*64-128(%r9)
	339	vmovdqa64 @T[0],2*64-128(%r9)
	340	vmovdqa64 @T[0],3*64-128(%r9)
	341	vmovdqa64 @T[0],4*64-128(%r9)
	342	jmp .Loop_absorb_avx512
	343
	344	.align 32
	345	.Loop_absorb_avx512:
	346	mov $bsz,%rax
	347	sub $bsz,$len
	348	jc .Ldone_absorb_avx512
	349
	350	shr \$3,%eax
313fa47f	351	___
0d7903f8	352	for(my $i=0; $i<25; $i++) {
313fa47f	353	$code.=<<___
313fa47f	354	mov 8*$i-96($inp),%r8
e3c79f0f	355	mov %r8,$A_jagged[$i]-128(%r9)
0d7903f8 AP	356	dec %eax
0d7903f8 AP	357	jz .Labsorved_avx512
313fa47f AP	358	___
	359	}
	360	$code.=<<___;
	361	.Labsorved_avx512:
	362	lea ($inp,$bsz),$inp
	363
0d7903f8	364	vpxorq 64*0-128(%r9),$A00,$A00
e3c79f0f AP	365	vpxorq 64*1-128(%r9),$A10,$A10
	366	vpxorq 64*2-128(%r9),$A20,$A20
	367	vpxorq 64*3-128(%r9),$A30,$A30
	368	vpxorq 64*4-128(%r9),$A40,$A40
313fa47f AP	369
	370	call __KeccakF1600
	371
	372	jmp .Loop_absorb_avx512
	373
	374	.align 32
	375	.Ldone_absorb_avx512:
	376	vmovdqu64 $A00,40*0-96($A_flat){$k11111}
e3c79f0f AP	377	vmovdqu64 $A10,40*1-96($A_flat){$k11111}
	378	vmovdqu64 $A20,40*2-96($A_flat){$k11111}
	379	vmovdqu64 $A30,40*3-96($A_flat){$k11111}
	380	vmovdqu64 $A40,40*4-96($A_flat){$k11111}
313fa47f AP	381
	382	vzeroupper
	383
	384	lea (%r11),%rsp
	385	lea ($len,$bsz),%rax # return value
	386	ret
	387	.size SHA3_absorb,.-SHA3_absorb
	388
	389	.globl SHA3_squeeze
	390	.type SHA3_squeeze,\@function
	391	.align 32
	392	SHA3_squeeze:
	393	mov %rsp,%r11
	394
	395	lea 96($A_flat),$A_flat
	396	cmp $bsz,$len
	397	jbe .Lno_output_extension_avx512
	398
313fa47f AP	399	lea theta_perm(%rip),%r8
	400
	401	kxnorw $k11111,$k11111,$k11111
	402	kshiftrw \$15,$k11111,$k00001
	403	kshiftrw \$11,$k11111,$k11111
	404	kshiftlw \$1,$k00001,$k00010
	405	kshiftlw \$2,$k00001,$k00100
	406	kshiftlw \$3,$k00001,$k01000
	407	kshiftlw \$4,$k00001,$k10000
	408
	409	#vmovdqa64 64*0(%r8),@Theta[0]
	410	vmovdqa64 64*1(%r8),@Theta[1]
	411	vmovdqa64 64*2(%r8),@Theta[2]
	412	vmovdqa64 64*3(%r8),@Theta[3]
	413	vmovdqa64 64*4(%r8),@Theta[4]
	414
e3c79f0f AP	415	vmovdqa64 64*5(%r8),@Rhotate1[0]
	416	vmovdqa64 64*6(%r8),@Rhotate1[1]
	417	vmovdqa64 64*7(%r8),@Rhotate1[2]
	418	vmovdqa64 64*8(%r8),@Rhotate1[3]
	419	vmovdqa64 64*9(%r8),@Rhotate1[4]
	420
	421	vmovdqa64 64*10(%r8),@Rhotate0[0]
	422	vmovdqa64 64*11(%r8),@Rhotate0[1]
	423	vmovdqa64 64*12(%r8),@Rhotate0[2]
	424	vmovdqa64 64*13(%r8),@Rhotate0[3]
	425	vmovdqa64 64*14(%r8),@Rhotate0[4]
313fa47f	426
e3c79f0f AP	427	vmovdqa64 64*15(%r8),@Pi0[0]
	428	vmovdqa64 64*16(%r8),@Pi0[1]
	429	vmovdqa64 64*17(%r8),@Pi0[2]
	430	vmovdqa64 64*18(%r8),@Pi0[3]
	431	vmovdqa64 64*19(%r8),@Pi0[4]
313fa47f AP	432
313fa47f AP	433	vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
e3c79f0f AP	434	vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
	435	vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
	436	vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
	437	vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
313fa47f AP	438
	439	.Lno_output_extension_avx512:
	440	shr \$3,$bsz
e3c79f0f	441	lea -96($A_flat),%r9
313fa47f	442	mov $bsz,%rax
e3c79f0f	443	jmp .Loop_squeeze_avx512
313fa47f	444
e3c79f0f	445	.align 32
313fa47f	446	.Loop_squeeze_avx512:
e3c79f0f AP	447	cmp \$8,$len
	448	jb .Ltail_squeeze_avx512
	449
	450	mov (%r9),%r8
	451	lea 8(%r9),%r9
313fa47f AP	452	mov %r8,($out)
313fa47f AP	453	lea 8($out),$out
e3c79f0f AP	454	sub \$8,$len # len -= 8
	455	jz .Ldone_squeeze_avx512
	456
	457	sub \$1,%rax # bsz--
	458	jnz .Loop_squeeze_avx512
	459
	460	#vpermq @Theta[4],@Theta[4],@Theta[3]
	461	#vpermq @Theta[3],@Theta[4],@Theta[2]
	462	#vpermq @Theta[3],@Theta[3],@Theta[1]
	463
	464	call __KeccakF1600
313fa47f AP	465
313fa47f AP	466	vmovdqu64 $A00,40*0-96($A_flat){$k11111}
e3c79f0f AP	467	vmovdqu64 $A10,40*1-96($A_flat){$k11111}
	468	vmovdqu64 $A20,40*2-96($A_flat){$k11111}
	469	vmovdqu64 $A30,40*3-96($A_flat){$k11111}
	470	vmovdqu64 $A40,40*4-96($A_flat){$k11111}
313fa47f	471
e3c79f0f	472	lea -96($A_flat),%r9
313fa47f AP	473	mov $bsz,%rax
	474	jmp .Loop_squeeze_avx512
	475
313fa47f	476	.Ltail_squeeze_avx512:
e3c79f0f	477	mov $out,%rdi
3c1a60e5	478	mov %r9,%rsi
e3c79f0f AP	479	mov $len,%rcx
e3c79f0f AP	480	.byte 0xf3,0xa4 # rep movsb
313fa47f AP	481
	482	.Ldone_squeeze_avx512:
	483	vzeroupper
	484
	485	lea (%r11),%rsp
	486	ret
	487	.size SHA3_squeeze,.-SHA3_squeeze
	488
	489	.align 64
	490	theta_perm:
	491	.quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
	492	.quad 4, 0, 1, 2, 3, 5, 6, 7
	493	.quad 3, 4, 0, 1, 2, 5, 6, 7
	494	.quad 2, 3, 4, 0, 1, 5, 6, 7
	495	.quad 1, 2, 3, 4, 0, 5, 6, 7
	496
e3c79f0f	497	rhotates1:
313fa47f AP	498	.quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
	499	.quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
	500	.quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
	501	.quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
	502	.quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
	503
e3c79f0f AP	504	rhotates0:
	505	.quad 0, 1, 62, 28, 27, 0, 0, 0
	506	.quad 36, 44, 6, 55, 20, 0, 0, 0
	507	.quad 3, 10, 43, 25, 39, 0, 0, 0
	508	.quad 41, 45, 15, 21, 8, 0, 0, 0
	509	.quad 18, 2, 61, 56, 14, 0, 0, 0
	510
	511	pi0_perm:
	512	.quad 0, 3, 1, 4, 2, 5, 6, 7
	513	.quad 1, 4, 2, 0, 3, 5, 6, 7
	514	.quad 2, 0, 3, 1, 4, 5, 6, 7
	515	.quad 3, 1, 4, 2, 0, 5, 6, 7
	516	.quad 4, 2, 0, 3, 1, 5, 6, 7
	517
313fa47f AP	518
	519	iotas:
	520	.quad 0x0000000000000001
	521	.quad 0x0000000000008082
	522	.quad 0x800000000000808a
	523	.quad 0x8000000080008000
	524	.quad 0x000000000000808b
	525	.quad 0x0000000080000001
	526	.quad 0x8000000080008081
	527	.quad 0x8000000000008009
	528	.quad 0x000000000000008a
	529	.quad 0x0000000000000088
	530	.quad 0x0000000080008009
	531	.quad 0x000000008000000a
	532	.quad 0x000000008000808b
	533	.quad 0x800000000000008b
	534	.quad 0x8000000000008089
	535	.quad 0x8000000000008003
	536	.quad 0x8000000000008002
	537	.quad 0x8000000000000080
	538	.quad 0x000000000000800a
	539	.quad 0x800000008000000a
	540	.quad 0x8000000080008081
	541	.quad 0x8000000000008080
	542	.quad 0x0000000080000001
	543	.quad 0x8000000080008008
	544
	545	.asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
	546	___
	547
2bd3b626 RL	548	$output=pop;
2bd3b626 RL	549	open STDOUT,">$output";
313fa47f	550	print $code;
218e740f	551	close STDOUT or die "error closing STDOUT: $!";