[thirdparty/openssl.git] / crypto / sha / asm / keccak1600-avx2.pl

#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific index permutations organized by quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This however is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it, lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent Pi circular permutation in chosen layout. Note
# that first step is permutation-free.] A[0][0] is loaded to register of
# its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
# Digits in variables' names denote right-most coordinates:

# State registers. %ymm0 carries A[0][0] replicated in every lane; each of
# %ymm1..%ymm6 carries four lanes in the magic order listed, with the
# right-most entry being the lowest qword of the register.
my ($A00,	# %ymm0: [0][0] [0][0] [0][0] [0][0]
    $A01,	# %ymm1: [0][4] [0][3] [0][2] [0][1]
    $A20,	# %ymm2: [3][0] [1][0] [4][0] [2][0]
    $A31,	# %ymm3: [2][4] [4][3] [1][2] [3][1]
    $A21,	# %ymm4: [3][4] [1][3] [4][2] [2][1]
    $A41,	# %ymm5: [1][4] [2][3] [3][2] [4][1]
    $A11)	# %ymm6: [4][4] [3][3] [2][2] [1][1]
    = map { "%ymm$_" } 0 .. 6;

# We also need to map the magic order into offsets within structure:

# One entry per state lane, listed in A[0][0..4], A[1][0..4], ... order.
# Each [register, qword] pair names the lane's slot in the magic order and
# is linearised right away into a byte offset: 32 bytes per register plus
# 8 bytes per qword within it.
my @A_jagged = map { 8 * (4 * $_->[0] + $_->[1]) } (
		[0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3],	# [4][0..4]
);

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged with
# Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Haswell		8.7/+10%
# Skylake		7.8/+20%
# Ryzen			17(**)
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.
# (**)	It's expected that Ryzen performs poorly, because instruction
#	issue rate is limited to two AVX2 instructions per cycle and
#	in addition vpblendd is reportedly bound to specific port.
#	Obviously this code path should not be executed on Ryzen.

# Scratch registers %ymm7..%ymm15. The last four also go by the names of
# the Theta temporaries, so @T[5..8] and $C14/$C00/$D00/$D14 refer to the
# very same registers.
my @T = map { "%ymm$_" } 7 .. 15;
my ($C14, $C00, $D00, $D14) = @T[5 .. 8];

# __KeccakF1600: internal helper that runs 24 rounds over the state held
# in $A00..$A11 (%ymm0-%ymm6). It takes no memory arguments: the state is
# expected in those registers on entry and is left there on return.
# It overwrites %eax (round counter), %r8, %r9, %r10 and all of @T
# (%ymm7-%ymm15). %r8 and %r9 point 96 bytes into rhotates_left and
# rhotates_right, so table row N is addressed as N*32-96; %r10 walks the
# iotas table, advancing one 32-byte row per round.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	 vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	 vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	 vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	 vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	 vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	 vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	 vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	 vpermq		\$0b10001101,$A41,$A41
	 vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
# Registers carrying the arguments of SHA3_absorb and SHA3_squeeze: state
# pointer, data pointer, byte count and block size. (These are the first
# four integer argument registers of the System V AMD64 ABI.)
my ($A_flat, $inp, $len, $bsz) = qw(%rdi %rsi %rdx %rcx);
my $out = $inp;		# squeeze uses the data pointer as its output pointer

# SHA3_absorb($A_flat, $inp, $len, $bsz): load the state from $A_flat
# into registers, then, for as long as at least $bsz bytes of input
# remain, XOR one $bsz-byte block into the state and call __KeccakF1600.
# The state is written back to $A_flat at the end and the number of
# unprocessed input bytes is returned in %rax. The caller's %rsp is kept
# in %r11 while a 32-byte-aligned transfer area lives on the stack.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
# Input lanes 0..4 were fetched above with two vector loads. The remaining
# lanes of the block are copied one qword at a time into the transfer area
# on the stack, each at its magic-order offset from @A_jagged, so that
# whole registers can be XORed into the state afterwards. %eax counts the
# lanes still to go and leaves the unrolled sequence once it reaches zero.
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec	%eax
	jz	.Labsorved_avx2
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
# XOR the block into the state, permute, and loop; on exit store the state
# back in the same layout it was loaded from and restore the stack pointer.
$code.=<<___;
.Labsorved_avx2:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	$A_jagged[0]-96($A_flat),%r8
___
# SHA3_squeeze($A_flat, $out, $len, $bsz) copies $len bytes of output out
# of the flat state at $A_flat, calling __KeccakF1600 and storing the
# state back each time $bsz bytes (counted in lanes in %eax) have been
# produced.
#
# The sequence is unrolled once per state lane. Every copy stores the lane
# already sitting in %r8 (lane 0, A[0][0], is fetched just above through
# the explicit index 0) and then pre-loads the next one. In the flat
# buffer register row r starts at byte 8+32*(r-1) = 32*r-24, whereas
# @A_jagged encodes that row as 32*r; hence the displacement is -120
# rather than -96 for every lane but A[0][0].
#
# After lane 24 there is no next lane: $A_jagged[25] is undefined and
# interpolates as nothing, leaving a bare -120 displacement. That load is
# only reached when the rate exceeds 25 lanes, since otherwise one of the
# branches above it has already been taken.
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	$A_jagged[$i+1]-120($A_flat),%r8
___
}
# Remainder of the module: the extend-output path (permute, store the
# state back, restart the lane sequence), the byte-wise tail for a final
# partial lane, and the constant tables. rhotates_left has one row of
# per-lane rotate counts for each of $A20, $A01, $A31, $A21, $A41, $A11
# (rows 0..5, lowest qword first); rhotates_right holds the matching
# 64-minus-count values; iotas has one row per round with the round
# constant repeated in all four lanes.
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly. The last command-line argument, if there is
# one (and it is a true value), names the output file; otherwise the code
# goes to the inherited STDOUT.
$output = pop;
if ($output) {
	# Three-argument open keeps the file name from being scanned for
	# mode characters, and a failed open is fatal instead of silently
	# sending the module to whatever STDOUT happened to be.
	open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";
Commit	Line	Data
29724d0e	1	#!/usr/bin/env perl
b0edda11	2	# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
29724d0e	3	#
a598ed0d	4	# Licensed under the Apache License 2.0 (the "License"). You may not use
29724d0e AP	5	# this file except in compliance with the License. You can obtain a copy
	6	# in the file LICENSE in the source distribution or at
	7	# https://www.openssl.org/source/license.html
	8	#
	9	# ====================================================================
	10	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	11	# project. The module is, however, dual licensed under OpenSSL and
	12	# CRYPTOGAMS licenses depending on where you obtain it. For further
	13	# details see http://www.openssl.org/~appro/cryptogams/.
	14	# ====================================================================
	15	#
	16	# Keccak-1600 for AVX2.
	17	#
	18	# July 2017.
	19	#
	20	# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
	21	# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
	22	# other than A[0][0] in magic order into 6 [256-bit] registers, *each
	23	# dedicated to one axis*, Pi permutation is reduced to intra-register
	24	# shuffles...
	25	#
	26	# It makes other steps more intricate, but overall, is it a win? To be
	27	# more specific index permutations organized by quadruples are:
	28	#
	29	# [4][4] [3][3] [2][2] [1][1]<-+
	30	# [0][4] [0][3] [0][2] [0][1]<-+
	31	# [3][0] [1][0] [4][0] [2][0] \|
	32	# [4][3] [3][1] [2][4] [1][2] \|
	33	# [3][4] [1][3] [4][2] [2][1] \|
	34	# [2][3] [4][1] [1][4] [3][2] \|
	35	# [2][2] [4][4] [1][1] [3][3] -+
	36	#
	37	# This however is highly impractical for Theta and Chi. What would help
	38	# Theta is if x indices were aligned column-wise, or in other words:
	39	#
	40	# [0][4] [0][3] [0][2] [0][1]
	41	# [3][0] [1][0] [4][0] [2][0]
	42	#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
	43	# [2][4] [4][3] [1][2] [3][1]
	44	#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
	45	# [3][4] [1][3] [4][2] [2][1]
	46	#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
	47	# [1][4] [2][3] [3][2] [4][1]
	48	#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
	49	# [4][4] [3][3] [2][2] [1][1]
	50	#
	51	# So here we have it, lines not marked with vpermq() represent the magic
	52	# order in which data is to be loaded and maintained. [And lines marked
	53	# with vpermq() represent Pi circular permutation in chosen layout. Note
	54	# that first step is permutation-free.] A[0][0] is loaded to register of
	55	# its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
	56	# Digits in variables' names denote right-most coordinates:
	57
	58	my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
	59	$A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
	60	$A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
	61	$A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
	62	$A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
	63	$A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
	64	$A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
	65	map("%ymm$_",(0..6));
	66
	67	# We also need to map the magic order into offsets within structure:
	68
69	my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
70	[2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
71	[2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
72	[2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
73	[2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
74	@A_jagged = map(8($$_[0]4+$$_[1]), @A_jagged); # ... and now linear
75
76	# But on the other hand Chi is much better off if y indices were aligned
77	# column-wise, not x. For this reason we have to shuffle data prior
78	# Chi and revert it afterwards. Prior shuffle is naturally merged with
79	# Pi itself:
80	#
81	# [0][4] [0][3] [0][2] [0][1]
82	# [3][0] [1][0] [4][0] [2][0]
83	#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
84	#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
85	# [3][1] [1][2] [4][3] [2][4]
86	#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
87	#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
88	# [3][4] [1][3] [4][2] [2][1]
89	#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
90	#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
91	# [3][2] [1][4] [4][1] [2][3]
92	#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
93	#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
94	# [3][3] [1][1] [4][4] [2][2]
95	#
96	# And reverse post-Chi permutation:
97	#
98	# [0][4] [0][3] [0][2] [0][1]
99	# [3][0] [1][0] [4][0] [2][0]
100	#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
101	# [2][4] [4][3] [1][2] [3][1]
102	#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
103	# [3][4] [1][3] [4][2] [2][1]
104	#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
105	# [1][4] [2][3] [3][2] [4][1]
106	#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
107	# [4][4] [3][3] [2][2] [1][1]
108	#
109	########################################################################
110	# Numbers are cycles per processed byte out of large message.
111	#
112	# r=1088(*)
113	#
d212b98b AP	114	# Haswell 8.7/+10%
	115	# Skylake 7.8/+20%
	116	# Ryzen 17(**)
29724d0e	117	#
d212b98b AP	118	# (*) Corresponds to SHA3-256. Percentage after slash is improvement
	119	# coefficient in comparison to scalar keccak1600-x86_64.pl.
	120	# (**) It's expected that Ryzen performs poorly, because instruction
	121	# issue rate is limited to two AVX2 instructions per cycle and
	122	# in addition vpblendd is reportedly bound to specific port.
	123	# Obviously this code path should not be executed on Ryzen.
29724d0e AP	124
	125	my @T = map("%ymm$_",(7..15));
	126	my ($C14,$C00,$D00,$D14) = @T[5..8];
	127
	128	$code.=<<___;
	129	.text
	130
	131	.type __KeccakF1600,\@function
	132	.align 32
	133	__KeccakF1600:
	134	lea rhotates_left+96(%rip),%r8
	135	lea rhotates_right+96(%rip),%r9
	136	lea iotas(%rip),%r10
	137	mov \$24,%eax
	138	jmp .Loop_avx2
	139
	140	.align 32
	141	.Loop_avx2:
	142	######################################### Theta
d212b98b AP	143	vpshufd \$0b01001110,$A20,$C00
	144	vpxor $A31,$A41,$C14
	145	vpxor $A11,$A21,@T[2]
	146	vpxor $A01,$C14,$C14
	147	vpxor @T[2],$C14,$C14 # C[1..4]
	148
	149	vpermq \$0b10010011,$C14,@T[4]
29724d0e AP	150	vpxor $A20,$C00,$C00
29724d0e AP	151	vpermq \$0b01001110,$C00,@T[0]
29724d0e AP	152
29724d0e AP	153	vpsrlq \$63,$C14,@T[1]
d212b98b AP	154	vpaddq $C14,$C14,@T[2]
d212b98b AP	155	vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)
29724d0e	156
d212b98b AP	157	vpermq \$0b00111001,@T[1],$D14
	158	vpxor @T[4],@T[1],$D00
	159	vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
29724d0e	160
d212b98b AP	161	vpxor $A00,$C00,$C00
d212b98b AP	162	vpxor @T[0],$C00,$C00 # C[0..0]
29724d0e	163
d212b98b AP	164	vpsrlq \$63,$C00,@T[0]
	165	vpaddq $C00,$C00,@T[1]
	166	vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)
29724d0e	167
91dbdc63	168	vpxor $D00,$A20,$A20 # ^= D[0..0]
d212b98b AP	169	vpxor $D00,$A00,$A00 # ^= D[0..0]
	170
	171	vpblendd \$0b11000000,@T[1],$D14,$D14
	172	vpblendd \$0b00000011,$C00,@T[4],@T[4]
	173	vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
91dbdc63	174
d212b98b AP	175	######################################### Rho + Pi + pre-Chi shuffle
	176	vpsllvq 0*32-96(%r8),$A20,@T[3]
	177	vpsrlvq 0*32-96(%r9),$A20,$A20
	178	vpor @T[3],$A20,$A20
29724d0e	179
d212b98b AP	180	vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
d212b98b AP	181	vpsllvq 2*32-96(%r8),$A31,@T[4]
29724d0e	182	vpsrlvq 2*32-96(%r9),$A31,$A31
d212b98b	183	vpor @T[4],$A31,$A31
29724d0e	184
d212b98b AP	185	vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
d212b98b AP	186	vpsllvq 3*32-96(%r8),$A21,@T[5]
29724d0e	187	vpsrlvq 3*32-96(%r9),$A21,$A21
d212b98b	188	vpor @T[5],$A21,$A21
29724d0e	189
d212b98b AP	190	vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
d212b98b AP	191	vpsllvq 4*32-96(%r8),$A41,@T[6]
29724d0e	192	vpsrlvq 4*32-96(%r9),$A41,$A41
d212b98b AP	193	vpor @T[6],$A41,$A41
	194
	195	vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
	196	vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
	197	vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
	198	vpsllvq 5*32-96(%r8),$A11,@T[7]
	199	vpsrlvq 5*32-96(%r9),$A11,@T[1]
	200	vpor @T[7],@T[1],@T[1] # $A11 -> future $A01
	201
	202	vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
	203	vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
	204	vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
	205	vpsllvq 1*32-96(%r8),$A01,@T[8]
	206	vpsrlvq 1*32-96(%r9),$A01,@T[2]
	207	vpor @T[8],@T[2],@T[2] # $A01 -> future $A20
29724d0e	208
d212b98b AP	209	######################################### Chi
	210	vpsrldq \$8,@T[1],@T[7]
	211	vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]
29724d0e	212
d212b98b AP	213	vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
	214	vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
	215	vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
	216	vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
	217	vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
	218	vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
	219	vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
	220	vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
	221	vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
	222	vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
	223	vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
	224	vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
	225	vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
	226	vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]
29724d0e	227
d212b98b AP	228	vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
	229	vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
	230	vpxor @T[3],$A31,$A31
	231	vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
	232	vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
	233	vpxor @T[5],$A41,$A41
	234	vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
	235	vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
	236	vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
	237	vpxor @T[6],$A11,$A11
29724d0e	238
d212b98b AP	239	vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
	240	vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
	241	vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
	242	vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
	243	vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]
91dbdc63 AP	244
91dbdc63 AP	245	vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
91dbdc63	246	vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
d212b98b	247	vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
91dbdc63	248	vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
d212b98b	249	vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
91dbdc63 AP	250	vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
91dbdc63 AP	251	vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
d212b98b	252	vpxor @T[2],$A20,$A20
91dbdc63	253
d212b98b AP	254	vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
	255	vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
	256	vpermq \$0b10001101,$A41,$A41
	257	vpermq \$0b01110010,$A11,$A11
91dbdc63 AP	258
91dbdc63 AP	259	vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
91dbdc63	260	vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
d212b98b	261	vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
91dbdc63	262	vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
d212b98b	263	vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
91dbdc63 AP	264	vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
	265	vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]
	266
91dbdc63	267	vpxor @T[0],$A00,$A00
c7c7a8e6	268	vpxor @T[1],$A01,$A01
29724d0e	269	vpxor @T[4],$A21,$A21
29724d0e AP	270
	271	######################################### Iota
	272	vpxor (%r10),$A00,$A00
	273	lea 32(%r10),%r10
	274
	275	dec %eax
	276	jnz .Loop_avx2
	277
	278	ret
	279	.size __KeccakF1600,.-__KeccakF1600
	280	___
	281	my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
	282	my $out = $inp; # in squeeze
	283
	284	$code.=<<___;
	285	.globl SHA3_absorb
	286	.type SHA3_absorb,\@function
	287	.align 32
	288	SHA3_absorb:
	289	mov %rsp,%r11
	290
	291	lea -240(%rsp),%rsp
	292	and \$-32,%rsp
	293
	294	lea 96($A_flat),$A_flat
	295	lea 96($inp),$inp
	296	lea 96(%rsp),%r10
	297
	298	vzeroupper
	299
	300	vpbroadcastq -96($A_flat),$A00 # load A[5][5]
	301	vmovdqu 8+32*0-96($A_flat),$A01
	302	vmovdqu 8+32*1-96($A_flat),$A20
	303	vmovdqu 8+32*2-96($A_flat),$A31
	304	vmovdqu 8+32*3-96($A_flat),$A21
	305	vmovdqu 8+32*4-96($A_flat),$A41
	306	vmovdqu 8+32*5-96($A_flat),$A11
	307
	308	vpxor @T[0],@T[0],@T[0]
	309	vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
	310	vmovdqa @T[0],32*3-96(%r10)
	311	vmovdqa @T[0],32*4-96(%r10)
	312	vmovdqa @T[0],32*5-96(%r10)
	313	vmovdqa @T[0],32*6-96(%r10)
	314
	315	.Loop_absorb_avx2:
	316	mov $bsz,%rax
	317	sub $bsz,$len
	318	jc .Ldone_absorb_avx2
	319
	320	shr \$3,%eax
	321	vpbroadcastq 0-96($inp),@T[0]
	322	vmovdqu 8-96($inp),@T[1]
	323	sub \$4,%eax
	324	___
	325	for(my $i=5; $i<25; $i++) {
	326	$code.=<<___
	327	dec %eax
	328	jz .Labsorved_avx2
	329	mov 8*$i-96($inp),%r8
	330	mov %r8,$A_jagged[$i]-96(%r10)
	331	___
	332	}
	333	$code.=<<___;
334	.Labsorved_avx2:
335	lea ($inp,$bsz),$inp
336
337	vpxor @T[0],$A00,$A00
338	vpxor @T[1],$A01,$A01
339	vpxor 32*2-96(%r10),$A20,$A20
340	vpxor 32*3-96(%r10),$A31,$A31
341	vpxor 32*4-96(%r10),$A21,$A21
342	vpxor 32*5-96(%r10),$A41,$A41
343	vpxor 32*6-96(%r10),$A11,$A11
344
345	call __KeccakF1600
346
347	lea 96(%rsp),%r10
348	jmp .Loop_absorb_avx2
349
350	.Ldone_absorb_avx2:
351	vmovq %xmm0,-96($A_flat)
352	vmovdqu $A01,8+32*0-96($A_flat)
353	vmovdqu $A20,8+32*1-96($A_flat)
354	vmovdqu $A31,8+32*2-96($A_flat)
355	vmovdqu $A21,8+32*3-96($A_flat)
356	vmovdqu $A41,8+32*4-96($A_flat)
357	vmovdqu $A11,8+32*5-96($A_flat)
358
359	vzeroupper
360
361	lea (%r11),%rsp
362	lea ($len,$bsz),%rax # return value
363	ret
364	.size SHA3_absorb,.-SHA3_absorb
365
366	.globl SHA3_squeeze
367	.type SHA3_squeeze,\@function
368	.align 32
369	SHA3_squeeze:
370	mov %rsp,%r11
371
372	lea 96($A_flat),$A_flat
373	shr \$3,$bsz
374
375	vzeroupper
376
377	vpbroadcastq -96($A_flat),$A00
378	vpxor @T[0],@T[0],@T[0]
379	vmovdqu 8+32*0-96($A_flat),$A01
380	vmovdqu 8+32*1-96($A_flat),$A20
381	vmovdqu 8+32*2-96($A_flat),$A31
382	vmovdqu 8+32*3-96($A_flat),$A21
383	vmovdqu 8+32*4-96($A_flat),$A41
384	vmovdqu 8+32*5-96($A_flat),$A11
385
386	mov $bsz,%rax
387
388	.Loop_squeeze_avx2:
389	mov @A_jagged[$i]-96($A_flat),%r8
390	___
391	for (my $i=0; $i<25; $i++) {
392	$code.=<<___;
393	sub \$8,$len
394	jc .Ltail_squeeze_avx2
395	mov %r8,($out)
396	lea 8($out),$out
397	je .Ldone_squeeze_avx2
398	dec %eax
399	je .Lextend_output_avx2
400	mov @A_jagged[$i+1]-120($A_flat),%r8
401	___
402	}
403	$code.=<<___;
404	.Lextend_output_avx2:
405	call __KeccakF1600
406
407	vmovq %xmm0,-96($A_flat)
408	vmovdqu $A01,8+32*0-96($A_flat)
409	vmovdqu $A20,8+32*1-96($A_flat)
410	vmovdqu $A31,8+32*2-96($A_flat)
411	vmovdqu $A21,8+32*3-96($A_flat)
412	vmovdqu $A41,8+32*4-96($A_flat)
413	vmovdqu $A11,8+32*5-96($A_flat)
414
415	mov $bsz,%rax
416	jmp .Loop_squeeze_avx2
417
418
419	.Ltail_squeeze_avx2:
420	add \$8,$len
421	.Loop_tail_avx2:
422	mov %r8b,($out)
423	lea 1($out),$out
424	shr \$8,%r8
425	dec $len
426	jnz .Loop_tail_avx2
427
428	.Ldone_squeeze_avx2:
429	vzeroupper
430
431	lea (%r11),%rsp
432	ret
433	.size SHA3_squeeze,.-SHA3_squeeze
434
435	.align 64
436	rhotates_left:
437	.quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
438	.quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
439	.quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
440	.quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
441	.quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
442	.quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
443	rhotates_right:
444	.quad 64-3, 64-18, 64-36, 64-41
445	.quad 64-1, 64-62, 64-28, 64-27
446	.quad 64-45, 64-6, 64-56, 64-39
447	.quad 64-10, 64-61, 64-55, 64-8
448	.quad 64-2, 64-15, 64-25, 64-20
449	.quad 64-44, 64-43, 64-21, 64-14
450	iotas:
451	.quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
452	.quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
453	.quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
454	.quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
455	.quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
456	.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
457	.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
458	.quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
459	.quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
460	.quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
461	.quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
462	.quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
463	.quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
464	.quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
465	.quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
466	.quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
467	.quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
468	.quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
469	.quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
470	.quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
471	.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
472	.quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
473	.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
474	.quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
475
476	.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
477	___
478
1aa89a7a	479	$output=pop and open STDOUT,">$output";
29724d0e	480	print $code;
a21314db	481	close STDOUT or die "error closing STDOUT: $!";