#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized in quadruples are:
#
# [4][4] [3][3] [2][2] [1][1]<-+
# [0][4] [0][3] [0][2] [0][1]<-+
# [3][0] [1][0] [4][0] [2][0] |
# [4][3] [3][1] [2][4] [1][2] |
# [3][4] [1][3] [4][2] [2][1] |
# [2][3] [4][1] [1][4] [3][2] |
# [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
# [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen layout.
# Note that the first step is permutation-free.] A[0][0] is loaded into a
# register of its own, to all lanes. [A[0][0] is not part of the Pi
# permutation or Rho.] Digits in variables' names denote the right-most
# coordinates:

my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
    $A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
    $A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
    $A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
    $A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
    $A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
    $A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
@A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
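
# For example, A[1][2] sits in $A31 (%ymm3) lane 1, so its slot in the
# 7x32-byte transfer area is @A_jagged[1*5+2] = 8*(3*4+1) = 104 bytes past
# the base. A quick sanity check that the mapping gives all 25 lanes
# distinct slots (an editorial cross-check only; the generator does not
# rely on it):

{ my %seen; $seen{$_}++ for @A_jagged; die "A_jagged collision" if keys(%seen) != 25; }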

# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
# [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
# [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
# [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
# [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#               r=1088(*)
#
# Haswell       8.7/+10%
# Skylake       7.8/+20%
# Ryzen         17(**)
#
# (*)  Corresponds to SHA3-256. The percentage after the slash is the
#      improvement coefficient relative to the scalar keccak1600-x86_64.pl.
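#      (For reference, r=1088 is the SHA3-256 rate: capacity 2*256 = 512
#      bits, so 1600 - 512 = 1088 bits, i.e. 136-byte blocks per call to
#      the permutation.)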
# (**) Ryzen is expected to perform poorly, because its instruction issue
#      rate is limited to two AVX2 instructions per cycle and, in addition,
#      vpblendd is reportedly bound to a specific port. Obviously this code
#      path should not be executed on Ryzen.

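# The composed vpermq immediates quoted in the comments above can be
# cross-checked mechanically: vpermq writes source lane ((imm >> 2*j) & 3)
# into destination lane j, so applying selector $a and then selector $b is
# a single selector whose field j is field b_j of $a. The helper below is
# an editorial sketch added for illustration; the generated code does not
# depend on it.

sub compose_vpermq {
    my ($a, $b) = @_;                   # $a applied first, then $b
    my $c = 0;
    for my $j (0 .. 3) {
        my $bj = ($b >> 2*$j) & 3;      # lane of the intermediate result
        $c |= (($a >> 2*$bj) & 3) << 2*$j;
    }
    return $c;
}
die "vpermq composition mismatch"
    if compose_vpermq(0b01110010, 0b00011011) != 0b10001101
    or compose_vpermq(0b10001101, 0b11100100) != 0b10001101
    or compose_vpermq(0b01110010, 0b01110010) != 0b00011011
    or compose_vpermq(0b00011011, 0b10001101) != 0b01110010;
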
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea rhotates_left+96(%rip),%r8
lea rhotates_right+96(%rip),%r9
lea iotas(%rip),%r10
mov \$24,%eax
jmp .Loop_avx2

.align 32
.Loop_avx2:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
vpxor $A31,$A41,$C14
vpxor $A11,$A21,@T[2]
vpxor $A01,$C14,$C14
vpxor @T[2],$C14,$C14 # C[1..4]

vpermq \$0b10010011,$C14,@T[4]
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]

vpsrlq \$63,$C14,@T[1]
vpaddq $C14,$C14,@T[2]
vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)

vpermq \$0b00111001,@T[1],$D14
vpxor @T[4],@T[1],$D00
vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]

vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]

vpsrlq \$63,$C00,@T[0]
vpaddq $C00,$C00,@T[1]
vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)

vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D00,$A00,$A00 # ^= D[0..0]

vpblendd \$0b11000000,@T[1],$D14,$D14
vpblendd \$0b00000011,$C00,@T[4],@T[4]
vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

######################################### Rho + Pi + pre-Chi shuffle
vpsllvq 0*32-96(%r8),$A20,@T[3]
vpsrlvq 0*32-96(%r9),$A20,$A20
vpor @T[3],$A20,$A20

vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
vpsllvq 2*32-96(%r8),$A31,@T[4]
vpsrlvq 2*32-96(%r9),$A31,$A31
vpor @T[4],$A31,$A31

vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
vpsllvq 3*32-96(%r8),$A21,@T[5]
vpsrlvq 3*32-96(%r9),$A21,$A21
vpor @T[5],$A21,$A21

vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
vpsllvq 4*32-96(%r8),$A41,@T[6]
vpsrlvq 4*32-96(%r9),$A41,$A41
vpor @T[6],$A41,$A41

vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
vpsllvq 5*32-96(%r8),$A11,@T[7]
vpsrlvq 5*32-96(%r9),$A11,@T[1]
vpor @T[7],@T[1],@T[1] # $A11 -> future $A01

vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
vpsllvq 1*32-96(%r8),$A01,@T[8]
vpsrlvq 1*32-96(%r9),$A01,@T[2]
vpor @T[8],@T[2],@T[2] # $A01 -> future $A20

######################################### Chi
vpsrldq \$8,@T[1],@T[7]
vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]

vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]

vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
vpxor @T[3],$A31,$A31
vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
vpxor @T[5],$A41,$A41
vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
vpxor @T[6],$A11,$A11

vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]

vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
vpxor @T[2],$A20,$A20

vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
vpermq \$0b10001101,$A41,$A41
vpermq \$0b01110010,$A11,$A11

vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]

vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor @T[4],$A21,$A21

######################################### Iota
vpxor (%r10),$A00,$A00
lea 32(%r10),%r10

dec %eax
jnz .Loop_avx2

ret
.size __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp; # in squeeze

$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11

lea -240(%rsp),%rsp
and \$-32,%rsp

lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 96(%rsp),%r10

vzeroupper

vpbroadcastq -96($A_flat),$A00 # load A[0][0] to all lanes
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11

vpxor @T[0],@T[0],@T[0]
vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
vmovdqa @T[0],32*3-96(%r10)
vmovdqa @T[0],32*4-96(%r10)
vmovdqa @T[0],32*5-96(%r10)
vmovdqa @T[0],32*6-96(%r10)

.Loop_absorb_avx2:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx2

shr \$3,%eax
vpbroadcastq 0-96($inp),@T[0]
vmovdqu 8-96($inp),@T[1]
sub \$4,%eax
___
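# The unrolled sequence generated below scatters block lanes 5 and up (i.e.
# everything past A[0][0..4], which the vpbroadcastq/vmovdqu above already
# picked up) from the input into the transfer area at their jagged offsets.
# %eax counts the lanes left in the block, so only lanes 5..$bsz/8-1 are
# actually stored; slots past the block length keep the zeros written above.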
for(my $i=5; $i<25; $i++) {
$code.=<<___
dec %eax
jz .Labsorved_avx2
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx2:
lea ($inp,$bsz),$inp

vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor 32*2-96(%r10),$A20,$A20
vpxor 32*3-96(%r10),$A31,$A31
vpxor 32*4-96(%r10),$A21,$A21
vpxor 32*5-96(%r10),$A41,$A41
vpxor 32*6-96(%r10),$A11,$A11

call __KeccakF1600

lea 96(%rsp),%r10
jmp .Loop_absorb_avx2

.Ldone_absorb_avx2:
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)

vzeroupper

lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb

.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11

lea 96($A_flat),$A_flat
shr \$3,$bsz

vzeroupper

vpbroadcastq -96($A_flat),$A00
vpxor @T[0],@T[0],@T[0]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11

mov $bsz,%rax

.Loop_squeeze_avx2:
mov @A_jagged[0]-96($A_flat),%r8
___
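# Between calls to __KeccakF1600 the state sits in $A_flat in exactly the
# layout stored by .Ldone_absorb_avx2 above: A[0][0] at -96($A_flat),
# followed by the six 32-byte register images packed back to back. For the
# packed registers a lane with jagged offset 32*reg+8*lane therefore lives
# at 8+32*(reg-1)+8*lane-96($A_flat) = jagged-120, which is why the loads
# generated below use @A_jagged[...]-120 while A[0][0] above is fetched at
# plain -96.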
for (my $i=0; $i<25; $i++) {
$code.=<<___;
sub \$8,$len
jc .Ltail_squeeze_avx2
mov %r8,($out)
lea 8($out),$out
je .Ldone_squeeze_avx2
dec %eax
je .Lextend_output_avx2
mov @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
call __KeccakF1600

vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)

mov $bsz,%rax
jmp .Loop_squeeze_avx2


.Ltail_squeeze_avx2:
add \$8,$len
.Loop_tail_avx2:
mov %r8b,($out)
lea 1($out),$out
shr \$8,%r8
dec $len
jnz .Loop_tail_avx2

.Ldone_squeeze_avx2:
vzeroupper

lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze

.align 64
rhotates_left:
.quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
.quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
.quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
.quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
.quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
.quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
.quad 64-3, 64-18, 64-36, 64-41
.quad 64-1, 64-62, 64-28, 64-27
.quad 64-45, 64-6, 64-56, 64-39
.quad 64-10, 64-61, 64-55, 64-8
.quad 64-2, 64-15, 64-25, 64-20
.quad 64-44, 64-43, 64-21, 64-14
iotas:
.quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
.quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
.quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
.quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
.quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
.quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
.quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
.quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
.quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
.quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
.quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
.quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
.quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
.quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
.quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";