crypto/bn/asm/rsaz-3k-avx512.pl

   1 # Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
   2 # Copyright (c) 2021, Intel Corporation. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 #
  10 # Originally written by Sergey Kirillov and Andrey Matyukov
  11 # Intel Corporation
  12 #
  13 # March 2021
  14 #
  15 # Initial release.
  16 #
  17 # Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
  18 #
  19 # IceLake-Client @ 1.3GHz
  20 # |---------+-----------------------+---------------+-------------|
  21 # |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
  22 # |---------+-----------------------+---------------+-------------|
  23 # | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
  24 # |         | 203.2                 | 453.5 / +123% | sign/s      |
  25 # |---------+-----------------------+---------------+-------------|
  26 #
  27
  28 # $output is the last argument if it looks like a file (it has an extension)
  29 # $flavour is the first argument if it doesn't look like a file
  30 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  31 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  32
  33 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  34 $avx512ifma=0;
  35
  36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  37 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  39 die "can't locate x86_64-xlate.pl";
  40
  41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  42         =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  43     $avx512ifma = ($1>=2.26);
  44 }
  45
  46 if (!$avx512 && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  47        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  48     $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
  49 }
  50
  51 if (!$avx512 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  52     $avx512ifma = ($2>=7.0);
  53 }
  54
  55 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  56     or die "can't call $xlate: $!";
  57 *STDOUT=*OUT;
  58
if ($avx512ifma>0) {{{
# Integer argument registers used by the AMM routines generated below.
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
#   NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction step
# specified in the original algorithm, because according to Lemma 1 from the
# paper [2] the result will always be < 2*m and can be used as a direct input to
# the next AMM iteration.  This post-condition holds provided the correct
# parameter |s| (notation of Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
# which matches our case: 1560 > 1536 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

# General-purpose registers:
#   $mask52          - holds the 52-bit digit mask
#   $acc0_0, $acc0_1 - scalar accumulators for the first / second AMM
#                      (the *_low names are their 32-bit views, used for zeroing)
#   $b_ptr           - working copy of the |b| pointer
my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

# Loop counter.
my $iter = "%ebx";

# Vector registers: an all-zero register, broadcast b[i], broadcast y[i], and
# two sets of eight ymm accumulators (set _0 for the first AMM, _1 for the second).
my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Registers mapping for normalization
# T0, T0h and T1 alias $zero, $Bi and $Yi (%ymm0-%ymm2), so normalization
# overwrites those three; T1h..T3h map to %ymm19-%ymm23.
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
 113
 114 sub amm52x30_x1() {
 115 # _data_offset - offset in the |a| or |m| arrays pointing to the beginning
 116 #                of data for corresponding AMM operation;
 117 # _b_offset    - offset in the |b| array pointing to the next qword digit;
 118 my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
 119 my $_R0_xmm = $_R0;
 120 $_R0_xmm =~ s/%y/%x/;
 121 $code.=<<___;
 122     movq    $_b_offset($b_ptr), %r13             # b[i]
 123
 124     vpbroadcastq    %r13, $Bi                    # broadcast b[i]
 125     movq    $_data_offset($a), %rdx
 126     mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
 127     addq    %r13, $_acc                          # acc += t0
 128     movq    %r12, %r10
 129     adcq    \$0, %r10                            # t2 += CF
 130
 131     movq    $_k0, %r13
 132     imulq   $_acc, %r13                          # acc * k0
 133     andq    $mask52, %r13                        # yi = (acc * k0) & mask52
 134
 135     vpbroadcastq    %r13, $Yi                    # broadcast y[i]
 136     movq    $_data_offset($m), %rdx
 137     mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
 138     addq    %r13, $_acc                          # acc += t0
 139     adcq    %r12, %r10                           # t2 += (t1 + CF)
 140
 141     shrq    \$52, $_acc
 142     salq    \$12, %r10
 143     or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))
 144
 145     vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
 146     vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
 147     vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
 148     vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
 149     vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
 150     vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
 151     vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
 152     vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
 153
 154     vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
 155     vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
 156     vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
 157     vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
 158     vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
 159     vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
 160     vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
 161     vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
 162
 163     # Shift accumulators right by 1 qword, zero extending the highest one
 164     valignq     \$1, $_R0, $_R0h, $_R0
 165     valignq     \$1, $_R0h, $_R1, $_R0h
 166     valignq     \$1, $_R1, $_R1h, $_R1
 167     valignq     \$1, $_R1h, $_R2, $_R1h
 168     valignq     \$1, $_R2, $_R2h, $_R2
 169     valignq     \$1, $_R2h, $_R3, $_R2h
 170     valignq     \$1, $_R3, $_R3h, $_R3
 171     valignq     \$1, $_R3h, $zero, $_R3h
 172
 173     vmovq   $_R0_xmm, %r13
 174     addq    %r13, $_acc    # acc += R0[0]
 175
 176     vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
 177     vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
 178     vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
 179     vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
 180     vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
 181     vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
 182     vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
 183     vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
 184
 185     vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
 186     vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
 187     vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
 188     vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
 189     vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
 190     vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
 191     vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
 192     vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
 193 ___
 194 }
 195
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Arguments:
#   _acc      - general-purpose register holding the scalar accumulator; it is
#               placed into the low qword of _R0 before normalization;
#   _R0.._R3h - the eight ymm accumulator registers, normalized in place.
#
# The emitted code clobbers %r8-%r14, %ebx, %ecx, %edx, mask registers %k1-%k7
# and the temporaries T0..T3h (T0, T0h and T1 alias $zero, $Bi and $Yi).
# It reads the constants .Lmask52x4 and .Lzeros.
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to LSB of QWs in T0..Tn
    vpsrlq    \$52, $_R0,   $T0
    vpsrlq    \$52, $_R0h,  $T0h
    vpsrlq    \$52, $_R1,   $T1
    vpsrlq    \$52, $_R1h,  $T1h
    vpsrlq    \$52, $_R2,   $T2
    vpsrlq    \$52, $_R2h,  $T2h
    vpsrlq    \$52, $_R3,   $T3
    vpsrlq    \$52, $_R3h,  $T3h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T3,  $T3h,  $T3h
    valignq \$3, $T2h,  $T3,  $T3
    valignq \$3, $T2,  $T2h,  $T2h
    valignq \$3, $T1h,  $T2,  $T2
    valignq \$3, $T1,   $T1h, $T1h
    valignq \$3, $T0h,  $T1,  $T1
    valignq \$3, $T0,   $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0,  $T0

    # Drop "carries" from R0..Rn QWs
    vpandq    .Lmask52x4(%rip), $_R0,  $_R0
    vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq    .Lmask52x4(%rip), $_R1,  $_R1
    vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq    .Lmask52x4(%rip), $_R2,  $_R2
    vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq    .Lmask52x4(%rip), $_R3,  $_R3
    vpandq    .Lmask52x4(%rip), $_R3h, $_R3h

    # Sum R0..Rn with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2
    vpaddq  $T2h, $_R2h, $_R2h
    vpaddq  $T3,  $_R3,  $_R3
    vpaddq  $T3h, $_R3h, $_R3h

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0},%k1    # OP=nle (i.e. gt)
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r14d
    kmovb      %k2,%r13d
    shl        \$4,%r13b
    or         %r13b,%r14b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r13d
    kmovb      %k2,%r12d
    shl        \$4,%r12b
    or         %r12b,%r13b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%r12d
    kmovb      %k2,%r11d
    shl        \$4,%r11b
    or         %r11b,%r12b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%r11d
    kmovb      %k2,%r10d
    shl        \$4,%r10b
    or         %r10b,%r11b

    addb       %r14b,%r14b
    adcb       %r13b,%r13b
    adcb       %r12b,%r12b
    adcb       %r11b,%r11b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0},%k1    # OP=eq
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r9d
    kmovb      %k2,%r8d
    shl        \$4,%r8b
    or         %r8b,%r9b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r8d
    kmovb      %k2,%edx
    shl        \$4,%dl
    or         %dl,%r8b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%edx
    kmovb      %k2,%ecx
    shl        \$4,%cl
    or         %cl,%dl

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%ecx
    kmovb      %k2,%ebx
    shl        \$4,%bl
    or         %bl,%cl

    addb     %r9b,%r14b
    adcb     %r8b,%r13b
    adcb     %dl,%r12b
    adcb     %cl,%r11b

    xor      %r9b,%r14b
    xor      %r8b,%r13b
    xor      %dl,%r12b
    xor      %cl,%r11b

    kmovb    %r14d,%k1
    shr      \$4,%r14b
    kmovb    %r14d,%k2
    kmovb    %r13d,%k3
    shr      \$4,%r13b
    kmovb    %r13d,%k4
    kmovb    %r12d,%k5
    shr      \$4,%r12b
    kmovb    %r12d,%k6
    kmovb    %r11d,%k7

    vpsubq  .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
    vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq  .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
    vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq  .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
    vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq  .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}

    vpandq  .Lmask52x4(%rip), $_R0,  $_R0
    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq  .Lmask52x4(%rip), $_R1,  $_R1
    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq  .Lmask52x4(%rip), $_R2,  $_R2
    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq  .Lmask52x4(%rip), $_R3,  $_R3

    shr    \$4,%r11b
    kmovb   %r11d,%k1

    vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
___
}
 357
 358 $code.=<<___;
 359 .text
 360
 361 .globl  ossl_rsaz_amm52x30_x1_ifma256
 362 .type   ossl_rsaz_amm52x30_x1_ifma256,\@function,5
 363 .align 32
 364 ossl_rsaz_amm52x30_x1_ifma256:
 365 .cfi_startproc
 366     endbranch
 367     push    %rbx
 368 .cfi_push   %rbx
 369     push    %rbp
 370 .cfi_push   %rbp
 371     push    %r12
 372 .cfi_push   %r12
 373     push    %r13
 374 .cfi_push   %r13
 375     push    %r14
 376 .cfi_push   %r14
 377     push    %r15
 378 .cfi_push   %r15
 379 ___
 380 $code.=<<___ if ($win64);
 381     lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
 382     vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
 383     vmovdqa64   %xmm7, `1*16`(%rsp)
 384     vmovdqa64   %xmm8, `2*16`(%rsp)
 385     vmovdqa64   %xmm9, `3*16`(%rsp)
 386     vmovdqa64   %xmm10,`4*16`(%rsp)
 387     vmovdqa64   %xmm11,`5*16`(%rsp)
 388     vmovdqa64   %xmm12,`6*16`(%rsp)
 389     vmovdqa64   %xmm13,`7*16`(%rsp)
 390     vmovdqa64   %xmm14,`8*16`(%rsp)
 391     vmovdqa64   %xmm15,`9*16`(%rsp)
 392 .Lossl_rsaz_amm52x30_x1_ifma256_body:
 393 ___
 394 $code.=<<___;
 395     # Zeroing accumulators
 396     vpxord   $zero, $zero, $zero
 397     vmovdqa64   $zero, $R0_0
 398     vmovdqa64   $zero, $R0_0h
 399     vmovdqa64   $zero, $R1_0
 400     vmovdqa64   $zero, $R1_0h
 401     vmovdqa64   $zero, $R2_0
 402     vmovdqa64   $zero, $R2_0h
 403     vmovdqa64   $zero, $R3_0
 404     vmovdqa64   $zero, $R3_0h
 405
 406     xorl    $acc0_0_low, $acc0_0_low
 407
 408     movq    $b, $b_ptr                       # backup address of b
 409     movq    \$0xfffffffffffff, $mask52       # 52-bit mask
 410
 411     # Loop over 30 digits unrolled by 4
 412     mov     \$7, $iter
 413
 414 .align 32
 415 .Lloop7:
 416 ___
 417     foreach my $idx (0..3) {
 418         &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
 419     }
 420 $code.=<<___;
 421     lea    `4*8`($b_ptr), $b_ptr
 422     dec    $iter
 423     jne    .Lloop7
 424 ___
 425     &amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
 426     &amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
 427
 428     &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
 429 $code.=<<___;
 430
 431     vmovdqu64   $R0_0,  `0*32`($res)
 432     vmovdqu64   $R0_0h, `1*32`($res)
 433     vmovdqu64   $R1_0,  `2*32`($res)
 434     vmovdqu64   $R1_0h, `3*32`($res)
 435     vmovdqu64   $R2_0,  `4*32`($res)
 436     vmovdqu64   $R2_0h, `5*32`($res)
 437     vmovdqu64   $R3_0,  `6*32`($res)
 438     vmovdqu64   $R3_0h, `7*32`($res)
 439
 440     vzeroupper
 441     lea     (%rsp),%rax
 442 .cfi_def_cfa_register   %rax
 443 ___
 444 $code.=<<___ if ($win64);
 445     vmovdqa64   `0*16`(%rax),%xmm6
 446     vmovdqa64   `1*16`(%rax),%xmm7
 447     vmovdqa64   `2*16`(%rax),%xmm8
 448     vmovdqa64   `3*16`(%rax),%xmm9
 449     vmovdqa64   `4*16`(%rax),%xmm10
 450     vmovdqa64   `5*16`(%rax),%xmm11
 451     vmovdqa64   `6*16`(%rax),%xmm12
 452     vmovdqa64   `7*16`(%rax),%xmm13
 453     vmovdqa64   `8*16`(%rax),%xmm14
 454     vmovdqa64   `9*16`(%rax),%xmm15
 455     lea  168(%rsp),%rax
 456 ___
 457 $code.=<<___;
 458     mov  0(%rax),%r15
 459 .cfi_restore    %r15
 460     mov  8(%rax),%r14
 461 .cfi_restore    %r14
 462     mov  16(%rax),%r13
 463 .cfi_restore    %r13
 464     mov  24(%rax),%r12
 465 .cfi_restore    %r12
 466     mov  32(%rax),%rbp
 467 .cfi_restore    %rbp
 468     mov  40(%rax),%rbx
 469 .cfi_restore    %rbx
 470     lea  48(%rax),%rsp       # restore rsp
 471 .cfi_def_cfa %rsp,8
 472 .Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
 473     ret
 474 .cfi_endproc
 475 .size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
 476 ___
 477
 478 $code.=<<___;
 479 .data
 480 .align 32
 481 .Lmask52x4:
 482     .quad   0xfffffffffffff
 483     .quad   0xfffffffffffff
 484     .quad   0xfffffffffffff
 485     .quad   0xfffffffffffff
 486 ___
 487
 488 ###############################################################################
 489 # Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
 490 #
 491 # See description of ossl_rsaz_amm52x30_x1_ifma256() above for details about Almost
 492 # Montgomery Multiplication algorithm and function input parameters description.
 493 #
 494 # This function does two AMMs for two independent inputs, hence dual.
 495 #
 496 # NOTE: the function uses zero-padded data - 2 high QWs is a padding.
 497 #
 498 # void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
 499 #                                    const BN_ULONG a[2][32],
 500 #                                    const BN_ULONG b[2][32],
 501 #                                    const BN_ULONG m[2][32],
 502 #                                    const BN_ULONG k0[2]);
 503 ###############################################################################
 504
 505 $code.=<<___;
 506 .text
 507
 508 .globl  ossl_rsaz_amm52x30_x2_ifma256
 509 .type   ossl_rsaz_amm52x30_x2_ifma256,\@function,5
 510 .align 32
 511 ossl_rsaz_amm52x30_x2_ifma256:
 512 .cfi_startproc
 513     endbranch
 514     push    %rbx
 515 .cfi_push   %rbx
 516     push    %rbp
 517 .cfi_push   %rbp
 518     push    %r12
 519 .cfi_push   %r12
 520     push    %r13
 521 .cfi_push   %r13
 522     push    %r14
 523 .cfi_push   %r14
 524     push    %r15
 525 .cfi_push   %r15
 526 ___
 527 $code.=<<___ if ($win64);
 528     lea     -168(%rsp),%rsp
 529     vmovdqa64   %xmm6, `0*16`(%rsp)        # save non-volatile registers
 530     vmovdqa64   %xmm7, `1*16`(%rsp)
 531     vmovdqa64   %xmm8, `2*16`(%rsp)
 532     vmovdqa64   %xmm9, `3*16`(%rsp)
 533     vmovdqa64   %xmm10,`4*16`(%rsp)
 534     vmovdqa64   %xmm11,`5*16`(%rsp)
 535     vmovdqa64   %xmm12,`6*16`(%rsp)
 536     vmovdqa64   %xmm13,`7*16`(%rsp)
 537     vmovdqa64   %xmm14,`8*16`(%rsp)
 538     vmovdqa64   %xmm15,`9*16`(%rsp)
 539 .Lossl_rsaz_amm52x30_x2_ifma256_body:
 540 ___
 541 $code.=<<___;
 542     # Zeroing accumulators
 543     vpxord   $zero, $zero, $zero
 544     vmovdqa64   $zero, $R0_0
 545     vmovdqa64   $zero, $R0_0h
 546     vmovdqa64   $zero, $R1_0
 547     vmovdqa64   $zero, $R1_0h
 548     vmovdqa64   $zero, $R2_0
 549     vmovdqa64   $zero, $R2_0h
 550     vmovdqa64   $zero, $R3_0
 551     vmovdqa64   $zero, $R3_0h
 552
 553     vmovdqa64   $zero, $R0_1
 554     vmovdqa64   $zero, $R0_1h
 555     vmovdqa64   $zero, $R1_1
 556     vmovdqa64   $zero, $R1_1h
 557     vmovdqa64   $zero, $R2_1
 558     vmovdqa64   $zero, $R2_1h
 559     vmovdqa64   $zero, $R3_1
 560     vmovdqa64   $zero, $R3_1h
 561
 562
 563     xorl    $acc0_0_low, $acc0_0_low
 564     xorl    $acc0_1_low, $acc0_1_low
 565
 566     movq    $b, $b_ptr                       # backup address of b
 567     movq    \$0xfffffffffffff, $mask52       # 52-bit mask
 568
 569     mov    \$30, $iter
 570
 571 .align 32
 572 .Lloop30:
 573 ___
 574     &amm52x30_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
 575     # 32*8 = offset of the next dimension in two-dimension array
 576     &amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
 577 $code.=<<___;
 578     lea    8($b_ptr), $b_ptr
 579     dec    $iter
 580     jne    .Lloop30
 581 ___
 582     &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
 583     &amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
 584 $code.=<<___;
 585
 586     vmovdqu64   $R0_0,  `0*32`($res)
 587     vmovdqu64   $R0_0h, `1*32`($res)
 588     vmovdqu64   $R1_0,  `2*32`($res)
 589     vmovdqu64   $R1_0h, `3*32`($res)
 590     vmovdqu64   $R2_0,  `4*32`($res)
 591     vmovdqu64   $R2_0h, `5*32`($res)
 592     vmovdqu64   $R3_0,  `6*32`($res)
 593     vmovdqu64   $R3_0h, `7*32`($res)
 594
 595     vmovdqu64   $R0_1,  `8*32`($res)
 596     vmovdqu64   $R0_1h, `9*32`($res)
 597     vmovdqu64   $R1_1,  `10*32`($res)
 598     vmovdqu64   $R1_1h, `11*32`($res)
 599     vmovdqu64   $R2_1,  `12*32`($res)
 600     vmovdqu64   $R2_1h, `13*32`($res)
 601     vmovdqu64   $R3_1,  `14*32`($res)
 602     vmovdqu64   $R3_1h, `15*32`($res)
 603
 604     vzeroupper
 605     lea     (%rsp),%rax
 606 .cfi_def_cfa_register   %rax
 607 ___
 608 $code.=<<___ if ($win64);
 609     vmovdqa64   `0*16`(%rax),%xmm6
 610     vmovdqa64   `1*16`(%rax),%xmm7
 611     vmovdqa64   `2*16`(%rax),%xmm8
 612     vmovdqa64   `3*16`(%rax),%xmm9
 613     vmovdqa64   `4*16`(%rax),%xmm10
 614     vmovdqa64   `5*16`(%rax),%xmm11
 615     vmovdqa64   `6*16`(%rax),%xmm12
 616     vmovdqa64   `7*16`(%rax),%xmm13
 617     vmovdqa64   `8*16`(%rax),%xmm14
 618     vmovdqa64   `9*16`(%rax),%xmm15
 619     lea     168(%rsp),%rax
 620 ___
 621 $code.=<<___;
 622     mov  0(%rax),%r15
 623 .cfi_restore    %r15
 624     mov  8(%rax),%r14
 625 .cfi_restore    %r14
 626     mov  16(%rax),%r13
 627 .cfi_restore    %r13
 628     mov  24(%rax),%r12
 629 .cfi_restore    %r12
 630     mov  32(%rax),%rbp
 631 .cfi_restore    %rbp
 632     mov  40(%rax),%rbx
 633 .cfi_restore    %rbx
 634     lea  48(%rax),%rsp
 635 .cfi_def_cfa    %rsp,8
 636 .Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
 637     ret
 638 .cfi_endproc
 639 .size   ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
 640 ___
 641 }
 642
 643 ###############################################################################
 644 # Constant time extraction from the precomputed table of powers base^i, where
 645 #    i = 0..2^EXP_WIN_SIZE-1
 646 #
 647 # The input |red_table| contains precomputations for two independent base values.
 648 # |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
 649 #
 650 # Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 radix.
 651 # (2 high QW is zero padding)
 652 #
 653 # void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
 654 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
 655 #                                        int red_table_idx1, int red_table_idx2);
 656 #
 657 # EXP_WIN_SIZE = 5
 658 ###############################################################################
 659 {
 660 # input parameters
 661 my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
 662                                                         ("%rdi","%rsi","%rdx","%rcx");  # Unix order
 663
 664 my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
 665 my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
 666 my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));
 667
 668 my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
 669 my $t0xmm = $t0;
 670 $t0xmm =~ s/%y/%x/;
 671
 672 $code.=<<___;
 673 .text
 674
 675 .align 32
 676 .globl  ossl_extract_multiplier_2x30_win5
 677 .type   ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
 678 ossl_extract_multiplier_2x30_win5:
 679 .cfi_startproc
 680     endbranch
 681     vmovdqa64   .Lones(%rip), $ones         # broadcast ones
 682     vpbroadcastq    $red_tbl_idx1, $idx1
 683     vpbroadcastq    $red_tbl_idx2, $idx2
 684     leaq   `(1<<5)*2*32*8`($red_tbl), %rax  # holds end of the tbl
 685
 686     # zeroing t0..n, cur_idx
 687     vpxor   $t0xmm, $t0xmm, $t0xmm
 688     vmovdqa64   $t0, $cur_idx
 689 ___
 690 foreach (1..15) {
 691     $code.="vmovdqa64   $t0, $t[$_] \n";
 692 }
 693 $code.=<<___;
 694
 695 .align 32
 696 .Lloop:
 697     vpcmpq  \$0, $cur_idx, $idx1, %k1      # mask of (idx1 == cur_idx)
 698     vpcmpq  \$0, $cur_idx, $idx2, %k2      # mask of (idx2 == cur_idx)
 699 ___
 700 foreach (0..15) {
 701     my $mask = $_<8?"%k1":"%k2";
 702 $code.=<<___;
 703     vmovdqu64  `${_}*32`($red_tbl), $tmp     # load data from red_tbl
 704     vpblendmq  $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
 705 ___
 706 }
 707 $code.=<<___;
 708     vpaddq  $ones, $cur_idx, $cur_idx      # increment cur_idx
 709     addq    \$`2*32*8`, $red_tbl
 710     cmpq    $red_tbl, %rax
 711     jne .Lloop
 712 ___
 713 # store t0..n
 714 foreach (0..15) {
 715     $code.="vmovdqu64   $t[$_], `${_}*32`($out) \n";
 716 }
 717 $code.=<<___;
 718
 719     ret
 720 .cfi_endproc
 721 .size   ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
 722 ___
 723 $code.=<<___;
 724 .data
 725 .align 32
 726 .Lones:
 727     .quad   1,1,1,1
 728 .Lzeros:
 729     .quad   0,0,0,0
 730 ___
 731 }
 732
 733 if ($win64) {
 734 $rec="%rcx";
 735 $frame="%rdx";
 736 $context="%r8";
 737 $disp="%r9";
 738
 739 $code.=<<___;
 740 .extern     __imp_RtlVirtualUnwind
 741 .type   rsaz_avx_handler,\@abi-omnipotent
 742 .align  16
 743 rsaz_avx_handler:
 744     push    %rsi
 745     push    %rdi
 746     push    %rbx
 747     push    %rbp
 748     push    %r12
 749     push    %r13
 750     push    %r14
 751     push    %r15
 752     pushfq
 753     sub     \$64,%rsp
 754
 755     mov     120($context),%rax # pull context->Rax
 756     mov     248($context),%rbx # pull context->Rip
 757
 758     mov     8($disp),%rsi      # disp->ImageBase
 759     mov     56($disp),%r11     # disp->HandlerData
 760
 761     mov     0(%r11),%r10d      # HandlerData[0]
 762     lea     (%rsi,%r10),%r10   # prologue label
 763     cmp     %r10,%rbx          # context->Rip<.Lprologue
 764     jb  .Lcommon_seh_tail
 765
 766     mov     4(%r11),%r10d      # HandlerData[1]
 767     lea     (%rsi,%r10),%r10   # epilogue label
 768     cmp     %r10,%rbx          # context->Rip>=.Lepilogue
 769     jae     .Lcommon_seh_tail
 770
 771     mov     152($context),%rax # pull context->Rsp
 772
 773     lea     (%rax),%rsi         # %xmm save area
 774     lea     512($context),%rdi  # & context.Xmm6
 775     mov     \$20,%ecx           # 10*sizeof(%xmm0)/sizeof(%rax)
 776     .long   0xa548f3fc          # cld; rep movsq
 777
 778     lea     `48+168`(%rax),%rax
 779
 780     mov     -8(%rax),%rbx
 781     mov     -16(%rax),%rbp
 782     mov     -24(%rax),%r12
 783     mov     -32(%rax),%r13
 784     mov     -40(%rax),%r14
 785     mov     -48(%rax),%r15
 786     mov     %rbx,144($context) # restore context->Rbx
 787     mov     %rbp,160($context) # restore context->Rbp
 788     mov     %r12,216($context) # restore context->R12
 789     mov     %r13,224($context) # restore context->R13
 790     mov     %r14,232($context) # restore context->R14
 791     mov     %r15,240($context) # restore context->R14
 792
 793 .Lcommon_seh_tail:
 794     mov     8(%rax),%rdi
 795     mov     16(%rax),%rsi
 796     mov     %rax,152($context) # restore context->Rsp
 797     mov     %rsi,168($context) # restore context->Rsi
 798     mov     %rdi,176($context) # restore context->Rdi
 799
 800     mov     40($disp),%rdi     # disp->ContextRecord
 801     mov     $context,%rsi      # context
 802     mov     \$154,%ecx         # sizeof(CONTEXT)
 803     .long   0xa548f3fc         # cld; rep movsq
 804
 805     mov     $disp,%rsi
 806     xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
 807     mov     8(%rsi),%rdx       # arg2, disp->ImageBase
 808     mov     0(%rsi),%r8        # arg3, disp->ControlPc
 809     mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
 810     mov     40(%rsi),%r10      # disp->ContextRecord
 811     lea     56(%rsi),%r11      # &disp->HandlerData
 812     lea     24(%rsi),%r12      # &disp->EstablisherFrame
 813     mov     %r10,32(%rsp)      # arg5
 814     mov     %r11,40(%rsp)      # arg6
 815     mov     %r12,48(%rsp)      # arg7
 816     mov     %rcx,56(%rsp)      # arg8, (NULL)
 817     call    *__imp_RtlVirtualUnwind(%rip)
 818
 819     mov     \$1,%eax           # ExceptionContinueSearch
 820     add     \$64,%rsp
 821     popfq
 822     pop     %r15
 823     pop     %r14
 824     pop     %r13
 825     pop     %r12
 826     pop     %rbp
 827     pop     %rbx
 828     pop     %rdi
 829     pop     %rsi
 830     ret
 831 .size   rsaz_avx_handler,.-rsaz_avx_handler
 832
 833 .section    .pdata
 834 .align  4
 835     .rva    .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
 836     .rva    .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
 837     .rva    .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256
 838
 839     .rva    .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
 840     .rva    .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
 841     .rva    .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256
 842
 843 .section    .xdata
 844 .align  8
 845 .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
 846     .byte   9,0,0,0
 847     .rva    rsaz_avx_handler
 848     .rva    .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
 849 .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
 850     .byte   9,0,0,0
 851     .rva    rsaz_avx_handler
 852     .rva    .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
 853 ___
 854 }
}}} else {{{                # fallback for old assembler
# No assembler probe accepted the AVX512-IFMA path: export the same three
# symbols, all bound to a single stub that executes ud2.
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.globl  ossl_rsaz_amm52x30_x2_ifma256
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
    .byte   0x0f,0x0b    # ud2
    ret
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

# Replace every `...` expression embedded in the templates with its evaluated
# value (offsets such as `4*8`), then send the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is aliased to the x86_64-xlate.pl pipe, so a failed close reports
# an error from the translator process.
close STDOUT or die "error closing STDOUT: $!";