crypto/bn/asm/rsaz-4k-avx512.pl

   1 # Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
   2 # Copyright (c) 2021, Intel Corporation. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 #
  10 # Originally written by Sergey Kirillov and Andrey Matyukov
  11 # Intel Corporation
  12 #
  13 # March 2021
  14 #
  15 # Initial release.
  16 #
  17 # Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
  18 #
  19 # IceLake-Client @ 1.3GHz
  20 # |---------+-----------------------+---------------+-------------|
  21 # |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
  22 # |---------+-----------------------+---------------+-------------|
  23 # | rsa4096 | 14 301 430            | 5 813 953     | cycles/sign |
  24 # |         | 90.9                  | 223.6 / +146% | sign/s      |
  25 # |---------+-----------------------+---------------+-------------|
  26 #
  27
  28 # $output is the last argument if it looks like a file (it has an extension)
  29 # $flavour is the first argument if it doesn't look like a file
  30 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  31 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  32
     # Win64 output is selected either by the flavour name or by a ".asm"
     # output file name.
  33 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
     # Becomes true once one of the probes below finds an assembler new enough
     # to encode the AVX512-IFMA instructions this file emits.
  34 $avx512ifma=0;
  35
     # Locate the perlasm translator: first next to this script, then in
     # ../../perlasm relative to it.
  36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  37 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  39 die "can't locate x86_64-xlate.pl";
  40
     # Probe 1: GNU assembler reached through $ENV{CC}; version >= 2.26 required.
  41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  42         =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  43     $avx512ifma = ($1>=2.26);
  44 }
  45
     # Probe 2: NASM (Win64 only); 2.11.8 or later, or >= 2.12.
     # The guard tests $avx512ifma (the flag the probes actually set) so that a
     # later probe never overrides an earlier successful one.  It previously
     # tested an unset variable and was therefore always true.
  46 if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  47        `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  48     $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
  49 }
  50
     # Probe 3: clang/LLVM integrated assembler; version >= 7.0 required.
     # Same guard as above: only consulted when no earlier probe succeeded.
  51 if (!$avx512ifma && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  52     $avx512ifma = ($2>=7.0);
  53 }
  54
     # Everything printed to STDOUT from here on is piped through the translator,
     # which is given the flavour and the output file name.
  55 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  56     or die "can't call $xlate: $!";
  57 *STDOUT=*OUT;
  58
  59 if ($avx512ifma>0) {{{
     # Real implementation: generated only when an assembler probe above enabled
     # AVX512-IFMA.  The matching "else" branch near the end of the file emits
     # trapping stubs instead.
     #
     # The six integer argument registers in System V AMD64 order; the AMM
     # routines below take their parameters from this list.
     # NOTE(review): presumably x86_64-xlate.pl remaps these for Win64 in
     # functions declared with \@function,N - confirm against the translator.
  60 @_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
  61
  62 ###############################################################################
  63 # Almost Montgomery Multiplication (AMM) for 40-digit number in radix 2^52.
  64 #
  65 # AMM is defined as presented in the paper [1].
  66 #
  67 # The input and output are presented in 2^52 radix domain, i.e.
  68 #   |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high bits zeroed.
  69 #   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
  70 #
  71 # NB: the AMM implementation does not perform "conditional" subtraction step
  72 # specified in the original algorithm as according to the Lemma 1 from the paper
  73 # [2], the result will be always < 2*m and can be used as a direct input to
  74 # the next AMM iteration.  This post-condition is true, provided the correct
  75 # parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
  76 # which matches our case: 2080 > 2048 + 2 * 1.
  77 #
  78 # [1] Gueron, S. Efficient software implementations of modular exponentiation.
  79 #     DOI: 10.1007/s13389-012-0031-5
  80 # [2] Gueron, S. Enhanced Montgomery Multiplication.
  81 #     DOI: 10.1007/3-540-36400-5_5
  82 #
  83 # void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
  84 #                                    const BN_ULONG *a,
  85 #                                    const BN_ULONG *b,
  86 #                                    const BN_ULONG *m,
  87 #                                    BN_ULONG k0);
  88 ###############################################################################
  89 {
     # Lexical scope holding the register assignments shared by the two AMM
     # entry points and the helper subs that generate their bodies.
  90 # input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
  91 my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
  92
     # General-purpose registers: 52-bit digit mask, the scalar accumulators for
     # the lowest digit of each operand set (with their 32-bit aliases, used to
     # zero them), and a working copy of the |b| pointer.
  93 my $mask52     = "%rax";
  94 my $acc0_0     = "%r9";
  95 my $acc0_0_low = "%r9d";
  96 my $acc0_1     = "%r15";
  97 my $acc0_1_low = "%r15d";
  98 my $b_ptr      = "%r11";
  99
     # Loop counter.
 100 my $iter = "%ebx";
 101
     # Vector registers: an all-zero register, the broadcast b[i] digit, the
     # broadcast y[i] digit, and two sets of ten 256-bit accumulators
     # (suffix _0 for the first operand set, _1 for the second).
 102 my $zero = "%ymm0";
 103 my $Bi   = "%ymm1";
 104 my $Yi   = "%ymm2";
 105 my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12));
 106 my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22));
 107
 108 # Registers mapping for normalization
     # The first three temporaries reuse $zero, $Bi and $Yi; the remaining seven
     # are %ymm23..%ymm29.
 109 my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29)));
 110
 111 sub amm52x40_x1() {
     # Appends to $code the instruction sequence handling one digit b[i] of an
     # Almost Montgomery Multiplication for one 40-digit operand set.
     #
     # Emitted sequence, in order:
     #   1. scalar step on the lowest digit: a[0]*b[i] is added to the
     #      accumulator, yi = (acc * k0) & mask52 is derived, yi*m[0] is added,
     #      and the accumulator is reduced to (acc >> 52) | (t2 << 12);
     #   2. vpmadd52luq of b[i] against |a| and of yi against |m| into the ten
     #      vector accumulators;
     #   3. the accumulators are shifted right by one qword (valignq), with
     #      $zero feeding the top, and the qword shifted out of R0 is added to
     #      the scalar accumulator;
     #   4. vpmadd52huq of the same operands into the shifted accumulators.
     #
     # Arguments:
 112 # _data_offset - offset in the |a| or |m| arrays pointing to the beginning
 113 #                of data for corresponding AMM operation;
 114 # _b_offset    - offset in the |b| array pointing to the next qword digit;
     # _acc         - general-purpose register holding the scalar accumulator;
     # _R0.._R4h    - the ten ymm accumulator registers, low to high;
     # _k0          - operand k0 is loaded from with movq (callers pass either a
     #                register name or a memory reference such as "8(%r8)").
     #
     # The emitted code also uses the enclosing scope's $a, $m, $b_ptr, $mask52,
     # $Bi, $Yi and $zero, and overwrites %r10, %r12, %r13 and %rdx.  %rdx is
     # the incoming |b| argument register, which is why |b| is addressed through
     # $b_ptr here.
     #
     # The empty prototype is not enforced: every caller in this file uses the
     # &name(...) form, which bypasses prototype checking.
 115 my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_;
     # xmm alias of the lowest accumulator, needed by vmovq to read its low qword.
 116 my $_R0_xmm = $_R0;
 117 $_R0_xmm =~ s/%y/%x/;
 118 $code.=<<___;
 119     movq    $_b_offset($b_ptr), %r13             # b[i]
 120
 121     vpbroadcastq    %r13, $Bi                    # broadcast b[i]
 122     movq    $_data_offset($a), %rdx
 123     mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
 124     addq    %r13, $_acc                          # acc += t0
 125     movq    %r12, %r10
 126     adcq    \$0, %r10                            # t2 += CF
 127
 128     movq    $_k0, %r13
 129     imulq   $_acc, %r13                          # acc * k0
 130     andq    $mask52, %r13                        # yi = (acc * k0) & mask52
 131
 132     vpbroadcastq    %r13, $Yi                    # broadcast y[i]
 133     movq    $_data_offset($m), %rdx
 134     mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
 135     addq    %r13, $_acc                          # acc += t0
 136     adcq    %r12, %r10                           # t2 += (t1 + CF)
 137
 138     shrq    \$52, $_acc
 139     salq    \$12, %r10
 140     or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))
 141
 142     vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
 143     vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
 144     vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
 145     vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
 146     vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
 147     vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
 148     vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
 149     vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
 150     vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4
 151     vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h
 152
 153     vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
 154     vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
 155     vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
 156     vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
 157     vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
 158     vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
 159     vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
 160     vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
 161     vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4
 162     vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h
 163
 164     # Shift accumulators right by 1 qword, zero extending the highest one
 165     valignq     \$1, $_R0, $_R0h, $_R0
 166     valignq     \$1, $_R0h, $_R1, $_R0h
 167     valignq     \$1, $_R1, $_R1h, $_R1
 168     valignq     \$1, $_R1h, $_R2, $_R1h
 169     valignq     \$1, $_R2, $_R2h, $_R2
 170     valignq     \$1, $_R2h, $_R3, $_R2h
 171     valignq     \$1, $_R3, $_R3h, $_R3
 172     valignq     \$1, $_R3h, $_R4, $_R3h
 173     valignq     \$1, $_R4, $_R4h, $_R4
 174     valignq     \$1, $_R4h, $zero, $_R4h
 175
 176     vmovq   $_R0_xmm, %r13
 177     addq    %r13, $_acc    # acc += R0[0]
 178
 179     vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
 180     vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
 181     vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
 182     vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
 183     vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
 184     vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
 185     vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
 186     vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
 187     vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4
 188     vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h
 189
 190     vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
 191     vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
 192     vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
 193     vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
 194     vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
 195     vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
 196     vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
 197     vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
 198     vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4
 199     vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h
 200 ___
 201 }
 202
 203 # Normalization routine: handles carry bits and gets bignum qwords to normalized
 204 # 2^52 representation.
 205 #
 206 # Uses %r8-14,%e[abcd]x
 207 sub amm52x40_x1_norm {
     # Appends to $code the normalization of one 40-digit result held in ten
     # ymm accumulators, so that every qword ends up within 52 bits.
     #
     # Arguments:
     #   _acc      - general-purpose register with the scalar lowest digit; it
     #               is blended into the low qword of _R0 first;
     #   _R0.._R4h - the ten ymm accumulator registers, low to high; they hold
     #               the normalized digits afterwards.
     #
     # Emitted sequence:
     #   1. the upper 12 bits of every qword are extracted into T0..T4h, moved
     #      up by one qword position, the accumulators are masked to 52 bits
     #      and the moved carries are added back;
     #   2. a 40-bit "greater than mask52" lane mask and a 40-bit "equal to
     #      mask52" lane mask are gathered into five byte registers each;
     #   3. the byte chains compute ((gt << 1) + eq) ^ eq with add/adc, giving
     #      the lanes that must absorb a carry, including carries rippling
     #      through saturated lanes;
     #   4. in those lanes mask52 is subtracted and the result is masked to
     #      52 bits, which equals adding 1 modulo 2^52.
     #
     # Besides the registers listed in the comment above this sub, the emitted
     # code uses mask registers %k1..%k7 and the temporaries $T0..$T4h, which
     # alias $zero, $Bi and $Yi.  It overwrites %rax, %ebx and %r11, i.e. the
     # registers the enclosing scope assigns to $mask52, $iter and $b_ptr, so it
     # can only be emitted after the multiplication loop.
 208 my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_;
 209 $code.=<<___;
 210     # Put accumulator to low qword in R0
 211     vpbroadcastq    $_acc, $T0
 212     vpblendd \$3, $T0, $_R0, $_R0
 213
 214     # Extract "carries" (12 high bits) from each QW of the bignum
 215     # Save them to LSB of QWs in T0..Tn
 216     vpsrlq    \$52, $_R0,   $T0
 217     vpsrlq    \$52, $_R0h,  $T0h
 218     vpsrlq    \$52, $_R1,   $T1
 219     vpsrlq    \$52, $_R1h,  $T1h
 220     vpsrlq    \$52, $_R2,   $T2
 221     vpsrlq    \$52, $_R2h,  $T2h
 222     vpsrlq    \$52, $_R3,   $T3
 223     vpsrlq    \$52, $_R3h,  $T3h
 224     vpsrlq    \$52, $_R4,   $T4
 225     vpsrlq    \$52, $_R4h,  $T4h
 226
 227     # "Shift left" T0..Tn by 1 QW
 228     valignq \$3, $T4,  $T4h,  $T4h
 229     valignq \$3, $T3h,  $T4,  $T4
 230     valignq \$3, $T3,  $T3h,  $T3h
 231     valignq \$3, $T2h,  $T3,  $T3
 232     valignq \$3, $T2,  $T2h,  $T2h
 233     valignq \$3, $T1h,  $T2,  $T2
 234     valignq \$3, $T1,   $T1h, $T1h
 235     valignq \$3, $T0h,  $T1,  $T1
 236     valignq \$3, $T0,   $T0h, $T0h
 237     valignq \$3, .Lzeros(%rip), $T0,  $T0
 238
 239     # Drop "carries" from R0..Rn QWs
 240     vpandq    .Lmask52x4(%rip), $_R0,  $_R0
 241     vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
 242     vpandq    .Lmask52x4(%rip), $_R1,  $_R1
 243     vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
 244     vpandq    .Lmask52x4(%rip), $_R2,  $_R2
 245     vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
 246     vpandq    .Lmask52x4(%rip), $_R3,  $_R3
 247     vpandq    .Lmask52x4(%rip), $_R3h, $_R3h
 248     vpandq    .Lmask52x4(%rip), $_R4,  $_R4
 249     vpandq    .Lmask52x4(%rip), $_R4h, $_R4h
 250
 251     # Sum R0..Rn with corresponding adjusted carries
 252     vpaddq  $T0,  $_R0,  $_R0
 253     vpaddq  $T0h, $_R0h, $_R0h
 254     vpaddq  $T1,  $_R1,  $_R1
 255     vpaddq  $T1h, $_R1h, $_R1h
 256     vpaddq  $T2,  $_R2,  $_R2
 257     vpaddq  $T2h, $_R2h, $_R2h
 258     vpaddq  $T3,  $_R3,  $_R3
 259     vpaddq  $T3h, $_R3h, $_R3h
 260     vpaddq  $T4,  $_R4,  $_R4
 261     vpaddq  $T4h, $_R4h, $_R4h
 262
 263     # Now handle carry bits from this addition
 264     # Get mask of QWs whose 52-bit parts overflow
 265     vpcmpuq    \$6,.Lmask52x4(%rip),${_R0},%k1    # OP=nle (i.e. gt)
 266     vpcmpuq    \$6,.Lmask52x4(%rip),${_R0h},%k2
 267     kmovb      %k1,%r14d
 268     kmovb      %k2,%r13d
 269     shl        \$4,%r13b
 270     or         %r13b,%r14b
 271
 272     vpcmpuq    \$6,.Lmask52x4(%rip),${_R1},%k1
 273     vpcmpuq    \$6,.Lmask52x4(%rip),${_R1h},%k2
 274     kmovb      %k1,%r13d
 275     kmovb      %k2,%r12d
 276     shl        \$4,%r12b
 277     or         %r12b,%r13b
 278
 279     vpcmpuq    \$6,.Lmask52x4(%rip),${_R2},%k1
 280     vpcmpuq    \$6,.Lmask52x4(%rip),${_R2h},%k2
 281     kmovb      %k1,%r12d
 282     kmovb      %k2,%r11d
 283     shl        \$4,%r11b
 284     or         %r11b,%r12b
 285
 286     vpcmpuq    \$6,.Lmask52x4(%rip),${_R3},%k1
 287     vpcmpuq    \$6,.Lmask52x4(%rip),${_R3h},%k2
 288     kmovb      %k1,%r11d
 289     kmovb      %k2,%r10d
 290     shl        \$4,%r10b
 291     or         %r10b,%r11b
 292
 293     vpcmpuq    \$6,.Lmask52x4(%rip),${_R4},%k1
 294     vpcmpuq    \$6,.Lmask52x4(%rip),${_R4h},%k2
 295     kmovb      %k1,%r10d
 296     kmovb      %k2,%r9d
 297     shl        \$4,%r9b
 298     or         %r9b,%r10b
 299
 300     addb       %r14b,%r14b
 301     adcb       %r13b,%r13b
 302     adcb       %r12b,%r12b
 303     adcb       %r11b,%r11b
 304     adcb       %r10b,%r10b
 305
 306     # Get mask of QWs whose 52-bit parts saturated
 307     vpcmpuq    \$0,.Lmask52x4(%rip),${_R0},%k1    # OP=eq
 308     vpcmpuq    \$0,.Lmask52x4(%rip),${_R0h},%k2
 309     kmovb      %k1,%r9d
 310     kmovb      %k2,%r8d
 311     shl        \$4,%r8b
 312     or         %r8b,%r9b
 313
 314     vpcmpuq    \$0,.Lmask52x4(%rip),${_R1},%k1
 315     vpcmpuq    \$0,.Lmask52x4(%rip),${_R1h},%k2
 316     kmovb      %k1,%r8d
 317     kmovb      %k2,%edx
 318     shl        \$4,%dl
 319     or         %dl,%r8b
 320
 321     vpcmpuq    \$0,.Lmask52x4(%rip),${_R2},%k1
 322     vpcmpuq    \$0,.Lmask52x4(%rip),${_R2h},%k2
 323     kmovb      %k1,%edx
 324     kmovb      %k2,%ecx
 325     shl        \$4,%cl
 326     or         %cl,%dl
 327
 328     vpcmpuq    \$0,.Lmask52x4(%rip),${_R3},%k1
 329     vpcmpuq    \$0,.Lmask52x4(%rip),${_R3h},%k2
 330     kmovb      %k1,%ecx
 331     kmovb      %k2,%ebx
 332     shl        \$4,%bl
 333     or         %bl,%cl
 334
 335     vpcmpuq    \$0,.Lmask52x4(%rip),${_R4},%k1
 336     vpcmpuq    \$0,.Lmask52x4(%rip),${_R4h},%k2
 337     kmovb      %k1,%ebx
 338     kmovb      %k2,%eax
 339     shl        \$4,%al
 340     or         %al,%bl
 341
 342     addb     %r9b,%r14b
 343     adcb     %r8b,%r13b
 344     adcb     %dl,%r12b
 345     adcb     %cl,%r11b
 346     adcb     %bl,%r10b
 347
 348     xor      %r9b,%r14b
 349     xor      %r8b,%r13b
 350     xor      %dl,%r12b
 351     xor      %cl,%r11b
 352     xor      %bl,%r10b
 353
 354     kmovb    %r14d,%k1
 355     shr      \$4,%r14b
 356     kmovb    %r14d,%k2
 357     kmovb    %r13d,%k3
 358     shr      \$4,%r13b
 359     kmovb    %r13d,%k4
 360     kmovb    %r12d,%k5
 361     shr      \$4,%r12b
 362     kmovb    %r12d,%k6
 363     kmovb    %r11d,%k7
 364
 365     vpsubq  .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
 366     vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
 367     vpsubq  .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
 368     vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
 369     vpsubq  .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
 370     vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
 371     vpsubq  .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}
 372
 373     vpandq  .Lmask52x4(%rip), $_R0,  $_R0
 374     vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
 375     vpandq  .Lmask52x4(%rip), $_R1,  $_R1
 376     vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
 377     vpandq  .Lmask52x4(%rip), $_R2,  $_R2
 378     vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
 379     vpandq  .Lmask52x4(%rip), $_R3,  $_R3
 380
 381     shr    \$4,%r11b
 382     kmovb   %r11d,%k1
 383     kmovb   %r10d,%k2
 384     shr    \$4,%r10b
 385     kmovb   %r10d,%k3
 386
 387     vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
 388     vpsubq  .Lmask52x4(%rip), $_R4,  ${_R4}{%k2}
 389     vpsubq  .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3}
 390
 391     vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
 392     vpandq  .Lmask52x4(%rip), $_R4,  $_R4
 393     vpandq  .Lmask52x4(%rip), $_R4h, $_R4h
 394 ___
 395 }
 396
     # Emit ossl_rsaz_amm52x40_x1_ifma256: a single AMM over 40 digits.
     # Prologue: push the six callee-saved general-purpose registers.
 397 $code.=<<___;
 398 .text
 399
 400 .globl  ossl_rsaz_amm52x40_x1_ifma256
 401 .type   ossl_rsaz_amm52x40_x1_ifma256,\@function,5
 402 .align 32
 403 ossl_rsaz_amm52x40_x1_ifma256:
 404 .cfi_startproc
 405     endbranch
 406     push    %rbx
 407 .cfi_push   %rbx
 408     push    %rbp
 409 .cfi_push   %rbp
 410     push    %r12
 411 .cfi_push   %r12
 412     push    %r13
 413 .cfi_push   %r13
 414     push    %r14
 415 .cfi_push   %r14
 416     push    %r15
 417 .cfi_push   %r15
 418 ___
     # Win64 only: reserve 168 bytes and spill %xmm6..%xmm15; the label after
     # the spills marks the start of the range covered by the SEH handler.
 419 $code.=<<___ if ($win64);
 420     lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
 421     vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
 422     vmovdqa64   %xmm7, `1*16`(%rsp)
 423     vmovdqa64   %xmm8, `2*16`(%rsp)
 424     vmovdqa64   %xmm9, `3*16`(%rsp)
 425     vmovdqa64   %xmm10,`4*16`(%rsp)
 426     vmovdqa64   %xmm11,`5*16`(%rsp)
 427     vmovdqa64   %xmm12,`6*16`(%rsp)
 428     vmovdqa64   %xmm13,`7*16`(%rsp)
 429     vmovdqa64   %xmm14,`8*16`(%rsp)
 430     vmovdqa64   %xmm15,`9*16`(%rsp)
 431 .Lossl_rsaz_amm52x40_x1_ifma256_body:
 432 ___
     # Clear the vector and scalar accumulators, copy |b| into $b_ptr (the |b|
     # argument register %rdx is overwritten by the loop body), load the 52-bit
     # mask and set up a 10-pass loop.
 433 $code.=<<___;
 434     # Zeroing accumulators
 435     vpxord   $zero, $zero, $zero
 436     vmovdqa64   $zero, $R0_0
 437     vmovdqa64   $zero, $R0_0h
 438     vmovdqa64   $zero, $R1_0
 439     vmovdqa64   $zero, $R1_0h
 440     vmovdqa64   $zero, $R2_0
 441     vmovdqa64   $zero, $R2_0h
 442     vmovdqa64   $zero, $R3_0
 443     vmovdqa64   $zero, $R3_0h
 444     vmovdqa64   $zero, $R4_0
 445     vmovdqa64   $zero, $R4_0h
 446
 447     xorl    $acc0_0_low, $acc0_0_low
 448
 449     movq    $b, $b_ptr                       # backup address of b
 450     movq    \$0xfffffffffffff, $mask52       # 52-bit mask
 451
 452     # Loop over 40 digits unrolled by 4
 453     mov     \$10, $iter
 454
 455 .align 32
 456 .Lloop10:
 457 ___
     # Loop body: four consecutive digits of |b| per pass (byte offsets 0, 8,
     # 16 and 24 from $b_ptr), all against data offset 0 of |a| and |m|.
 458     foreach my $idx (0..3) {
 459         &amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0);
 460     }
     # Advance $b_ptr by the four digits just consumed and loop.
 461 $code.=<<___;
 462     lea    `4*8`($b_ptr), $b_ptr
 463     dec    $iter
 464     jne    .Lloop10
 465 ___
     # Bring the accumulated result back to 52-bit digits.
 466     &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
     # Store the ten result vectors to |res|, then start the epilogue with %rax
     # as the base register for the restores.
 467 $code.=<<___;
 468
 469     vmovdqu64   $R0_0,  `0*32`($res)
 470     vmovdqu64   $R0_0h, `1*32`($res)
 471     vmovdqu64   $R1_0,  `2*32`($res)
 472     vmovdqu64   $R1_0h, `3*32`($res)
 473     vmovdqu64   $R2_0,  `4*32`($res)
 474     vmovdqu64   $R2_0h, `5*32`($res)
 475     vmovdqu64   $R3_0,  `6*32`($res)
 476     vmovdqu64   $R3_0h, `7*32`($res)
 477     vmovdqu64   $R4_0,  `8*32`($res)
 478     vmovdqu64   $R4_0h, `9*32`($res)
 479
 480     vzeroupper
 481     lea     (%rsp),%rax
 482 .cfi_def_cfa_register   %rax
 483 ___
     # Win64 only: reload %xmm6..%xmm15 and step %rax past the 168-byte area.
 484 $code.=<<___ if ($win64);
 485     vmovdqa64   `0*16`(%rax),%xmm6
 486     vmovdqa64   `1*16`(%rax),%xmm7
 487     vmovdqa64   `2*16`(%rax),%xmm8
 488     vmovdqa64   `3*16`(%rax),%xmm9
 489     vmovdqa64   `4*16`(%rax),%xmm10
 490     vmovdqa64   `5*16`(%rax),%xmm11
 491     vmovdqa64   `6*16`(%rax),%xmm12
 492     vmovdqa64   `7*16`(%rax),%xmm13
 493     vmovdqa64   `8*16`(%rax),%xmm14
 494     vmovdqa64   `9*16`(%rax),%xmm15
 495     lea  168(%rsp),%rax
 496 ___
     # Reload the pushed registers with plain loads relative to %rax (reverse
     # push order), then release the 48 bytes of pushes in one step.
 497 $code.=<<___;
 498     mov  0(%rax),%r15
 499 .cfi_restore    %r15
 500     mov  8(%rax),%r14
 501 .cfi_restore    %r14
 502     mov  16(%rax),%r13
 503 .cfi_restore    %r13
 504     mov  24(%rax),%r12
 505 .cfi_restore    %r12
 506     mov  32(%rax),%rbp
 507 .cfi_restore    %rbp
 508     mov  40(%rax),%rbx
 509 .cfi_restore    %rbx
 510     lea  48(%rax),%rsp       # restore rsp
 511 .cfi_def_cfa %rsp,8
 512 .Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
 513
 514     ret
 515 .cfi_endproc
 516 .size   ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
 517 ___
 518
     # Constant used by the normalization code: four qwords of 2^52-1.
 519 $code.=<<___;
 520 .data
 521 .align 32
 522 .Lmask52x4:
 523     .quad   0xfffffffffffff
 524     .quad   0xfffffffffffff
 525     .quad   0xfffffffffffff
 526     .quad   0xfffffffffffff
 527 ___
 528
 529 ###############################################################################
 530 # Dual Almost Montgomery Multiplication for 40-digit number in radix 2^52
 531 #
 532 # See description of ossl_rsaz_amm52x40_x1_ifma256() above for details about Almost
 533 # Montgomery Multiplication algorithm and function input parameters description.
 534 #
 535 # This function does two AMMs for two independent inputs, hence dual.
 536 #
 537 # void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
 538 #                                    const BN_ULONG a[2][40],
 539 #                                    const BN_ULONG b[2][40],
 540 #                                    const BN_ULONG m[2][40],
 541 #                                    const BN_ULONG k0[2]);
 542 ###############################################################################
 543
     # Emit ossl_rsaz_amm52x40_x2_ifma256: two independent AMMs interleaved.
     # Prologue: push the six callee-saved general-purpose registers.
 544 $code.=<<___;
 545 .text
 546
 547 .globl  ossl_rsaz_amm52x40_x2_ifma256
 548 .type   ossl_rsaz_amm52x40_x2_ifma256,\@function,5
 549 .align 32
 550 ossl_rsaz_amm52x40_x2_ifma256:
 551 .cfi_startproc
 552     endbranch
 553     push    %rbx
 554 .cfi_push   %rbx
 555     push    %rbp
 556 .cfi_push   %rbp
 557     push    %r12
 558 .cfi_push   %r12
 559     push    %r13
 560 .cfi_push   %r13
 561     push    %r14
 562 .cfi_push   %r14
 563     push    %r15
 564 .cfi_push   %r15
 565 ___
     # Win64 only: reserve 168 bytes and spill %xmm6..%xmm15; the label after
     # the spills marks the start of the range covered by the SEH handler.
 566 $code.=<<___ if ($win64);
 567     lea     -168(%rsp),%rsp
 568     vmovdqa64   %xmm6, `0*16`(%rsp)        # save non-volatile registers
 569     vmovdqa64   %xmm7, `1*16`(%rsp)
 570     vmovdqa64   %xmm8, `2*16`(%rsp)
 571     vmovdqa64   %xmm9, `3*16`(%rsp)
 572     vmovdqa64   %xmm10,`4*16`(%rsp)
 573     vmovdqa64   %xmm11,`5*16`(%rsp)
 574     vmovdqa64   %xmm12,`6*16`(%rsp)
 575     vmovdqa64   %xmm13,`7*16`(%rsp)
 576     vmovdqa64   %xmm14,`8*16`(%rsp)
 577     vmovdqa64   %xmm15,`9*16`(%rsp)
 578 .Lossl_rsaz_amm52x40_x2_ifma256_body:
 579 ___
     # Clear both accumulator sets and both scalar accumulators, copy |b| into
     # $b_ptr, load the 52-bit mask and set up a 40-pass loop (one digit per
     # pass, not unrolled).
 580 $code.=<<___;
 581     # Zeroing accumulators
 582     vpxord   $zero, $zero, $zero
 583     vmovdqa64   $zero, $R0_0
 584     vmovdqa64   $zero, $R0_0h
 585     vmovdqa64   $zero, $R1_0
 586     vmovdqa64   $zero, $R1_0h
 587     vmovdqa64   $zero, $R2_0
 588     vmovdqa64   $zero, $R2_0h
 589     vmovdqa64   $zero, $R3_0
 590     vmovdqa64   $zero, $R3_0h
 591     vmovdqa64   $zero, $R4_0
 592     vmovdqa64   $zero, $R4_0h
 593
 594     vmovdqa64   $zero, $R0_1
 595     vmovdqa64   $zero, $R0_1h
 596     vmovdqa64   $zero, $R1_1
 597     vmovdqa64   $zero, $R1_1h
 598     vmovdqa64   $zero, $R2_1
 599     vmovdqa64   $zero, $R2_1h
 600     vmovdqa64   $zero, $R3_1
 601     vmovdqa64   $zero, $R3_1h
 602     vmovdqa64   $zero, $R4_1
 603     vmovdqa64   $zero, $R4_1h
 604
 605
 606     xorl    $acc0_0_low, $acc0_0_low
 607     xorl    $acc0_1_low, $acc0_1_low
 608
 609     movq    $b, $b_ptr                       # backup address of b
 610     movq    \$0xfffffffffffff, $mask52       # 52-bit mask
 611
 612     mov    \$40, $iter
 613
 614 .align 32
 615 .Lloop40:
 616 ___
     # Loop body: one digit for the first operand set (offset 0, k0 read from
     # memory at ($k0)) followed by one digit for the second set.
 617     &amm52x40_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)");
 618     # 40*8 = offset of the next dimension in two-dimension array
 619     &amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)");
     # Advance $b_ptr by one digit and loop.
 620 $code.=<<___;
 621     lea    8($b_ptr), $b_ptr
 622     dec    $iter
 623     jne    .Lloop40
 624 ___
     # Normalize each result separately.  The second call still finds its
     # scalar accumulator intact: the normalization code does not touch %r15.
 625     &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
 626     &amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h);
     # Store both results back to back (10 vectors each) and start the epilogue.
 627 $code.=<<___;
 628
 629     vmovdqu64   $R0_0,  `0*32`($res)
 630     vmovdqu64   $R0_0h, `1*32`($res)
 631     vmovdqu64   $R1_0,  `2*32`($res)
 632     vmovdqu64   $R1_0h, `3*32`($res)
 633     vmovdqu64   $R2_0,  `4*32`($res)
 634     vmovdqu64   $R2_0h, `5*32`($res)
 635     vmovdqu64   $R3_0,  `6*32`($res)
 636     vmovdqu64   $R3_0h, `7*32`($res)
 637     vmovdqu64   $R4_0,  `8*32`($res)
 638     vmovdqu64   $R4_0h, `9*32`($res)
 639
 640     vmovdqu64   $R0_1,  `10*32`($res)
 641     vmovdqu64   $R0_1h, `11*32`($res)
 642     vmovdqu64   $R1_1,  `12*32`($res)
 643     vmovdqu64   $R1_1h, `13*32`($res)
 644     vmovdqu64   $R2_1,  `14*32`($res)
 645     vmovdqu64   $R2_1h, `15*32`($res)
 646     vmovdqu64   $R3_1,  `16*32`($res)
 647     vmovdqu64   $R3_1h, `17*32`($res)
 648     vmovdqu64   $R4_1,  `18*32`($res)
 649     vmovdqu64   $R4_1h, `19*32`($res)
 650
 651     vzeroupper
 652     lea     (%rsp),%rax
 653 .cfi_def_cfa_register   %rax
 654 ___
     # Win64 only: reload %xmm6..%xmm15 and step %rax past the 168-byte area.
 655 $code.=<<___ if ($win64);
 656     vmovdqa64   `0*16`(%rax),%xmm6
 657     vmovdqa64   `1*16`(%rax),%xmm7
 658     vmovdqa64   `2*16`(%rax),%xmm8
 659     vmovdqa64   `3*16`(%rax),%xmm9
 660     vmovdqa64   `4*16`(%rax),%xmm10
 661     vmovdqa64   `5*16`(%rax),%xmm11
 662     vmovdqa64   `6*16`(%rax),%xmm12
 663     vmovdqa64   `7*16`(%rax),%xmm13
 664     vmovdqa64   `8*16`(%rax),%xmm14
 665     vmovdqa64   `9*16`(%rax),%xmm15
 666     lea     168(%rsp),%rax
 667 ___
     # Reload the pushed registers relative to %rax and release the frame.
 668 $code.=<<___;
 669     mov  0(%rax),%r15
 670 .cfi_restore    %r15
 671     mov  8(%rax),%r14
 672 .cfi_restore    %r14
 673     mov  16(%rax),%r13
 674 .cfi_restore    %r13
 675     mov  24(%rax),%r12
 676 .cfi_restore    %r12
 677     mov  32(%rax),%rbp
 678 .cfi_restore    %rbp
 679     mov  40(%rax),%rbx
 680 .cfi_restore    %rbx
 681     lea  48(%rax),%rsp
 682 .cfi_def_cfa    %rsp,8
 683 .Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
 684     ret
 685 .cfi_endproc
 686 .size   ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
 687 ___
     # End of the lexical scope holding the AMM register assignments.
 688 }
 689
 690 ###############################################################################
 691 # Constant time extraction from the precomputed table of powers base^i, where
 692 #    i = 0..2^EXP_WIN_SIZE-1
 693 #
 694 # The input |red_table| contains precomputations for two independent base values.
 695 # |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
 696 #
 697 # Extracted value (output) is 2 40 digits numbers in 2^52 radix.
 698 #
 699 # void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
 700 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
 701 #                                        int red_table_idx1, int red_table_idx2);
 702 #
 703 # EXP_WIN_SIZE = 5
 704 ###############################################################################
 705 {
     # Lexical scope holding the register assignments for the table-extraction
     # routine.  The routine is declared \@abi-omnipotent, so the argument
     # registers are chosen here per platform.
 706 # input parameters
 707 my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
 708                                                         ("%rdi","%rsi","%rdx","%rcx");  # Unix order
 709
     # Ten ymm registers receiving the extracted 40 digits: %ymm0..%ymm5 and
     # %ymm16..%ymm19.
 710 my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
 711 my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
     # Scratch for the loaded table row, the running index, the two broadcast
     # target indexes and a vector of ones used as the index increment.
 712 my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
 713
     # The ten result registers as a list, for indexed access in loops.
 714 my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
     # xmm alias of $t0, used with vpxor to zero the register.
 715 my $t0xmm = $t0;
 716 $t0xmm =~ s/%y/%x/;
 717
 718 sub get_table_value_consttime() {
     # Appends to $code a loop that walks every entry of the table and keeps the
     # one whose position equals the requested index, so the memory access
     # pattern does not depend on the index.
     #
     # Arguments:
     #   _idx    - ymm register holding the broadcast index to select;
     #   _offset - byte offset of the wanted number inside each table entry
     #             (callers pass 0 and 40*8); it is also appended to the loop
     #             label so each expansion gets a distinct label.
     #
     # The emitted loop zeroes $cur_idx, then per entry: builds mask %k1 of
     # (idx == cur_idx), loads ten 32-byte rows and merges them into @t under
     # %k1, increments $cur_idx by $ones and advances $red_tbl by one entry
     # (2*40*8 bytes).  It stops when $red_tbl equals %rax, so the caller must
     # have placed the table end address in %rax, and $red_tbl is left pointing
     # at the table end.
     #
     # The empty prototype is not enforced: callers use the &name(...) form.
 719 my ($_idx,$_offset) = @_;
 720 $code.=<<___;
 721     vpxorq   $cur_idx, $cur_idx, $cur_idx
 722 .align 32
 723 .Lloop_$_offset:
 724     vpcmpq  \$0, $cur_idx, $_idx, %k1      # mask of (idx == cur_idx)
 725 ___
     # One load + masked merge per result register.
 726 foreach (0..9) {
 727 $code.=<<___;
 728     vmovdqu64  `$_offset+${_}*32`($red_tbl), $tmp   # load data from red_tbl
 729     vpblendmq  $tmp, $t[$_], ${t[$_]}{%k1}          # extract data when mask is not zero
 730 ___
 731 }
 732 $code.=<<___;
 733     vpaddq  $ones, $cur_idx, $cur_idx # increment cur_idx
 734     addq    \$`2*40*8`, $red_tbl
 735     cmpq    $red_tbl, %rax
 736     jne .Lloop_$_offset
 737 ___
 738 }
 739
     # Emit ossl_extract_multiplier_2x40_win5.  Setup: load the vector of ones,
     # broadcast both indexes, compute the table end address into %rax
     # ((1<<5) entries of 2*40*8 bytes), keep a copy of the table pointer in
     # %r10 and zero the first result register.
 740 $code.=<<___;
 741 .text
 742
 743 .align 32
 744 .globl  ossl_extract_multiplier_2x40_win5
 745 .type   ossl_extract_multiplier_2x40_win5,\@abi-omnipotent
 746 ossl_extract_multiplier_2x40_win5:
 747 .cfi_startproc
 748     endbranch
 749     vmovdqa64   .Lones(%rip), $ones         # broadcast ones
 750     vpbroadcastq    $red_tbl_idx1, $idx1
 751     vpbroadcastq    $red_tbl_idx2, $idx2
 752     leaq   `(1<<5)*2*40*8`($red_tbl), %rax  # holds end of the tbl
 753
 754     # backup red_tbl address
 755     movq    $red_tbl, %r10
 756
 757     # zeroing t0..n, cur_idx
 758     vpxor   $t0xmm, $t0xmm, $t0xmm
 759 ___
     # Zero the remaining nine result registers by copying the zeroed $t0.
 760 foreach (1..9) {
 761     $code.="vmovdqa64   $t0, $t[$_] \n";
 762 }
 763
     # First number: scan the table for idx1 at entry offset 0 and store the ten
     # result vectors to the first half of |out|.
 764 &get_table_value_consttime($idx1, 0);
 765 foreach (0..9) {
 766     $code.="vmovdqu64   $t[$_], `(0+$_)*32`($out) \n";
 767 }
     # The scan advanced $red_tbl to the table end; rewind it from %r10.
     # The result registers are not cleared again before the second scan.
 768 $code.="movq    %r10, $red_tbl \n";
     # Second number: scan for idx2 at entry offset 40*8 and store to the second
     # half of |out|.
 769 &get_table_value_consttime($idx2, 40*8);
 770 foreach (0..9) {
 771     $code.="vmovdqu64   $t[$_], `(10+$_)*32`($out) \n";
 772 }
 773 $code.=<<___;
 774
 775     ret
 776 .cfi_endproc
 777 .size   ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
 778 ___
     # Constants: .Lones is the index increment used by the extraction loop;
     # .Lzeros is referenced by the AMM normalization code.
 779 $code.=<<___;
 780 .data
 781 .align 32
 782 .Lones:
 783     .quad   1,1,1,1
 784 .Lzeros:
 785     .quad   0,0,0,0
 786 ___
     # End of the lexical scope holding the extraction register assignments.
 787 }
 788
 789 if ($win64) {
     # Win64 structured-exception support for the two AMM entry points.
     #
     # rsaz_avx_handler is the handler named in the .xdata records emitted at
     # the end of this block.  Each record supplies two RVAs as handler data:
     # the function's "_body" label and its "_epilogue" label.  When the faulting
     # Rip lies in [body, epilogue) the handler copies the ten spilled xmm
     # registers from the stack into the context record, steps over the frame
     # (48 bytes of pushes + 168 bytes of xmm save area, matching the prologues
     # emitted above) and writes the saved %rbx, %rbp and %r12..%r15 back into
     # the context.  In all cases it then restores Rsp/Rsi/Rdi in the context,
     # copies the context to disp->ContextRecord, calls RtlVirtualUnwind and
     # returns 1 (ExceptionContinueSearch).
     #
     # The four variables below name the handler's incoming argument registers.
 790 $rec="%rcx";
 791 $frame="%rdx";
 792 $context="%r8";
 793 $disp="%r9";
 794
 795 $code.=<<___;
 796 .extern     __imp_RtlVirtualUnwind
 797 .type   rsaz_avx_handler,\@abi-omnipotent
 798 .align  16
 799 rsaz_avx_handler:
 800     push    %rsi
 801     push    %rdi
 802     push    %rbx
 803     push    %rbp
 804     push    %r12
 805     push    %r13
 806     push    %r14
 807     push    %r15
 808     pushfq
 809     sub     \$64,%rsp
 810
 811     mov     120($context),%rax # pull context->Rax
 812     mov     248($context),%rbx # pull context->Rip
 813
 814     mov     8($disp),%rsi      # disp->ImageBase
 815     mov     56($disp),%r11     # disp->HandlerData
 816
 817     mov     0(%r11),%r10d      # HandlerData[0]
 818     lea     (%rsi,%r10),%r10   # prologue label
 819     cmp     %r10,%rbx          # context->Rip<.Lprologue
 820     jb  .Lcommon_seh_tail
 821
 822     mov     4(%r11),%r10d      # HandlerData[1]
 823     lea     (%rsi,%r10),%r10   # epilogue label
 824     cmp     %r10,%rbx          # context->Rip>=.Lepilogue
 825     jae     .Lcommon_seh_tail
 826
 827     mov     152($context),%rax # pull context->Rsp
 828
 829     lea     (%rax),%rsi         # %xmm save area
 830     lea     512($context),%rdi  # & context.Xmm6
 831     mov     \$20,%ecx           # 10*sizeof(%xmm0)/sizeof(%rax)
 832     .long   0xa548f3fc          # cld; rep movsq
 833
 834     lea     `48+168`(%rax),%rax
 835
 836     mov     -8(%rax),%rbx
 837     mov     -16(%rax),%rbp
 838     mov     -24(%rax),%r12
 839     mov     -32(%rax),%r13
 840     mov     -40(%rax),%r14
 841     mov     -48(%rax),%r15
 842     mov     %rbx,144($context) # restore context->Rbx
 843     mov     %rbp,160($context) # restore context->Rbp
 844     mov     %r12,216($context) # restore context->R12
 845     mov     %r13,224($context) # restore context->R13
 846     mov     %r14,232($context) # restore context->R14
 847     mov     %r15,240($context) # restore context->R14
 848
 849 .Lcommon_seh_tail:
 850     mov     8(%rax),%rdi
 851     mov     16(%rax),%rsi
 852     mov     %rax,152($context) # restore context->Rsp
 853     mov     %rsi,168($context) # restore context->Rsi
 854     mov     %rdi,176($context) # restore context->Rdi
 855
 856     mov     40($disp),%rdi     # disp->ContextRecord
 857     mov     $context,%rsi      # context
 858     mov     \$154,%ecx         # sizeof(CONTEXT)
 859     .long   0xa548f3fc         # cld; rep movsq
 860
 861     mov     $disp,%rsi
 862     xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
 863     mov     8(%rsi),%rdx       # arg2, disp->ImageBase
 864     mov     0(%rsi),%r8        # arg3, disp->ControlPc
 865     mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
 866     mov     40(%rsi),%r10      # disp->ContextRecord
 867     lea     56(%rsi),%r11      # &disp->HandlerData
 868     lea     24(%rsi),%r12      # &disp->EstablisherFrame
 869     mov     %r10,32(%rsp)      # arg5
 870     mov     %r11,40(%rsp)      # arg6
 871     mov     %r12,48(%rsp)      # arg7
 872     mov     %rcx,56(%rsp)      # arg8, (NULL)
 873     call    *__imp_RtlVirtualUnwind(%rip)
 874
 875     mov     \$1,%eax           # ExceptionContinueSearch
 876     add     \$64,%rsp
 877     popfq
 878     pop     %r15
 879     pop     %r14
 880     pop     %r13
 881     pop     %r12
 882     pop     %rbp
 883     pop     %rbx
 884     pop     %rdi
 885     pop     %rsi
 886     ret
 887 .size   rsaz_avx_handler,.-rsaz_avx_handler
 888
 889 .section    .pdata
 890 .align  4
 891     .rva    .LSEH_begin_ossl_rsaz_amm52x40_x1_ifma256
 892     .rva    .LSEH_end_ossl_rsaz_amm52x40_x1_ifma256
 893     .rva    .LSEH_info_ossl_rsaz_amm52x40_x1_ifma256
 894
 895     .rva    .LSEH_begin_ossl_rsaz_amm52x40_x2_ifma256
 896     .rva    .LSEH_end_ossl_rsaz_amm52x40_x2_ifma256
 897     .rva    .LSEH_info_ossl_rsaz_amm52x40_x2_ifma256
 898
 899 .section    .xdata
 900 .align  8
 901 .LSEH_info_ossl_rsaz_amm52x40_x1_ifma256:
 902     .byte   9,0,0,0
 903     .rva    rsaz_avx_handler
 904     .rva    .Lossl_rsaz_amm52x40_x1_ifma256_body,.Lossl_rsaz_amm52x40_x1_ifma256_epilogue
 905 .LSEH_info_ossl_rsaz_amm52x40_x2_ifma256:
 906     .byte   9,0,0,0
 907     .rva    rsaz_avx_handler
 908     .rva    .Lossl_rsaz_amm52x40_x2_ifma256_body,.Lossl_rsaz_amm52x40_x2_ifma256_epilogue
 909 ___
 910 }
 911 }}} else {{{                # fallback for old assembler
     # No capable assembler was detected: still define all three public symbols
     # so the library links, but make them share one body that executes ud2
     # (an invalid-opcode trap) if ever called.  Only the first symbol gets
     # .type/.size directives.
 912 $code.=<<___;
 913 .text
 914
 915 .globl  ossl_rsaz_amm52x40_x1_ifma256
 916 .globl  ossl_rsaz_amm52x40_x2_ifma256
 917 .globl  ossl_extract_multiplier_2x40_win5
 918 .type   ossl_rsaz_amm52x40_x1_ifma256,\@abi-omnipotent
 919 ossl_rsaz_amm52x40_x1_ifma256:
 920 ossl_rsaz_amm52x40_x2_ifma256:
 921 ossl_extract_multiplier_2x40_win5:
 922     .byte   0x0f,0x0b    # ud2
 923     ret
 924 .size   ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
 925 ___
 926 }}}
 927
     # Replace every `...` span in the generated text with the result of
     # evaluating it as Perl (this resolves the offset arithmetic written in
     # backticks above), then send the text to the translator pipe.  Closing
     # STDOUT is checked so a failure of the pipe is reported.
 928 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 929 print $code;
 930 close STDOUT or die "error closing STDOUT: $!";