sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S

   1 /* Function expf vectorized with AVX-512. KNL and SKX versions.
   2    Copyright (C) 2014-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_expf_data.h"
  21 #include "svml_s_wrapper_impl.h"
  22
  23         .text
  24 ENTRY (_ZGVeN16v_expf_knl)
  25 #ifndef HAVE_AVX512DQ_ASM_SUPPORT
  26 WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
  27 #else /* HAVE_AVX512DQ_ASM_SUPPORT */
  28 /* In: %zmm0 = 16 packed floats x.  Out: %zmm0 = expf (x), lane by lane.
  29    ALGORITHM DESCRIPTION:
  30
  31      Argument representation:
  32      M = rint(X*2^k/ln2) = 2^k*N+j
  33      X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
  34      then -ln2/2^(k+1) < r < ln2/2^(k+1)
  35      Alternatively:
  36      M = trunc(X*2^k/ln2)
  37      then 0 < r < ln2/2^k
  38
  39      Result calculation:
  40      exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
  41      = 2^N * 2^(j/2^k) * exp(r)
  42      2^N is calculated by bit manipulation
  43      2^(j/2^k) is computed from table lookup
  44      exp(r) is approximated by polynomial
  45
  46      The table lookup is skipped if k = 0.
  47      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
  48 /* Lanes with |x| (as integer bits) > __iDomainRange are redone by scalar expf.  */
  49         pushq     %rbp
  50         cfi_adjust_cfa_offset (8)
  51         cfi_rel_offset (%rbp, 0)
  52         movq      %rsp, %rbp
  53         cfi_def_cfa_register (%rbp)
  54         andq      $-64, %rsp            /* 64-byte aligned frame */
  55         subq      $1280, %rsp
  56         movq      __svml_sexp_data@GOTPCREL(%rip), %rax   /* %rax = constant table */
  57 /* Slow-path frame use: 1152(%rsp) = copy of x, 1216(%rsp) = result, below = spills.  */
  58 /* r = x-n*ln2_hi/2^k */
  59         vmovaps   %zmm0, %zmm6
  60
  61 /* compare against threshold: %ecx = all-ones, broadcast below into out-of-range lanes */
  62         movl      $-1, %ecx
  63         vmovups   __sInvLn2(%rax), %zmm3
  64         vmovups   __sLn2hi(%rax), %zmm5
  65
  66 /* m = x*2^k/ln2 + shifter */
  67         vfmadd213ps __sShifter(%rax), %zmm0, %zmm3
  68         vmovups     __sPC5(%rax), %zmm9
  69
  70 /* n = m - shifter = rint(x*2^k/ln2) */
  71         vsubps    __sShifter(%rax), %zmm3, %zmm7
  72
  73 /* remove sign of x by "and" operation */
  74         vpandd   __iAbsMask(%rax), %zmm0, %zmm1
  75         vpaddd   __iBias(%rax), %zmm3, %zmm4
  76         vpcmpgtd __iDomainRange(%rax), %zmm1, %k1   /* %k1 = lanes with |x| > range */
  77
  78 /* compute 2^N with "shift" */
  79         vpslld       $23, %zmm4, %zmm8
  80         vfnmadd231ps %zmm7, %zmm5, %zmm6
  81         vpbroadcastd %ecx, %zmm2{%k1}{z}    /* all-ones in %k1 lanes, zero elsewhere */
  82
  83 /* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
  84         vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7
  85
  86 /* set mask for overflow/underflow */
  87         vptestmd  %zmm2, %zmm2, %k0
  88         kmovw     %k0, %ecx             /* bit i = lane i is special; bits 16..31 are 0 */
  89
  90 /* c5*r+c4 */
  91         vfmadd213ps __sPC4(%rax), %zmm7, %zmm9
  92
  93 /* (c5*r+c4)*r+c3 */
  94         vfmadd213ps __sPC3(%rax), %zmm7, %zmm9
  95
  96 /* ((c5*r+c4)*r+c3)*r+c2 */
  97         vfmadd213ps __sPC2(%rax), %zmm7, %zmm9
  98
  99 /* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
 100         vfmadd213ps __sPC1(%rax), %zmm7, %zmm9
 101
 102 /* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
 103         vfmadd213ps __sPC0(%rax), %zmm7, %zmm9
 104
 105 /* 2^N*exp(r) */
 106         vmulps    %zmm9, %zmm8, %zmm1
 107         testl     %ecx, %ecx
 108         jne       .LBL_1_3              /* some lane needs the scalar fallback */
 109
 110 .LBL_1_2:
 111         cfi_remember_state
 112         vmovaps   %zmm1, %zmm0
 113         movq      %rbp, %rsp
 114         cfi_def_cfa_register (%rsp)
 115         popq      %rbp
 116         cfi_adjust_cfa_offset (-8)
 117         cfi_restore (%rbp)
 118         ret
 119
 120 .LBL_1_3:
 121         cfi_restore_state
 122         vmovups   %zmm0, 1152(%rsp)     /* x, read back lane by lane below */
 123         vmovups   %zmm1, 1216(%rsp)     /* vector result, patched lane by lane below */
 124 /* Reached only through the jne above, so the mask in %ecx is known to be non-zero.  */
 125
 126 /* Spill %k4-%k7, %zmm16-%zmm31, %rsi, %rdi, %r12-%r15 around the scalar calls.  */
 127         kmovw     %k4, 1048(%rsp)
 129         kmovw     %k5, 1040(%rsp)
 130         kmovw     %k6, 1032(%rsp)
 131         kmovw     %k7, 1024(%rsp)
 132         vmovups   %zmm16, 960(%rsp)
 133         vmovups   %zmm17, 896(%rsp)
 134         vmovups   %zmm18, 832(%rsp)
 135         vmovups   %zmm19, 768(%rsp)
 136         vmovups   %zmm20, 704(%rsp)
 137         vmovups   %zmm21, 640(%rsp)
 138         vmovups   %zmm22, 576(%rsp)
 139         vmovups   %zmm23, 512(%rsp)
 140         vmovups   %zmm24, 448(%rsp)
 141         vmovups   %zmm25, 384(%rsp)
 142         vmovups   %zmm26, 320(%rsp)
 143         vmovups   %zmm27, 256(%rsp)
 144         vmovups   %zmm28, 192(%rsp)
 145         vmovups   %zmm29, 128(%rsp)
 146         vmovups   %zmm30, 64(%rsp)
 147         vmovups   %zmm31, (%rsp)
 148         movq      %rsi, 1064(%rsp)
 149         movq      %rdi, 1056(%rsp)
 150         movq      %r12, 1096(%rsp)
 151         cfi_offset_rel_rsp (12, 1096)
 152         xorl      %r12d, %r12d          /* %r12b = i, index of the lane pair (2i, 2i+1) */
 153         movq      %r13, 1088(%rsp)
 154         cfi_offset_rel_rsp (13, 1088)
 155         movl      %ecx, %r13d           /* %r13d = special-lane mask */
 156         movq      %r14, 1080(%rsp)
 157         cfi_offset_rel_rsp (14, 1080)
 158         xorl      %r14d, %r14d          /* %r14d = 2i, mask bit of the even lane */
 159         movq      %r15, 1072(%rsp)
 160         cfi_offset_rel_rsp (15, 1072)
 161         cfi_remember_state
 162
 163 .LBL_1_6:
 164         btl       %r14d, %r13d          /* even lane 2i special? */
 165         jc        .LBL_1_12
 166
 167 .LBL_1_7:
 168         lea       1(%r14), %esi
 169         btl       %esi, %r13d           /* odd lane 2i+1 special? */
 170         jc        .LBL_1_10
 171
 172 .LBL_1_8:
 173         addb      $1, %r12b
 174         addl      $2, %r14d
 175         cmpb      $8, %r12b             /* 8 pairs = all 16 mask bits kmovw can set */
 176         jb        .LBL_1_6
 177 /* Every special lane is patched: restore the spilled registers, reload the result.  */
 178         kmovw     1048(%rsp), %k4
 179         movq      1064(%rsp), %rsi
 180         kmovw     1040(%rsp), %k5
 181         movq      1056(%rsp), %rdi
 182         kmovw     1032(%rsp), %k6
 183         movq      1096(%rsp), %r12
 184         cfi_restore (%r12)
 185         movq      1088(%rsp), %r13
 186         cfi_restore (%r13)
 187         kmovw     1024(%rsp), %k7
 188         vmovups   960(%rsp), %zmm16
 189         vmovups   896(%rsp), %zmm17
 190         vmovups   832(%rsp), %zmm18
 191         vmovups   768(%rsp), %zmm19
 192         vmovups   704(%rsp), %zmm20
 193         vmovups   640(%rsp), %zmm21
 194         vmovups   576(%rsp), %zmm22
 195         vmovups   512(%rsp), %zmm23
 196         vmovups   448(%rsp), %zmm24
 197         vmovups   384(%rsp), %zmm25
 198         vmovups   320(%rsp), %zmm26
 199         vmovups   256(%rsp), %zmm27
 200         vmovups   192(%rsp), %zmm28
 201         vmovups   128(%rsp), %zmm29
 202         vmovups   64(%rsp), %zmm30
 203         vmovups   (%rsp), %zmm31
 204         movq      1080(%rsp), %r14
 205         cfi_restore (%r14)
 206         movq      1072(%rsp), %r15
 207         cfi_restore (%r15)
 208         vmovups   1216(%rsp), %zmm1
 209         jmp       .LBL_1_2
 210
 211 .LBL_1_10:
 212         cfi_restore_state
 213         movzbl    %r12b, %r15d
 214         vmovss    1156(%rsp,%r15,8), %xmm0      /* %xmm0 = x[2i+1] */
 215         call      JUMPTARGET(expf)
 216         vmovss    %xmm0, 1220(%rsp,%r15,8)      /* result[2i+1] = scalar expf value */
 217         jmp       .LBL_1_8
 218
 219 .LBL_1_12:
 220         movzbl    %r12b, %r15d
 221         vmovss    1152(%rsp,%r15,8), %xmm0      /* %xmm0 = x[2i] */
 222         call      JUMPTARGET(expf)
 223         vmovss    %xmm0, 1216(%rsp,%r15,8)      /* result[2i] = scalar expf value */
 224         jmp       .LBL_1_7
 225
 226 #endif /* HAVE_AVX512DQ_ASM_SUPPORT */
 227 END (_ZGVeN16v_expf_knl)
 228
 229 ENTRY (_ZGVeN16v_expf_skx)
 230 #ifndef HAVE_AVX512DQ_ASM_SUPPORT
 231 WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
 232 #else /* HAVE_AVX512DQ_ASM_SUPPORT */
 233 /* In: %zmm0 = 16 packed floats x.  Out: %zmm0 = expf (x), lane by lane.
 234    ALGORITHM DESCRIPTION:
 235
 236      Argument representation:
 237      M = rint(X*2^k/ln2) = 2^k*N+j
 238      X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
 239      then -ln2/2^(k+1) < r < ln2/2^(k+1)
 240      Alternatively:
 241      M = trunc(X*2^k/ln2)
 242      then 0 < r < ln2/2^k
 243
 244      Result calculation:
 245      exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
 246      = 2^N * 2^(j/2^k) * exp(r)
 247      2^N is calculated by bit manipulation
 248      2^(j/2^k) is computed from table lookup
 249      exp(r) is approximated by polynomial
 250
 251      The table lookup is skipped if k = 0.
 252      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
 253 /* Lanes with |x| (as integer bits) > __iDomainRange are redone by scalar expf.  */
 254         pushq     %rbp
 255         cfi_adjust_cfa_offset (8)
 256         cfi_rel_offset (%rbp, 0)
 257         movq      %rsp, %rbp
 258         cfi_def_cfa_register (%rbp)
 259         andq      $-64, %rsp            /* 64-byte aligned frame */
 260         subq      $1280, %rsp
 261         movq      __svml_sexp_data@GOTPCREL(%rip), %rax   /* %rax = constant table */
 262 /* Slow-path frame use: 1152(%rsp) = copy of x, 1216(%rsp) = result, below = spills.  */
 263 /* r = x-n*ln2_hi/2^k */
 264         vmovaps   %zmm0, %zmm7
 265
 266 /* compare against threshold: %zmm3 = all-ones, cleared below in the in-range lanes */
 267         vmovups   .L_2il0floatpacket.13(%rip), %zmm3
 268         vmovups __sInvLn2(%rax), %zmm4
 269         vmovups __sShifter(%rax), %zmm1
 270         vmovups __sLn2hi(%rax), %zmm6
 271         vmovups __sPC5(%rax), %zmm10
 272
 273 /* m = x*2^k/ln2 + shifter */
 274         vfmadd213ps %zmm1, %zmm0, %zmm4
 275
 276 /* n = m - shifter = rint(x*2^k/ln2) */
 277         vsubps    %zmm1, %zmm4, %zmm8
 278         vpaddd __iBias(%rax), %zmm4, %zmm5
 279         vfnmadd231ps %zmm8, %zmm6, %zmm7
 280
 281 /* compute 2^N with "shift" */
 282         vpslld    $23, %zmm5, %zmm9
 283
 284 /* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
 285         vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8
 286
 287 /* c5*r+c4 */
 288         vfmadd213ps __sPC4(%rax), %zmm8, %zmm10
 289
 290 /* (c5*r+c4)*r+c3 */
 291         vfmadd213ps __sPC3(%rax), %zmm8, %zmm10
 292
 293 /* ((c5*r+c4)*r+c3)*r+c2 */
 294         vfmadd213ps __sPC2(%rax), %zmm8, %zmm10
 295
 296 /* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
 297         vfmadd213ps __sPC1(%rax), %zmm8, %zmm10
 298
 299 /* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
 300         vfmadd213ps __sPC0(%rax), %zmm8, %zmm10
 301
 302 /* 2^N*exp(r) */
 303         vmulps    %zmm10, %zmm9, %zmm1
 304
 305 /* remove sign of x by "and" operation */
 306         vpandd __iAbsMask(%rax), %zmm0, %zmm2
 307         vpcmpd    $2, __iDomainRange(%rax), %zmm2, %k1   /* %k1 = lanes with |x| <= range */
 308         vpandnd   %zmm2, %zmm2, %zmm3{%k1}      /* those lanes: %zmm3 = ~|x| & |x| = 0 */
 309
 310 /* set mask for overflow/underflow */
 311         vptestmd  %zmm3, %zmm3, %k0
 312         kmovw     %k0, %ecx             /* bit i = lane i is special; bits 16..31 are 0 */
 313         testl     %ecx, %ecx
 314         jne       .LBL_2_3              /* some lane needs the scalar fallback */
 315
 316 .LBL_2_2:
 317         cfi_remember_state
 318         vmovaps   %zmm1, %zmm0
 319         movq      %rbp, %rsp
 320         cfi_def_cfa_register (%rsp)
 321         popq      %rbp
 322         cfi_adjust_cfa_offset (-8)
 323         cfi_restore (%rbp)
 324         ret
 325
 326 .LBL_2_3:
 327         cfi_restore_state
 328         vmovups   %zmm0, 1152(%rsp)     /* x, read back lane by lane below */
 329         vmovups   %zmm1, 1216(%rsp)     /* vector result, patched lane by lane below */
 330 /* Reached only through the jne above, so the mask in %ecx is known to be non-zero.  */
 331
 332 /* Spill %k4-%k7, %zmm16-%zmm31, %rsi, %rdi, %r12-%r15 around the scalar calls.  */
 334         kmovw     %k4, 1048(%rsp)
 335         kmovw     %k5, 1040(%rsp)
 336         kmovw     %k6, 1032(%rsp)
 337         kmovw     %k7, 1024(%rsp)
 338         vmovups   %zmm16, 960(%rsp)
 339         vmovups   %zmm17, 896(%rsp)
 340         vmovups   %zmm18, 832(%rsp)
 341         vmovups   %zmm19, 768(%rsp)
 342         vmovups   %zmm20, 704(%rsp)
 343         vmovups   %zmm21, 640(%rsp)
 344         vmovups   %zmm22, 576(%rsp)
 345         vmovups   %zmm23, 512(%rsp)
 346         vmovups   %zmm24, 448(%rsp)
 347         vmovups   %zmm25, 384(%rsp)
 348         vmovups   %zmm26, 320(%rsp)
 349         vmovups   %zmm27, 256(%rsp)
 350         vmovups   %zmm28, 192(%rsp)
 351         vmovups   %zmm29, 128(%rsp)
 352         vmovups   %zmm30, 64(%rsp)
 353         vmovups   %zmm31, (%rsp)
 354         movq      %rsi, 1064(%rsp)
 355         movq      %rdi, 1056(%rsp)
 356         movq      %r12, 1096(%rsp)
 357         cfi_offset_rel_rsp (12, 1096)
 358         xorl      %r12d, %r12d          /* %r12b = i, index of the lane pair (2i, 2i+1) */
 359         movq      %r13, 1088(%rsp)
 360         cfi_offset_rel_rsp (13, 1088)
 361         movl      %ecx, %r13d           /* %r13d = special-lane mask */
 362         movq      %r14, 1080(%rsp)
 363         cfi_offset_rel_rsp (14, 1080)
 364         xorl      %r14d, %r14d          /* %r14d = 2i, mask bit of the even lane */
 365         movq      %r15, 1072(%rsp)
 366         cfi_offset_rel_rsp (15, 1072)
 367         cfi_remember_state
 368
 369
 370 .LBL_2_6:
 371         btl       %r14d, %r13d          /* even lane 2i special? */
 372         jc        .LBL_2_12
 373
 374 .LBL_2_7:
 375         lea       1(%r14), %esi
 376         btl       %esi, %r13d           /* odd lane 2i+1 special? */
 377         jc        .LBL_2_10
 378
 379 .LBL_2_8:
 380         incb      %r12b
 381         addl      $2, %r14d
 382         cmpb      $8, %r12b             /* 8 pairs = all 16 mask bits kmovw can set */
 383         jb        .LBL_2_6
 384 /* Every special lane is patched: restore the spilled registers, reload the result.  */
 385         kmovw     1048(%rsp), %k4
 386         kmovw     1040(%rsp), %k5
 387         kmovw     1032(%rsp), %k6
 388         kmovw     1024(%rsp), %k7
 389         vmovups   960(%rsp), %zmm16
 390         vmovups   896(%rsp), %zmm17
 391         vmovups   832(%rsp), %zmm18
 392         vmovups   768(%rsp), %zmm19
 393         vmovups   704(%rsp), %zmm20
 394         vmovups   640(%rsp), %zmm21
 395         vmovups   576(%rsp), %zmm22
 396         vmovups   512(%rsp), %zmm23
 397         vmovups   448(%rsp), %zmm24
 398         vmovups   384(%rsp), %zmm25
 399         vmovups   320(%rsp), %zmm26
 400         vmovups   256(%rsp), %zmm27
 401         vmovups   192(%rsp), %zmm28
 402         vmovups   128(%rsp), %zmm29
 403         vmovups   64(%rsp), %zmm30
 404         vmovups   (%rsp), %zmm31
 405         vmovups   1216(%rsp), %zmm1
 406         movq      1064(%rsp), %rsi
 407         movq      1056(%rsp), %rdi
 408         movq      1096(%rsp), %r12
 409         cfi_restore (%r12)
 410         movq      1088(%rsp), %r13
 411         cfi_restore (%r13)
 412         movq      1080(%rsp), %r14
 413         cfi_restore (%r14)
 414         movq      1072(%rsp), %r15
 415         cfi_restore (%r15)
 416         jmp       .LBL_2_2
 417
 418 .LBL_2_10:
 419         cfi_restore_state
 420         movzbl    %r12b, %r15d
 421 /* Clear the upper vector state before entering scalar code, then load the odd lane.  */
 422         vzeroupper
 423         vmovss    1156(%rsp,%r15,8), %xmm0      /* %xmm0 = x[2i+1] */
 424
 425         call      JUMPTARGET(expf)
 426
 427         vmovss    %xmm0, 1220(%rsp,%r15,8)      /* result[2i+1] = scalar expf value */
 428         jmp       .LBL_2_8
 429
 430 .LBL_2_12:
 431         movzbl    %r12b, %r15d
 432 /* Clear the upper vector state before entering scalar code, then load the even lane.  */
 433         vzeroupper
 434         vmovss    1152(%rsp,%r15,8), %xmm0      /* %xmm0 = x[2i] */
 435
 436         call      JUMPTARGET(expf)
 437
 438         vmovss    %xmm0, 1216(%rsp,%r15,8)      /* result[2i] = scalar expf value */
 439         jmp       .LBL_2_7
 440
 441 #endif /* HAVE_AVX512DQ_ASM_SUPPORT */
 442 END (_ZGVeN16v_expf_skx)
 443
 444         .section .rodata, "a"           /* allocatable section named .rodata */
 445 .L_2il0floatpacket.13:                  /* 64 bytes of all-ones: initial %zmm3 mask of _ZGVeN16v_expf_skx */
 446         .fill   16, 4, 0xffffffff       /* sixteen 32-bit lanes, each 0xffffffff */
 447         .type   .L_2il0floatpacket.13,@object