sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S

   1 /* Function expf vectorized with AVX-512. KNL and SKX versions.
   2    Copyright (C) 2014-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_expf_data.h"
  21 #include "svml_s_wrapper_impl.h"
  22
  23         .text
  24 ENTRY (_ZGVeN16v_expf_knl)
  25 /*
  26    ALGORITHM DESCRIPTION:
  27
  28      Argument representation:
  29      M = rint(X*2^k/ln2) = 2^k*N+j
  30      X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
  31      then -ln2/2^(k+1) < r < ln2/2^(k+1)
  32      Alternatively:
  33      M = trunc(X*2^k/ln2)
  34      then 0 < r < ln2/2^k
  35
  36      Result calculation:
  37      exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
  38      = 2^N * 2^(j/2^k) * exp(r)
  39      2^N is calculated by bit manipulation
  40      2^(j/2^k) is computed from table lookup
  41      exp(r) is approximated by polynomial
  42
  43      The table lookup is skipped if k = 0.
  44      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
     /*
        INTERFACE AND FRAME LAYOUT:

          In:   %zmm0 = 16 packed single-precision arguments
          Out:  %zmm0 = 16 packed results

        Fast path: every lane is evaluated with the polynomial while a
        16-bit lane mask is built in %ecx; a bit is set when the
        sign-cleared bit pattern of x is greater (signed integer compare)
        than __iDomainRange.  A zero mask returns the vector result as is.

        Slow path (.LBL_1_3): the input and the vector result are spilled
        to the frame, each flagged lane is recomputed with scalar expf and
        patched into the spilled result, which is then reloaded.

        Offsets from the 64-byte aligned %rsp (frame size 1280):
             0 .. 1023   %zmm31 .. %zmm16
          1024 .. 1048   %k7, %k6, %k5, %k4
          1056,   1064   %rdi, %rsi
          1072 .. 1096   %r15, %r14, %r13, %r12
          1152           copy of the input vector  (64 bytes)
          1216           copy of the result vector (64 bytes)

        Slow-path register roles:
          %r12b = lane-pair index i (0..7)
          %r13d = lane mask
          %r14d = bit index of the even lane of the pair (2*i)
          %r15  = zero-extended copy of i used for addressing  */
  45
  46         pushq     %rbp
  47         cfi_adjust_cfa_offset (8)
  48         cfi_rel_offset (%rbp, 0)
  49         movq      %rsp, %rbp
  50         cfi_def_cfa_register (%rbp)
     /* Align the frame to 64 bytes so the ZMM spill slots are aligned.  */
  51         andq      $-64, %rsp
  52         subq      $1280, %rsp
  53         movq      __svml_sexp_data@GOTPCREL(%rip), %rax
  54
  55 /* r = x-n*ln2_hi/2^k  (seed r with x; the FMA is issued further down) */
  56         vmovaps   %zmm0, %zmm6
  57
  58 /* compare against threshold: all-ones seed for the out-of-range lanes */
  59         movl      $-1, %ecx
  60         vmovups   __sInvLn2(%rax), %zmm3
  61         vmovups   __sLn2hi(%rax), %zmm5
  62
  63 /* m = x*2^k/ln2 + shifter */
  64         vfmadd213ps __sShifter(%rax), %zmm0, %zmm3
  65         vmovups     __sPC5(%rax), %zmm9
  66
  67 /* n = m - shifter = rint(x*2^k/ln2) */
  68         vsubps    __sShifter(%rax), %zmm3, %zmm7
  69
  70 /* remove sign of x by "and" operation */
  71         vpandd   __iAbsMask(%rax), %zmm0, %zmm1
  72         vpaddd   __iBias(%rax), %zmm3, %zmm4
  73         vpcmpgtd __iDomainRange(%rax), %zmm1, %k1
  74
  75 /* compute 2^N with "shift" */
  76         vpslld       $23, %zmm4, %zmm8
  77         vfnmadd231ps %zmm7, %zmm5, %zmm6
     /* zmm2 = all-ones in out-of-range lanes, zero elsewhere */
  78         vpbroadcastd %ecx, %zmm2{%k1}{z}
  79
  80 /* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
  81         vfnmadd132ps __sLn2lo(%rax), %zmm6, %zmm7
  82
  83 /* set mask for overflow/underflow (kmovw zero-extends: bits 16..31 = 0) */
  84         vptestmd  %zmm2, %zmm2, %k0
  85         kmovw     %k0, %ecx
  86
  87 /* c5*r+c4 */
  88         vfmadd213ps __sPC4(%rax), %zmm7, %zmm9
  89
  90 /* (c5*r+c4)*r+c3 */
  91         vfmadd213ps __sPC3(%rax), %zmm7, %zmm9
  92
  93 /* ((c5*r+c4)*r+c3)*r+c2 */
  94         vfmadd213ps __sPC2(%rax), %zmm7, %zmm9
  95
  96 /* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
  97         vfmadd213ps __sPC1(%rax), %zmm7, %zmm9
  98
  99 /* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
 100         vfmadd213ps __sPC0(%rax), %zmm7, %zmm9
 101
 102 /* 2^N*exp(r) */
 103         vmulps    %zmm9, %zmm8, %zmm1
     /* Any out-of-range lane sends us to the scalar fallback.  */
 104         testl     %ecx, %ecx
 105         jne       .LBL_1_3
 106
     /* Common exit: result is in %zmm1.  */
 107 .LBL_1_2:
 108         cfi_remember_state
 109         vmovaps   %zmm1, %zmm0
 110         movq      %rbp, %rsp
 111         cfi_def_cfa_register (%rsp)
 112         popq      %rbp
 113         cfi_adjust_cfa_offset (-8)
 114         cfi_restore (%rbp)
 115         ret
 116
     /* Slow path.  Only reached through the jne above, so the mask in
        %ecx is known to be non-zero here.  */
 117 .LBL_1_3:
 118         cfi_restore_state
 119         vmovups   %zmm0, 1152(%rsp)
 120         vmovups   %zmm1, 1216(%rsp)
 122
     /* Preserve the registers listed in the frame layout across the
        scalar calls and zero the loop counters.  */
 123         xorb      %dl, %dl
 124         kmovw     %k4, 1048(%rsp)
 125         xorl      %eax, %eax
 126         kmovw     %k5, 1040(%rsp)
 127         kmovw     %k6, 1032(%rsp)
 128         kmovw     %k7, 1024(%rsp)
 129         vmovups   %zmm16, 960(%rsp)
 130         vmovups   %zmm17, 896(%rsp)
 131         vmovups   %zmm18, 832(%rsp)
 132         vmovups   %zmm19, 768(%rsp)
 133         vmovups   %zmm20, 704(%rsp)
 134         vmovups   %zmm21, 640(%rsp)
 135         vmovups   %zmm22, 576(%rsp)
 136         vmovups   %zmm23, 512(%rsp)
 137         vmovups   %zmm24, 448(%rsp)
 138         vmovups   %zmm25, 384(%rsp)
 139         vmovups   %zmm26, 320(%rsp)
 140         vmovups   %zmm27, 256(%rsp)
 141         vmovups   %zmm28, 192(%rsp)
 142         vmovups   %zmm29, 128(%rsp)
 143         vmovups   %zmm30, 64(%rsp)
 144         vmovups   %zmm31, (%rsp)
 145         movq      %rsi, 1064(%rsp)
 146         movq      %rdi, 1056(%rsp)
 147         movq      %r12, 1096(%rsp)
 148         cfi_offset_rel_rsp (12, 1096)
 149         movb      %dl, %r12b
 150         movq      %r13, 1088(%rsp)
 151         cfi_offset_rel_rsp (13, 1088)
 152         movl      %ecx, %r13d
 153         movq      %r14, 1080(%rsp)
 154         cfi_offset_rel_rsp (14, 1080)
 155         movl      %eax, %r14d
 156         movq      %r15, 1072(%rsp)
 157         cfi_offset_rel_rsp (15, 1072)
 158         cfi_remember_state
 159
     /* Test the even lane of pair i (mask bit 2*i).  */
 160 .LBL_1_6:
 161         btl       %r14d, %r13d
 162         jc        .LBL_1_12
 163
     /* Test the odd lane of pair i (mask bit 2*i+1).  */
 164 .LBL_1_7:
 165         lea       1(%r14), %esi
 166         btl       %esi, %r13d
 167         jc        .LBL_1_10
 168
     /* Advance to the next pair.  The mask holds 16 lanes, i.e. 8 pairs;
        bits 16..31 of %r13d are always zero, so scanning further pairs
        could never trigger a call.  */
 169 .LBL_1_8:
 170         addb      $1, %r12b
 171         addl      $2, %r14d
 172         cmpb      $8, %r12b
 173         jb        .LBL_1_6
 174
     /* All flagged lanes patched: restore saved state and reload the
        corrected result vector.  */
 175         kmovw     1048(%rsp), %k4
 176         movq      1064(%rsp), %rsi
 177         kmovw     1040(%rsp), %k5
 178         movq      1056(%rsp), %rdi
 179         kmovw     1032(%rsp), %k6
 180         movq      1096(%rsp), %r12
 181         cfi_restore (%r12)
 182         movq      1088(%rsp), %r13
 183         cfi_restore (%r13)
 184         kmovw     1024(%rsp), %k7
 185         vmovups   960(%rsp), %zmm16
 186         vmovups   896(%rsp), %zmm17
 187         vmovups   832(%rsp), %zmm18
 188         vmovups   768(%rsp), %zmm19
 189         vmovups   704(%rsp), %zmm20
 190         vmovups   640(%rsp), %zmm21
 191         vmovups   576(%rsp), %zmm22
 192         vmovups   512(%rsp), %zmm23
 193         vmovups   448(%rsp), %zmm24
 194         vmovups   384(%rsp), %zmm25
 195         vmovups   320(%rsp), %zmm26
 196         vmovups   256(%rsp), %zmm27
 197         vmovups   192(%rsp), %zmm28
 198         vmovups   128(%rsp), %zmm29
 199         vmovups   64(%rsp), %zmm30
 200         vmovups   (%rsp), %zmm31
 201         movq      1080(%rsp), %r14
 202         cfi_restore (%r14)
 203         movq      1072(%rsp), %r15
 204         cfi_restore (%r15)
 205         vmovups   1216(%rsp), %zmm1
 206         jmp       .LBL_1_2
 207
     /* Odd lane 2*i+1: input at 1152+4+8*i, result at 1216+4+8*i.  */
 208 .LBL_1_10:
 209         cfi_restore_state
 210         movzbl    %r12b, %r15d
 211         vmovss    1156(%rsp,%r15,8), %xmm0
 212         call      JUMPTARGET(expf)
 213         vmovss    %xmm0, 1220(%rsp,%r15,8)
 214         jmp       .LBL_1_8
 215
     /* Even lane 2*i: input at 1152+8*i, result at 1216+8*i.  */
 216 .LBL_1_12:
 217         movzbl    %r12b, %r15d
 218         vmovss    1152(%rsp,%r15,8), %xmm0
 219         call      JUMPTARGET(expf)
 220         vmovss    %xmm0, 1216(%rsp,%r15,8)
 221         jmp       .LBL_1_7
 222
 223 END (_ZGVeN16v_expf_knl)
 224
 225 ENTRY (_ZGVeN16v_expf_skx)
 226 /*
 227    ALGORITHM DESCRIPTION:
 228
 229      Argument representation:
 230      M = rint(X*2^k/ln2) = 2^k*N+j
 231      X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
 232      then -ln2/2^(k+1) < r < ln2/2^(k+1)
 233      Alternatively:
 234      M = trunc(X*2^k/ln2)
 235      then 0 < r < ln2/2^k
 236
 237      Result calculation:
 238      exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
 239      = 2^N * 2^(j/2^k) * exp(r)
 240      2^N is calculated by bit manipulation
 241      2^(j/2^k) is computed from table lookup
 242      exp(r) is approximated by polynomial
 243
 244      The table lookup is skipped if k = 0.
 245      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */
     /*
        INTERFACE AND FRAME LAYOUT:

          In:   %zmm0 = 16 packed single-precision arguments
          Out:  %zmm0 = 16 packed results

        Fast path: every lane is evaluated with the polynomial.  %zmm3
        starts as all-ones and is cleared in the lanes whose sign-cleared
        bit pattern is <= __iDomainRange (signed integer compare); the
        lanes left non-zero form the 16-bit fallback mask in %ecx.  A zero
        mask returns the vector result as is.

        Slow path (.LBL_2_3): the input and the vector result are spilled
        to the frame, each flagged lane is recomputed with scalar expf and
        patched into the spilled result, which is then reloaded.

        Offsets from the 64-byte aligned %rsp (frame size 1280):
             0 .. 1023   %zmm31 .. %zmm16
          1024 .. 1048   %k7, %k6, %k5, %k4
          1056,   1064   %rdi, %rsi
          1072 .. 1096   %r15, %r14, %r13, %r12
          1152           copy of the input vector  (64 bytes)
          1216           copy of the result vector (64 bytes)

        Slow-path register roles:
          %r12b = lane-pair index i (0..7)
          %r13d = lane mask
          %r14d = bit index of the even lane of the pair (2*i)
          %r15  = zero-extended copy of i used for addressing  */
 246
 247         pushq     %rbp
 248         cfi_adjust_cfa_offset (8)
 249         cfi_rel_offset (%rbp, 0)
 250         movq      %rsp, %rbp
 251         cfi_def_cfa_register (%rbp)
     /* Align the frame to 64 bytes so the ZMM spill slots are aligned.  */
 252         andq      $-64, %rsp
 253         subq      $1280, %rsp
 254         movq      __svml_sexp_data@GOTPCREL(%rip), %rax
 255
 256 /* r = x-n*ln2_hi/2^k  (seed r with x; the FMA is issued further down) */
 257         vmovaps   %zmm0, %zmm7
 258
 259 /* compare against threshold: start from all-ones in every lane */
 260         vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 261         vmovups __sInvLn2(%rax), %zmm4
 262         vmovups __sShifter(%rax), %zmm1
 263         vmovups __sLn2hi(%rax), %zmm6
 264         vmovups __sPC5(%rax), %zmm10
 265
 266 /* m = x*2^k/ln2 + shifter */
 267         vfmadd213ps %zmm1, %zmm0, %zmm4
 268
 269 /* n = m - shifter = rint(x*2^k/ln2) */
 270         vsubps    %zmm1, %zmm4, %zmm8
 271         vpaddd __iBias(%rax), %zmm4, %zmm5
 272         vfnmadd231ps %zmm8, %zmm6, %zmm7
 273
 274 /* compute 2^N with "shift" */
 275         vpslld    $23, %zmm5, %zmm9
 276
 277 /* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
 278         vfnmadd132ps __sLn2lo(%rax), %zmm7, %zmm8
 279
 280 /* c5*r+c4 */
 281         vfmadd213ps __sPC4(%rax), %zmm8, %zmm10
 282
 283 /* (c5*r+c4)*r+c3 */
 284         vfmadd213ps __sPC3(%rax), %zmm8, %zmm10
 285
 286 /* ((c5*r+c4)*r+c3)*r+c2 */
 287         vfmadd213ps __sPC2(%rax), %zmm8, %zmm10
 288
 289 /* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
 290         vfmadd213ps __sPC1(%rax), %zmm8, %zmm10
 291
 292 /* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
 293         vfmadd213ps __sPC0(%rax), %zmm8, %zmm10
 294
 295 /* 2^N*exp(r) */
 296         vmulps    %zmm10, %zmm9, %zmm1
 297
 298 /* remove sign of x by "and" operation */
 299         vpandd __iAbsMask(%rax), %zmm0, %zmm2
     /* k1 = in-range lanes (predicate 2 = LE); zmm2 & ~zmm2 = 0 clears them */
 300         vpcmpd    $2, __iDomainRange(%rax), %zmm2, %k1
 301         vpandnd   %zmm2, %zmm2, %zmm3{%k1}
 302
 303 /* set mask for overflow/underflow (kmovw zero-extends: bits 16..31 = 0) */
 304         vptestmd  %zmm3, %zmm3, %k0
 305         kmovw     %k0, %ecx
 306         testl     %ecx, %ecx
 307         jne       .LBL_2_3
 308
     /* Common exit: result is in %zmm1.  */
 309 .LBL_2_2:
 310         cfi_remember_state
 311         vmovaps   %zmm1, %zmm0
 312         movq      %rbp, %rsp
 313         cfi_def_cfa_register (%rsp)
 314         popq      %rbp
 315         cfi_adjust_cfa_offset (-8)
 316         cfi_restore (%rbp)
 317         ret
 318
     /* Slow path.  Only reached through the jne above, so the mask in
        %ecx is known to be non-zero here.  */
 319 .LBL_2_3:
 320         cfi_restore_state
 321         vmovups   %zmm0, 1152(%rsp)
 322         vmovups   %zmm1, 1216(%rsp)
 324
     /* Preserve the registers listed in the frame layout across the
        scalar calls and zero the loop counters.  */
 325         xorb      %dl, %dl
 326         xorl      %eax, %eax
 327         kmovw     %k4, 1048(%rsp)
 328         kmovw     %k5, 1040(%rsp)
 329         kmovw     %k6, 1032(%rsp)
 330         kmovw     %k7, 1024(%rsp)
 331         vmovups   %zmm16, 960(%rsp)
 332         vmovups   %zmm17, 896(%rsp)
 333         vmovups   %zmm18, 832(%rsp)
 334         vmovups   %zmm19, 768(%rsp)
 335         vmovups   %zmm20, 704(%rsp)
 336         vmovups   %zmm21, 640(%rsp)
 337         vmovups   %zmm22, 576(%rsp)
 338         vmovups   %zmm23, 512(%rsp)
 339         vmovups   %zmm24, 448(%rsp)
 340         vmovups   %zmm25, 384(%rsp)
 341         vmovups   %zmm26, 320(%rsp)
 342         vmovups   %zmm27, 256(%rsp)
 343         vmovups   %zmm28, 192(%rsp)
 344         vmovups   %zmm29, 128(%rsp)
 345         vmovups   %zmm30, 64(%rsp)
 346         vmovups   %zmm31, (%rsp)
 347         movq      %rsi, 1064(%rsp)
 348         movq      %rdi, 1056(%rsp)
 349         movq      %r12, 1096(%rsp)
 350         cfi_offset_rel_rsp (12, 1096)
 351         movb      %dl, %r12b
 352         movq      %r13, 1088(%rsp)
 353         cfi_offset_rel_rsp (13, 1088)
 354         movl      %ecx, %r13d
 355         movq      %r14, 1080(%rsp)
 356         cfi_offset_rel_rsp (14, 1080)
 357         movl      %eax, %r14d
 358         movq      %r15, 1072(%rsp)
 359         cfi_offset_rel_rsp (15, 1072)
 360         cfi_remember_state
 361
 362
     /* Test the even lane of pair i (mask bit 2*i).  */
 363 .LBL_2_6:
 364         btl       %r14d, %r13d
 365         jc        .LBL_2_12
 366
     /* Test the odd lane of pair i (mask bit 2*i+1).  */
 367 .LBL_2_7:
 368         lea       1(%r14), %esi
 369         btl       %esi, %r13d
 370         jc        .LBL_2_10
 371
     /* Advance to the next pair.  The mask holds 16 lanes, i.e. 8 pairs;
        bits 16..31 of %r13d are always zero, so scanning further pairs
        could never trigger a call.  */
 372 .LBL_2_8:
 373         incb      %r12b
 374         addl      $2, %r14d
 375         cmpb      $8, %r12b
 376         jb        .LBL_2_6
 377
     /* All flagged lanes patched: restore saved state and reload the
        corrected result vector.  */
 378         kmovw     1048(%rsp), %k4
 379         kmovw     1040(%rsp), %k5
 380         kmovw     1032(%rsp), %k6
 381         kmovw     1024(%rsp), %k7
 382         vmovups   960(%rsp), %zmm16
 383         vmovups   896(%rsp), %zmm17
 384         vmovups   832(%rsp), %zmm18
 385         vmovups   768(%rsp), %zmm19
 386         vmovups   704(%rsp), %zmm20
 387         vmovups   640(%rsp), %zmm21
 388         vmovups   576(%rsp), %zmm22
 389         vmovups   512(%rsp), %zmm23
 390         vmovups   448(%rsp), %zmm24
 391         vmovups   384(%rsp), %zmm25
 392         vmovups   320(%rsp), %zmm26
 393         vmovups   256(%rsp), %zmm27
 394         vmovups   192(%rsp), %zmm28
 395         vmovups   128(%rsp), %zmm29
 396         vmovups   64(%rsp), %zmm30
 397         vmovups   (%rsp), %zmm31
 398         vmovups   1216(%rsp), %zmm1
 399         movq      1064(%rsp), %rsi
 400         movq      1056(%rsp), %rdi
 401         movq      1096(%rsp), %r12
 402         cfi_restore (%r12)
 403         movq      1088(%rsp), %r13
 404         cfi_restore (%r13)
 405         movq      1080(%rsp), %r14
 406         cfi_restore (%r14)
 407         movq      1072(%rsp), %r15
 408         cfi_restore (%r15)
 409         jmp       .LBL_2_2
 410
     /* Odd lane 2*i+1: input at 1152+4+8*i, result at 1216+4+8*i.
        The argument is loaded once, after vzeroupper has cleared the
        upper vector state ahead of the scalar call.  */
 411 .LBL_2_10:
 412         cfi_restore_state
 413         movzbl    %r12b, %r15d
 415         vzeroupper
 416         vmovss    1156(%rsp,%r15,8), %xmm0
 417
 418         call      JUMPTARGET(expf)
 419
 420         vmovss    %xmm0, 1220(%rsp,%r15,8)
 421         jmp       .LBL_2_8
 422
     /* Even lane 2*i: input at 1152+8*i, result at 1216+8*i.  */
 423 .LBL_2_12:
 424         movzbl    %r12b, %r15d
 426         vzeroupper
 427         vmovss    1152(%rsp,%r15,8), %xmm0
 428
 429         call      JUMPTARGET(expf)
 430
 431         vmovss    %xmm0, 1216(%rsp,%r15,8)
 432         jmp       .LBL_2_7
 433
 434 END (_ZGVeN16v_expf_skx)