1 /* Function expf vectorized with AVX2.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_s_expf_data.h"
23 ENTRY(_ZGVdN8v_expf_avx2)
/* _ZGVdN8v_expf_avx2 -- AVX2 vector expf: computes exp() of 8
   single-precision lanes (GCC vector-ABI mangled name, 'd' = AVX2,
   'N8' = 8 lanes, 'v' = one vector argument in %ymm0).
   NOTE(review): this chunk appears to be an extraction of the full
   file -- several instruction lines are not visible here (the
   push/mov stack-frame prologue, the copy of the input %ymm0 into
   %ymm2, the vmovmskps/testl/branch sequence that selects the
   special-case path, the saves of r12-r15 described by the
   cfi_offset_rel_rsp directives below, the special-case loop control
   flow, and the ret).  Confirm against the complete source before
   changing anything.  */
25 ALGORITHM DESCRIPTION:
27 Argument representation:
28 M = rint(X*2^k/ln2) = 2^k*N+j
29 X = M*ln2/2^k + r = N*ln2 + ln2*(j/2^k) + r
30 then -ln2/2^(k+1) < r < ln2/2^(k+1)
36 exp(X) = exp(N*ln2 + ln2*(j/2^k) + r)
37 = 2^N * 2^(j/2^k) * exp(r)
38 2^N is calculated by bit manipulation
39 2^(j/2^k) is computed from table lookup
40 exp(r) is approximated by polynomial
42 The table lookup is skipped if k = 0.
43 For low accuracy approximation, exp(r) ~ 1 or 1+r. */
/* CFI for the (not visible) pushq %rbp / movq %rsp,%rbp prologue,
   keeping unwind info consistent while %rbp is the frame base.  */
46 cfi_adjust_cfa_offset (8)
47 cfi_rel_offset (%rbp, 0)
49 cfi_def_cfa_register (%rbp)
/* %rax = base of the expf constant table; every __s*/__i* memory
   operand below is an offset from it (PIC access via the GOT).  */
52 movq __svml_sexp_data@GOTPCREL(%rip), %rax
/* Preload the constants that are reused as register operands:
   1/ln2 scaled by 2^k, the rounding shifter, ln2_hi, and poly c5.  */
54 vmovups __sInvLn2(%rax), %ymm7
55 vmovups __sShifter(%rax), %ymm4
56 vmovups __sLn2hi(%rax), %ymm3
57 vmovups __sPC5(%rax), %ymm1
/* m = x*2^k/ln2 + shifter */
/* ymm7 = x*InvLn2 + Shifter; the shifter trick makes rint() a free
   by-product.  x is presumably in %ymm2 (its copy from %ymm0 is not
   visible in this chunk) -- TODO confirm.  */
60 vfmadd213ps %ymm4, %ymm2, %ymm7
/* n = m - shifter = rint(x*2^k/ln2) */
63 vsubps %ymm4, %ymm7, %ymm0
/* Integer add of the exponent bias to m's low bits; after the
   vpslld $23 below this becomes the 2^N scale factor.  */
64 vpaddd __iBias(%rax), %ymm7, %ymm4
/* remove sign of x by "and" operation */
67 vandps __iAbsMask(%rax), %ymm2, %ymm5
/* compare against threshold */
/* Per-lane mask in %ymm6: all-ones where |x| exceeds __iDomainRange
   (overflow/underflow lanes that need the scalar fallback).  The
   vmovmskps/test consuming it is not visible in this chunk.  */
70 vpcmpgtd __iDomainRange(%rax), %ymm5, %ymm6
/* r = x-n*ln2_hi/2^k */
/* ymm5 = |x| - n*ln2_hi (Cody-Waite step 1, high part of ln2).  */
74 vfnmadd231ps %ymm0, %ymm3, %ymm5
/* r = r-n*ln2_lo/2^k = x - n*ln2/2^k */
77 vfnmadd132ps __sLn2lo(%rax), %ymm5, %ymm0
/* Horner evaluation of the degree-5 polynomial in r (%ymm0),
   accumulator in %ymm1: c5*r+c4 ...  */
80 vfmadd213ps __sPC4(%rax), %ymm0, %ymm1
/* ... (c5*r+c4)*r+c3 ...  */
83 vfmadd213ps __sPC3(%rax), %ymm0, %ymm1
/* ((c5*r+c4)*r+c3)*r+c2 */
86 vfmadd213ps __sPC2(%rax), %ymm0, %ymm1
/* (((c5*r+c4)*r+c3)*r+c2)*r+c1 */
89 vfmadd213ps __sPC1(%rax), %ymm0, %ymm1
/* exp(r) = ((((c5*r+c4)*r+c3)*r+c2)*r+c1)*r+c0 */
92 vfmadd213ps __sPC0(%rax), %ymm0, %ymm1
/* set mask for overflow/underflow */
/* compute 2^N with "shift" */
/* Shift the biased exponent into the float exponent field (bit 23)
   to materialize 2^N without an FP operation.  */
98 vpslld $23, %ymm4, %ymm6
/* Fast-path result: exp(x) = 2^N * exp(r).  */
101 vmulps %ymm1, %ymm6, %ymm0
/* CFI for the (not visible) fast-path epilogue restoring %rsp/%rbp.  */
108 cfi_def_cfa_register (%rsp)
110 cfi_adjust_cfa_offset (-8)
/* ---- special-case path: at least one lane out of range ----
   Spill the input vector (%ymm2) and the fast-path result (%ymm0) to
   fixed stack slots; the scalar loop below patches the flagged lanes
   of the 384(%rsp) copy in place.  */
116 vmovups %ymm2, 320(%rsp)
117 vmovups %ymm0, 384(%rsp)
/* Save the caller's %ymm8-%ymm15 around the calls to the scalar
   __expf_finite routine (all xmm/ymm regs are call-clobbered in the
   SysV ABI, so the callee may trash them).  */
122 vmovups %ymm8, 224(%rsp)
123 vmovups %ymm9, 192(%rsp)
124 vmovups %ymm10, 160(%rsp)
125 vmovups %ymm11, 128(%rsp)
126 vmovups %ymm12, 96(%rsp)
127 vmovups %ymm13, 64(%rsp)
128 vmovups %ymm14, 32(%rsp)
129 vmovups %ymm15, (%rsp)
/* CFI records for saved r12-r15; the movq saves they describe are not
   visible in this chunk -- confirm against the full file.  */
133 cfi_offset_rel_rsp (12, 296)
136 cfi_offset_rel_rsp (13, 288)
139 cfi_offset_rel_rsp (14, 280)
142 cfi_offset_rel_rsp (15, 272)
/* All flagged lanes processed: restore %ymm8-%ymm15 and reload the
   (now fully patched) result vector into %ymm0.  */
160 vmovups 224(%rsp), %ymm8
161 vmovups 192(%rsp), %ymm9
162 vmovups 160(%rsp), %ymm10
163 vmovups 128(%rsp), %ymm11
164 vmovups 96(%rsp), %ymm12
165 vmovups 64(%rsp), %ymm13
166 vmovups 32(%rsp), %ymm14
167 vmovups (%rsp), %ymm15
168 vmovups 384(%rsp), %ymm0
/* Scalar loop body, odd lane of the current pair: element lives at
   320+4+8*%r15 (stride 8 bytes = 2 floats per iteration).  */
184 vmovss 324(%rsp,%r15,8), %xmm0
187 call JUMPTARGET(__expf_finite)
189 vmovss %xmm0, 388(%rsp,%r15,8)
/* Even lane of the pair: element at 320+8*%r15; result patched into
   the saved output vector at 384+8*%r15.  */
194 vmovss 320(%rsp,%r15,8), %xmm0
197 call JUMPTARGET(__expf_finite)
199 vmovss %xmm0, 384(%rsp,%r15,8)
202 END(_ZGVdN8v_expf_avx2)