1 /* Function exp2f vectorized with AVX2.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * exp2(x) = 2^n * T[j] * (1 + P(y))
24 * x = m*(1/K) + y, y in [-1/K..1/K]
25 * m = n*K + j, m, n,j - signed integer, j in [-K/2..K/2]
27 * values of 2^(j/K) are tabulated
29 * P(y) is a minimax polynomial approximation of exp2(y)-1
30 * on small interval [-1/K..1/K]
37 * exp2(x) = 1 for subnormals
39 * if x >= 128.0 then exp2f(x) overflows
40 * if x < -151.0 then exp2f(x) underflows
44 /* Offsets for data table __svml_sexp2_data_internal
55 #define _iDomainRange 288
59 .section .text.avx2, "ax", @progbits
60 ENTRY(_ZGVdN8v_exp2f_avx2)
62 cfi_def_cfa_offset(16)
68 vmovups __svml_sexp2_data_internal(%rip), %ymm1
70 /* Load the highest-order polynomial coefficient (_sPC6) into ymm7; it seeds the FMA chain below */
71 vmovups _sPC6+__svml_sexp2_data_internal(%rip), %ymm7
/* ymm1 was loaded from offset 0 of the data table, i.e. _sShifter.
   ymm6 = x + _sShifter, ymm4 = ymm6 - _sShifter.
   Presumably this add/subtract pair leaves x rounded to an integer N
   in ymm4 (relies on the current rounding mode - TODO confirm).  */
74 vaddps %ymm1, %ymm0, %ymm6
75 vsubps %ymm1, %ymm6, %ymm4
/* Shift the bits of ymm6 left by 23 so that its low-order bits land in
   the single-precision exponent field; ymm8 is later added to the
   polynomial result as the 2^N scale.  */
78 vpslld $23, %ymm6, %ymm8
/* ymm5 = x - ymm4: the reduced argument y passed to the polynomial.  */
81 vsubps %ymm4, %ymm0, %ymm5
/* Horner evaluation: every vfmadd213ps below computes
   ymm7 = ymm7 * ymm5 + coefficient, walking from _sPC5 down to _sPC0.  */
84 vfmadd213ps _sPC5+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
85 vfmadd213ps _sPC4+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
86 vfmadd213ps _sPC3+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
87 vfmadd213ps _sPC2+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
88 vfmadd213ps _sPC1+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
89 vfmadd213ps _sPC0+__svml_sexp2_data_internal(%rip), %ymm5, %ymm7
91 /* Check for overflow\underflow: clear the sign bit of x (ymm2), then flag lanes whose bit pattern compares greater (signed dword compare) than _iDomainRange (ymm3) */
92 vandps _iAbsMask+__svml_sexp2_data_internal(%rip), %ymm0, %ymm2
93 vpcmpgtd _iDomainRange+__svml_sexp2_data_internal(%rip), %ymm2, %ymm3
/* Reconstruction: integer-add the shifted exponent bits (ymm8) to the
   bit pattern of the polynomial value (ymm7); the fast-path result is
   left in ymm1.  */
97 vpaddd %ymm8, %ymm7, %ymm1
/* NOTE(review): the instruction that sets the flags consumed by the jne
   below is not visible in this listing; presumably they are derived
   from the ymm3 range mask - confirm against the full file.  */
100 /* Go to special inputs processing branch */
101 jne L(SPECIAL_VALUES_BRANCH)
102 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
105 * and exit the function
122 L(SPECIAL_VALUES_BRANCH):
/* Spill the input vector (ymm0) to 32(%rsp) and the fast-path result
   (ymm1) to 64(%rsp) so individual lanes can be read and patched.  */
123 vmovups %ymm0, 32(%rsp)
124 vmovups %ymm1, 64(%rsp)
125 # LOE rbx r12 r13 r14 r15 edx ymm1
128 # LOE rbx r12 r13 r14 r15 eax edx
132 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
133 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
136 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
137 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
140 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
141 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
142 # LOE rbx r15 r12d r13d
151 /* Call scalar math function */
152 jc L(SCALAR_MATH_CALL)
153 # LOE rbx r15 r12d r13d
159 L(SPECIAL_VALUES_LOOP):
163 /* Check bits in range mask */
164 jl L(RANGEMASK_CHECK)
165 # LOE rbx r15 r12d r13d
/* Reload the (possibly lane-patched) result vector from its stack
   slot at 64(%rsp) into ymm1.  */
173 vmovups 64(%rsp), %ymm1
177 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
178 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
179 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
180 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
181 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
182 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
183 # LOE rbx r12 r13 r14 r15 ymm1
185 /* Scalar math function call
186 * to process special input
 * (r14 is the lane index: the 4-byte input element is read from
 * 32(%rsp, %r14, 4) and the scalar result is stored to
 * 64(%rsp, %r14, 4), i.e. into the spilled result vector)
191 vmovss 32(%rsp, %r14, 4), %xmm0
193 # LOE rbx r14 r15 r12d r13d xmm0
195 vmovss %xmm0, 64(%rsp, %r14, 4)
197 /* Process special inputs in loop */
198 jmp L(SPECIAL_VALUES_LOOP)
199 # LOE rbx r15 r12d r13d
200 END(_ZGVdN8v_exp2f_avx2)
202 .section .rodata, "a"
/* Constant table read by _ZGVdN8v_exp2f_avx2.  Every field is a single
   32-bit pattern replicated across 8 lanes (8 * 4 = 32 bytes per row),
   so field number k starts at byte offset 32 * k; this agrees with the
   `_iDomainRange 288' offset define (tenth row, 9 * 32).  The typedef
   under the #ifdef only documents the layout.  */
205 #ifdef __svml_sexp2_data_internal_typedef
206 typedef unsigned int VUINT32;
208 __declspec(align(32)) VUINT32 _sShifter[8][1];
209 __declspec(align(32)) VUINT32 _sPC0[8][1];
210 __declspec(align(32)) VUINT32 _sPC1[8][1];
211 __declspec(align(32)) VUINT32 _sPC2[8][1];
212 __declspec(align(32)) VUINT32 _sPC3[8][1];
213 __declspec(align(32)) VUINT32 _sPC4[8][1];
214 __declspec(align(32)) VUINT32 _sPC5[8][1];
215 __declspec(align(32)) VUINT32 _sPC6[8][1];
216 __declspec(align(32)) VUINT32 _iAbsMask[8][1];
217 __declspec(align(32)) VUINT32 _iDomainRange[8][1];
218 } __svml_sexp2_data_internal;
220 __svml_sexp2_data_internal:
221 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter = 1.5 * 2^23 (12582912.0) */
223 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 = 1.0 */
225 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1 ~= 0.693147 (close to ln 2) */
227 .long 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
229 .long 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
231 .long 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
233 .long 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
235 .long 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6 */
238 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask: all bits except the sign bit */
240 .long 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
242 .type __svml_sexp2_data_internal, @object
243 .size __svml_sexp2_data_internal, .-__svml_sexp2_data_internal