1 /* Function cosf vectorized with AVX2.
2 Copyright (C) 2014-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
21 #include "svml_s_trig_data.h"
/* Vector cosf for eight packed single-precision values (AVX2 + FMA).

   In:    %ymm0 = eight float arguments X.
   Out:   %ymm0 = cos (X), lane for lane.
   Uses:  %rax (constant table base), %rcx (special-lane mask),
	  %ymm1-%ymm7, flags.  The slow path also calls scalar cosf;
	  %rsi, %rdi, %r12-%r15 and %ymm8-%ymm15 are saved around it.

   Constants are read from __svml_s_trig_data at displacements such as
   __sRShifter (presumably supplied by svml_s_trig_data.h).

   Frame: %rbp-based; %rsp is aligned down to 64 bytes and 448 bytes
   are reserved:
	    0(%rsp) .. 255(%rsp)   %ymm15 .. %ymm8  (slow path only)
	  256(%rsp), 264(%rsp)     %rdi, %rsi       (slow path only)
	  272(%rsp) .. 303(%rsp)   %r15 .. %r12     (slow path only)
	  320(%rsp) .. 351(%rsp)   copy of the argument vector
	  384(%rsp) .. 415(%rsp)   result vector, patched lane by lane  */
	.section .text.avx2, "ax", @progbits
ENTRY (_ZGVdN8v_cosf_avx2)
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) |X| is formed with an AND mask (used for the large-argument test)
      b) Add Pi/2 value to argument X for Cos to Sin transformation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Subtract "Right Shifter" value
      g) Subtract 0.5 from result for octant correction
      h) Subtract Y*PI from X argument, where PI is split into 3 parts
         in this FMA variant:
           X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
           R = X + X * X^2 * (A3 + x^2 * (A5 + .....
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
           R = XOR( R, S );
 */
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp		/* 64-byte aligned scratch area */
	subq	$448, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovaps	%ymm0, %ymm2		/* ymm2 = X, kept for the slow path */
	vmovups	__sRShifter(%rax), %ymm5
	vmovups	__sPI1_FMA(%rax), %ymm7

/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
	vaddps	__sHalfPI(%rax), %ymm2, %ymm4

/*
   1) Range reduction to [-Pi/2; +Pi/2] interval
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" (0x4B000000) value
   ymm4 = (X + Pi/2) * InvPI + RShifter
 */
	vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4

/* f) Subtract "Right Shifter" (0x4B000000) value */
	vsubps	%ymm5, %ymm4, %ymm6

/*
   e) Treat obtained value as integer for destination sign setting.
      Shift first bit of this value to the last (sign) position (S << 31)
 */
	vpslld	$31, %ymm4, %ymm0

/* g) Subtract 0.5 from result for octant correction; ymm4 = Y */
	vsubps	__sOneHalf(%rax), %ymm6, %ymm4

/* Check for large and special arguments: a lane of ymm1 is all-ones
   when |X| is not <= RangeReductionVal (unordered compares, i.e. NaN,
   also set the lane).  */
	vandps	__sAbsMask(%rax), %ymm2, %ymm3
	vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1

/*
   h) Subtract Y*PI from X argument, where PI is split into 3 parts:
        X = X - Y*PI1 - Y*PI2 - Y*PI3
   The accumulator must start from the signed X, not from |X|.
 */
	vmovaps	%ymm2, %ymm3
	vfnmadd231ps %ymm4, %ymm7, %ymm3
	vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
	vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4

/* a) Calculate X^2 = X * X */
	vmulps	%ymm4, %ymm4, %ymm5

/*
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
           R = XOR( R, S );
   Applied to the reduced argument before the odd polynomial, so the
   sign carries through to the result.
 */
	vxorps	%ymm0, %ymm4, %ymm6
	vmovups	__sA9_FMA(%rax), %ymm0

/*
   b) Calculate polynomial:
        R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
 */
	vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
	vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
	vfmadd213ps __sA3(%rax), %ymm5, %ymm0
	vmulps	%ymm5, %ymm0, %ymm0
	vmovmskps %ymm1, %ecx		/* ecx = one bit per special lane */
	vfmadd213ps %ymm6, %ymm6, %ymm0
	testl	%ecx, %ecx
	jne	.LBL_1_3		/* some lane needs scalar cosf */

/* Common exit: result is in ymm0.  */
.LBL_1_2:
	cfi_remember_state
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

/* Slow path: spill argument and result vectors, then recompute every
   lane flagged in ecx with the scalar routine.  */
.LBL_1_3:
	cfi_restore_state
	vmovups	%ymm2, 320(%rsp)
	vmovups	%ymm0, 384(%rsp)

	/* Preserve registers that the scalar calls and vzeroupper may
	   disturb.  */
	vmovups	%ymm8, 224(%rsp)
	vmovups	%ymm9, 192(%rsp)
	vmovups	%ymm10, 160(%rsp)
	vmovups	%ymm11, 128(%rsp)
	vmovups	%ymm12, 96(%rsp)
	vmovups	%ymm13, 64(%rsp)
	vmovups	%ymm14, 32(%rsp)
	vmovups	%ymm15, (%rsp)
	movq	%rsi, 264(%rsp)
	movq	%rdi, 256(%rsp)
	movq	%r12, 296(%rsp)
	cfi_offset_rel_rsp (12, 296)
	xorl	%r12d, %r12d		/* r12b = lane-pair index i */
	movq	%r13, 288(%rsp)
	cfi_offset_rel_rsp (13, 288)
	movl	%ecx, %r13d		/* r13d = special-lane mask */
	movq	%r14, 280(%rsp)
	cfi_offset_rel_rsp (14, 280)
	xorl	%r14d, %r14d		/* r14d = mask bit index 2*i */
	movq	%r15, 272(%rsp)
	cfi_offset_rel_rsp (15, 272)
	cfi_remember_state

/* Scan the mask two bits per iteration.  Only the low 8 bits can be
   set (vmovmskps on a ymm source), but the loop walks 16 pairs.  */
.LBL_1_6:
	btl	%r14d, %r13d
	jc	.LBL_1_12		/* even lane 2*i is special */

.LBL_1_7:
	lea	1(%r14), %esi
	btl	%esi, %r13d
	jc	.LBL_1_10		/* odd lane 2*i+1 is special */

.LBL_1_8:
	incb	%r12b
	addl	$2, %r14d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	/* All lanes handled: reload saved state and the patched result.  */
	vmovups	224(%rsp), %ymm8
	vmovups	192(%rsp), %ymm9
	vmovups	160(%rsp), %ymm10
	vmovups	128(%rsp), %ymm11
	vmovups	96(%rsp), %ymm12
	vmovups	64(%rsp), %ymm13
	vmovups	32(%rsp), %ymm14
	vmovups	(%rsp), %ymm15
	vmovups	384(%rsp), %ymm0
	movq	264(%rsp), %rsi
	movq	256(%rsp), %rdi
	movq	296(%rsp), %r12
	cfi_restore (%r12)
	movq	288(%rsp), %r13
	cfi_restore (%r13)
	movq	280(%rsp), %r14
	cfi_restore (%r14)
	movq	272(%rsp), %r15
	cfi_restore (%r15)
	jmp	.LBL_1_2

/* Odd lane of pair i: argument at 324 + 8*i, result at 388 + 8*i.  */
.LBL_1_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	324(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 388(%rsp,%r15,8)
	jmp	.LBL_1_8

/* Even lane of pair i: argument at 320 + 8*i, result at 384 + 8*i.  */
.LBL_1_12:
	movzbl	%r12b, %r15d
	vmovss	320(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 384(%rsp,%r15,8)
	jmp	.LBL_1_7

END (_ZGVdN8v_cosf_avx2)