1 /* Function powf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_s_powf_data.h"
21 #include "svml_s_wrapper_impl.h"
24 /* ALGORITHM DESCRIPTION:
26 We use the following identity: pow(x,y) = 2^(y * log2(x)).
28 1) log2(x) calculation
29 Here we use the following formula.
30 Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
32 Rcp1 ~= 1/X1, X2=Rcp1*X1,
33 Rcp2 ~= 1/X2, X3=Rcp2*X2,
34 Rcp3 ~= 1/X3, Rcp3C ~= C/X3.
36 log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
37 log2(X1*Rcp1*Rcp2*Rcp3C/C),
38 where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
40 The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
41 Rcp3C, log2(C/Rcp3C) are taken from tables.
42 Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
43 is exactly represented in target precision.
45 log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
46 = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
47 = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
48 = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
50 where cq=X1*Rcp1*Rcp2*Rcp3C-C,
51 a1=1/(C*ln(2))-1 is small, a2=-1/(2*C^2*ln2), a3=1/(3*C^3*ln2), ...
55 The log2 result is split into three parts: HH+HL+HLL
57 2) Calculation of y*log2(x)
59 Get high PH and medium PL parts of y*log2|x|.
60 Get low PLL part of y*log2|x|.
61 Now we have PH+PL+PLL ~= y*log2|x|.
63 3) Calculation of 2^(y*log2(x))
64 Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
65 where expK=7 in this implementation, N and j are integers,
66 0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
67 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
68 where 2^(j/2^expK) is stored in a table, and
69 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
70 We compute 2^(PH+PL+PLL) as follows:
71 Break PH into PHH + PHL, where PHH = N + j/2^expK, so that Z = PHL + PL + PLL.
73 Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
74 Get 2^(j/2^expK) from table in the form THI+TLO.
75 Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
76 Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
78 ResHi := THI, ResLo := THI * Exp2Poly + TLO
79 Get exponent ERes of the result, with Res := ResHi + ResLo:
81 Result := ex(Res) + N. */
83 .section .text.evex512, "ax", @progbits
/* KNL variant of the AVX-512 vectorized powf (see the file header).
   Visible data flow: %zmm0 is widened to two double-precision halves,
   a log2 value is assembled from a reciprocal, a _Log2Rcp_lookup gather
   and a short polynomial, that value is multiplied by the widened
   contents of %zmm9, and the product is turned back into a result via a
   second table gather before being narrowed to single precision in %zmm1.
   NOTE(review): this listing appears to be missing instructions -- %zmm9,
   %eax, %rcx and mask %k3 are read without a visible write, and no stack
   adjustment, label, branch or ret is visible.  Confirm against the
   complete source before relying on the comments below.  */
84 ENTRY (_ZGVeN16vv_powf_knl)
86 cfi_adjust_cfa_offset (8)
87 cfi_rel_offset (%rbp, 0)
89 cfi_def_cfa_register (%rbp)
/* %rdx = address of the __svml_spow_data table (via the GOT); every
   symbolic offset below is relative to it.  */
92 movq __svml_spow_data@GOTPCREL(%rip), %rdx
/* imm 238 (0xEE) selects 128-bit lanes 2,3,2,3: the upper 256 bits of
   %zmm0 land in the low half of %zmm7.  */
94 vshuff32x4 $238, %zmm0, %zmm0, %zmm7
/* Widen the low and high eight floats of %zmm0 to doubles.  */
96 vcvtps2pd %ymm0, %zmm14
97 vcvtps2pd %ymm7, %zmm10
/* %zmm4 = %zmm9 & _ABSMASK.  NOTE(review): %zmm9 presumably holds a copy
   of the second argument -- its definition is not visible here.  */
100 vpandd _ABSMASK(%rdx), %zmm9, %zmm4
101 vmovups _ExpMask(%rdx), %zmm6
103 /* exponent bits selection */
/* >>20 followed by >>32 is a 52-bit shift in total, leaving the top
   12 bits of each double, which are then narrowed to dwords.  */
104 vpsrlq $20, %zmm14, %zmm13
105 vshuff32x4 $238, %zmm9, %zmm9, %zmm8
/* Predicate 5 is "not less than" (signed dword): k2 = %zmm4 >= _INF.  */
106 vpcmpd $5, _INF(%rdx), %zmm4, %k2
107 vpsrlq $32, %zmm13, %zmm15
108 vcvtps2pd %ymm8, %zmm2
109 vmovups _Two10(%rdx), %zmm4
110 vpmovqd %zmm15, %ymm12
111 vcvtps2pd %ymm9, %zmm1
/* Range test on %zmm0: subtract _NMINNORM, then k1 = result >= _NMAXVAL
   (signed dword compare, predicate 5).  */
112 vpsubd _NMINNORM(%rdx), %zmm0, %zmm3
/* Materialise k2 as a dword vector: %eax in selected lanes, zero elsewhere.  */
113 vpbroadcastd %eax, %zmm8{%k2}{z}
114 vpcmpd $5, _NMAXVAL(%rdx), %zmm3, %k1
116 /* preserve mantissa, set input exponent to 2^(-10) */
/* Truth table 248 (0xF8) computes dst | (src2 & src3):
   %zmm4 = _Two10 | (%zmm10 & _ExpMask).  */
118 vpternlogq $248, %zmm6, %zmm10, %zmm4
119 vpsrlq $20, %zmm10, %zmm10
120 vpternlogq $234, _Two10(%rdx), %zmm14, %zmm3
122 /* reciprocal approximation good to at least 11 bits */
123 vrcp28pd %zmm4, %zmm11
124 vpsrlq $32, %zmm10, %zmm14
/* Materialise k1 as a dword vector in %zmm7, as was done for k2.  */
125 vpbroadcastd %eax, %zmm7{%k1}{z}
127 vrcp28pd %zmm3, %zmm5
128 vpmovqd %zmm14, %ymm6
/* imm 68 (0x44): low 256 bits of %zmm12 followed by low 256 bits of
   %zmm6, i.e. the sixteen narrowed exponent dwords in one register.  */
129 vshufi32x4 $68, %zmm6, %zmm12, %zmm13
130 vmovups _One(%rdx), %zmm6
132 /* round reciprocal to nearest integer, will have 1+9 mantissa bits */
133 vrndscalepd $8, %zmm5, %zmm14
135 /* biased exponent in DP format */
136 vshuff32x4 $238, %zmm13, %zmm13, %zmm5
137 vrndscalepd $8, %zmm11, %zmm11
/* Predicate 30 is GT_OQ: k2 = rounded reciprocal > _Threshold.  */
138 vcmppd $30, _Threshold(%rdx), %zmm14, %k2
139 vcvtdq2pd %ymm13, %zmm10
140 vcvtdq2pd %ymm5, %zmm15
/* Bits of the rounded reciprocal (>>40) index _Log2Rcp_lookup; the
   destination is zeroed first, then gathered under mask k3.  */
143 vpsrlq $40, %zmm14, %zmm13
144 vpxord %zmm5, %zmm5, %zmm5
145 vgatherqpd _Log2Rcp_lookup(%rdx,%zmm13), %zmm5{%k3}
/* Reduced arguments: value * rounded reciprocal - 1.0 (%zmm6 = _One).  */
146 vfmsub213pd %zmm6, %zmm14, %zmm3
147 vfmsub213pd %zmm6, %zmm11, %zmm4
148 vcmppd $30, _Threshold(%rdx), %zmm11, %k3
149 vpbroadcastq %rcx, %zmm14{%k2}{z}
151 /* dpP= _dbT+lJ*T_ITEM_GRAN */
153 vpsrlq $40, %zmm11, %zmm12
154 vpxord %zmm6, %zmm6, %zmm6
155 vpbroadcastq %rcx, %zmm11{%k3}{z}
157 vgatherqpd _Log2Rcp_lookup(%rdx,%zmm12), %zmm6{%k1}
/* Exponent bias selection: table 236 gives src2 | (dst & src3) and table
   248 gives dst | (src2 & src3), so both results are
   _Bias1 | (threshold-mask & _Bias).  */
158 vmovups _Bias1(%rdx), %zmm12
159 vpternlogq $236, _Bias(%rdx), %zmm12, %zmm14
160 vpternlogq $248, _Bias(%rdx), %zmm11, %zmm12
/* Remove the selected bias from the converted exponents.  */
161 vsubpd %zmm14, %zmm10, %zmm13
162 vsubpd %zmm12, %zmm15, %zmm10
/* Polynomial in the reduced argument r, evaluated for both halves:
   r + r^2 * (_poly_coeff_3 * r + _poly_coeff_4).  */
163 vmovups _poly_coeff_3(%rdx), %zmm11
164 vmovups _poly_coeff_4(%rdx), %zmm15
165 vfmadd213pd %zmm15, %zmm4, %zmm11
166 vmulpd %zmm4, %zmm4, %zmm12
167 vmovaps %zmm15, %zmm14
168 vmulpd %zmm3, %zmm3, %zmm15
169 vfmadd231pd _poly_coeff_3(%rdx), %zmm3, %zmm14
172 vfmadd213pd %zmm4, %zmm12, %zmm11
173 vfmadd213pd %zmm3, %zmm15, %zmm14
/* Add the gathered table terms, then exponent * _L2.  */
174 vaddpd %zmm6, %zmm11, %zmm11
175 vaddpd %zmm5, %zmm14, %zmm3
176 vfmadd231pd _L2(%rdx), %zmm10, %zmm11
177 vfmadd132pd _L2(%rdx), %zmm3, %zmm13
/* Multiply by the widened halves of %zmm9 (%zmm2 high, %zmm1 low),
   then scale by __dbInvLn2.  */
178 vmulpd %zmm2, %zmm11, %zmm12
179 vmulpd %zmm1, %zmm13, %zmm10
180 vmulpd __dbInvLn2(%rdx), %zmm12, %zmm6
/* High dwords of the unscaled products feed the domain check below.  */
183 vpsrlq $32, %zmm12, %zmm12
184 vmulpd __dbInvLn2(%rdx), %zmm10, %zmm1
186 /* to round down; if dR is an integer we will get R = 1, which is ok */
187 vsubpd __dbHALF(%rdx), %zmm6, %zmm4
188 vpsrlq $32, %zmm10, %zmm11
189 vpmovqd %zmm11, %ymm3
190 vsubpd __dbHALF(%rdx), %zmm1, %zmm2
/* Adding and later subtracting __dbShifter isolates the rounded part;
   the shifted sums (%zmm14, %zmm2) are kept for the bit extraction
   further down.  */
191 vaddpd __dbShifter(%rdx), %zmm4, %zmm14
192 vpmovqd %zmm12, %ymm4
193 vshufi32x4 $68, %zmm4, %zmm3, %zmm5
194 vpxord %zmm4, %zmm4, %zmm4
195 vaddpd __dbShifter(%rdx), %zmm2, %zmm2
197 /* iAbsX = iAbsX&iAbsMask; */
198 vpandd __iAbsMask(%rdx), %zmm5, %zmm11
199 vpxord %zmm5, %zmm5, %zmm5
200 vsubpd __dbShifter(%rdx), %zmm14, %zmm13
202 /* iRangeMask = (iAbsX>iDomainRange) */
203 vpcmpgtd __iDomainRange(%rdx), %zmm11, %k1
204 vsubpd __dbShifter(%rdx), %zmm2, %zmm15
205 vpbroadcastd %eax, %zmm10{%k1}{z}
/* Truth table 254 is a three-way OR: merge the domain mask with the two
   masks built earlier in %zmm7 and %zmm8.  */
206 vpternlogd $254, %zmm8, %zmm7, %zmm10
/* Remainder = scaled product - rounded part.  */
209 vsubpd %zmm15, %zmm1, %zmm1
/* Low bits of the shifted sum (masked by __lbLOWKBITS) index the table
   at byte offset 13952 from the data base, 8 bytes per entry.  */
212 vpandq __lbLOWKBITS(%rdx), %zmm14, %zmm11
213 vgatherqpd 13952(%rdx,%zmm11,8), %zmm5{%k3}
214 vsubpd %zmm13, %zmm6, %zmm7
/* k0 gets one bit per dword lane whose merged mask is non-zero.  */
215 vptestmd %zmm10, %zmm10, %k0
216 vpandq __lbLOWKBITS(%rdx), %zmm2, %zmm10
217 vmulpd __dbC1(%rdx), %zmm1, %zmm1
218 vmulpd __dbC1(%rdx), %zmm7, %zmm3
219 vpsrlq $11, %zmm2, %zmm8
220 vpsrlq $11, %zmm14, %zmm2
222 /* NB : including +/- sign for the exponent!! */
223 vpsllq $52, %zmm8, %zmm8
225 vpsllq $52, %zmm2, %zmm6
/* T + T * (__dbC1 * remainder), with T the gathered table entry.  */
226 vfmadd213pd %zmm5, %zmm3, %zmm5
227 vgatherqpd 13952(%rdx,%zmm10,8), %zmm4{%k2}
228 vfmadd213pd %zmm4, %zmm1, %zmm4
/* Integer add of the bits shifted into position 52 and up, then narrow
   each half to single precision.  */
229 vpaddq %zmm6, %zmm5, %zmm10
230 vcvtpd2ps %zmm10, %ymm12
231 vpaddq %zmm8, %zmm4, %zmm7
232 vcvtpd2ps %zmm7, %ymm11
/* %zmm1 = low 256 bits of %zmm11 followed by low 256 bits of %zmm12.  */
233 vshuff32x4 $68, %zmm12, %zmm11, %zmm1
241 cfi_def_cfa_register (%rsp)
243 cfi_adjust_cfa_offset (-8)
/* NOTE(review): the code from here on presumably belongs to a
   special-case path selected through k0; the branch and labels are not
   visible in this listing -- confirm.
   Spill %zmm0, %zmm9 and the vector result so that individual elements
   can be addressed on the stack.  */
249 vmovups %zmm0, 1152(%rsp)
250 vmovups %zmm9, 1216(%rsp)
251 vmovups %zmm1, 1280(%rsp)
/* Save mask registers k4-k7, %zmm16-%zmm31, %rsi, %rdi and %r12-%r15
   in the frame before the scalar calls.  */
255 kmovw %k4, 1048(%rsp)
257 kmovw %k5, 1040(%rsp)
258 kmovw %k6, 1032(%rsp)
259 kmovw %k7, 1024(%rsp)
260 vmovups %zmm16, 960(%rsp)
261 vmovups %zmm17, 896(%rsp)
262 vmovups %zmm18, 832(%rsp)
263 vmovups %zmm19, 768(%rsp)
264 vmovups %zmm20, 704(%rsp)
265 vmovups %zmm21, 640(%rsp)
266 vmovups %zmm22, 576(%rsp)
267 vmovups %zmm23, 512(%rsp)
268 vmovups %zmm24, 448(%rsp)
269 vmovups %zmm25, 384(%rsp)
270 vmovups %zmm26, 320(%rsp)
271 vmovups %zmm27, 256(%rsp)
272 vmovups %zmm28, 192(%rsp)
273 vmovups %zmm29, 128(%rsp)
274 vmovups %zmm30, 64(%rsp)
275 vmovups %zmm31, (%rsp)
276 movq %rsi, 1064(%rsp)
277 movq %rdi, 1056(%rsp)
278 movq %r12, 1096(%rsp)
279 cfi_offset_rel_rsp (12, 1096)
281 movq %r13, 1088(%rsp)
282 cfi_offset_rel_rsp (13, 1088)
284 movq %r14, 1080(%rsp)
285 cfi_offset_rel_rsp (14, 1080)
287 movq %r15, 1072(%rsp)
288 cfi_offset_rel_rsp (15, 1072)
/* Restore everything saved above and reload the (possibly patched)
   result vector from 1280(%rsp) into %zmm1.  */
306 kmovw 1048(%rsp), %k4
307 movq 1064(%rsp), %rsi
308 kmovw 1040(%rsp), %k5
309 movq 1056(%rsp), %rdi
310 kmovw 1032(%rsp), %k6
311 movq 1096(%rsp), %r12
313 movq 1088(%rsp), %r13
315 kmovw 1024(%rsp), %k7
316 vmovups 960(%rsp), %zmm16
317 vmovups 896(%rsp), %zmm17
318 vmovups 832(%rsp), %zmm18
319 vmovups 768(%rsp), %zmm19
320 vmovups 704(%rsp), %zmm20
321 vmovups 640(%rsp), %zmm21
322 vmovups 576(%rsp), %zmm22
323 vmovups 512(%rsp), %zmm23
324 vmovups 448(%rsp), %zmm24
325 vmovups 384(%rsp), %zmm25
326 vmovups 320(%rsp), %zmm26
327 vmovups 256(%rsp), %zmm27
328 vmovups 192(%rsp), %zmm28
329 vmovups 128(%rsp), %zmm29
330 vmovups 64(%rsp), %zmm30
331 vmovups (%rsp), %zmm31
332 movq 1080(%rsp), %r14
334 movq 1072(%rsp), %r15
336 vmovups 1280(%rsp), %zmm1
/* Scalar fallback for the odd element of pair %r15: the operands come
   from the spilled %zmm0 / %zmm9 images at byte offset 4 + 8*%r15, and
   the powf result overwrites the matching slot of the spilled result.  */
342 vmovss 1156(%rsp,%r15,8), %xmm0
343 vmovss 1220(%rsp,%r15,8), %xmm1
344 call JUMPTARGET(powf)
345 vmovss %xmm0, 1284(%rsp,%r15,8)
/* Same for the even element of pair %r15 (byte offset 8*%r15).  */
350 vmovss 1152(%rsp,%r15,8), %xmm0
351 vmovss 1216(%rsp,%r15,8), %xmm1
352 call JUMPTARGET(powf)
353 vmovss %xmm0, 1280(%rsp,%r15,8)
355 END (_ZGVeN16vv_powf_knl)
/* SKX variant of the AVX-512 vectorized powf (see the file header).
   Visible data flow: %zmm0 and %zmm1 are widened to double-precision
   halves, a log2 value is assembled from a reciprocal, a _Log2Rcp_lookup
   gather and a short polynomial, multiplied by the widened %zmm1 halves,
   and turned back into a result via a second table gather before being
   narrowed to single precision in %zmm2.
   NOTE(review): this listing appears to be missing instructions -- some
   registers and mask %k3 are read without a visible write, and no stack
   adjustment, label, branch or ret is visible.  Confirm against the
   complete source before relying on the comments below.  */
357 ENTRY (_ZGVeN16vv_powf_skx)
359 cfi_adjust_cfa_offset (8)
360 cfi_rel_offset (%rbp, 0)
362 cfi_def_cfa_register (%rbp)
/* %rax = address of the __svml_spow_data table (via the GOT); every
   symbolic offset below is relative to it.  */
365 movq __svml_spow_data@GOTPCREL(%rip), %rax
/* Upper 256 bits of each argument.  */
366 vextractf32x8 $1, %zmm1, %ymm14
367 vextractf32x8 $1, %zmm0, %ymm15
368 vpsubd _NMINNORM(%rax), %zmm0, %zmm9
/* %zmm26 is used as scratch below; its incoming value is parked at
   1280(%rsp) and reloaded once the vector computation is done.  */
369 vmovups %zmm26, 1280(%rsp)
370 vmovups _ExpMask(%rax), %zmm6
/* Predicate 1 is "less than" (signed dword):
   k1 = (%zmm0 - _NMINNORM) < _NMAXVAL.  */
371 vpcmpd $1, _NMAXVAL(%rax), %zmm9, %k1
/* Widen the low eight floats of each argument to doubles.  */
372 vcvtps2pd %ymm0, %zmm5
373 vcvtps2pd %ymm1, %zmm12
376 /* exponent bits selection */
/* >>20 followed by >>32 is a 52-bit shift in total, leaving the top
   12 bits of each double, which are then narrowed to dwords.  */
377 vpsrlq $20, %zmm5, %zmm3
378 vpsrlq $32, %zmm3, %zmm2
379 vpmovqd %zmm2, %ymm11
380 vcvtps2pd %ymm14, %zmm13
/* Truth table 0xff sets every bit: %zmm14 (and its copies %zmm26 and
   %zmm15) start out as all-ones masks.  */
381 vpternlogd $0xff, %zmm14, %zmm14, %zmm14
382 vmovaps %zmm14, %zmm26
383 vpandd _ABSMASK(%rax), %zmm1, %zmm8
/* k2 = (%zmm1 & _ABSMASK) < _INF, signed dword compare.  */
384 vpcmpd $1, _INF(%rax), %zmm8, %k2
/* andn of a register with itself is zero, so this clears the lanes
   selected by k1; %zmm26 stays all-ones only where the k1 test failed.  */
385 vpandnd %zmm9, %zmm9, %zmm26{%k1}
386 vmovups _Two10(%rax), %zmm9
388 vcvtps2pd %ymm15, %zmm4
389 vmovaps %zmm14, %zmm15
391 /* preserve mantissa, set input exponent to 2^(-10) */
/* Truth table 248 (0xF8) computes dst | (src2 & src3):
   %zmm9 = _Two10 | (%zmm4 & _ExpMask).  */
392 vpternlogq $248, %zmm6, %zmm4, %zmm9
393 vpsrlq $20, %zmm4, %zmm4
395 /* reciprocal approximation good to at least 11 bits */
396 vrcp14pd %zmm9, %zmm10
398 /* round reciprocal to nearest integer, will have 1+9 mantissa bits */
399 vrndscalepd $8, %zmm10, %zmm3
400 vmovups _One(%rax), %zmm10
/* Reduced argument: value * rounded reciprocal - 1.0.  */
401 vfmsub213pd %zmm10, %zmm3, %zmm9
/* Clear %zmm15 in the lanes selected by k2, leaving all-ones only where
   the "< _INF" test failed.  */
402 vpandnd %zmm8, %zmm8, %zmm15{%k2}
404 vpternlogq $234, _Two10(%rax), %zmm5, %zmm8
405 vpsrlq $32, %zmm4, %zmm5
406 vrcp14pd %zmm8, %zmm7
408 vrndscalepd $8, %zmm7, %zmm2
409 vfmsub213pd %zmm10, %zmm2, %zmm8
/* Bits of the rounded reciprocals (>>40) become table indices.  */
412 vpsrlq $40, %zmm2, %zmm10
413 vinserti32x8 $1, %ymm6, %zmm11, %zmm4
414 vpsrlq $40, %zmm3, %zmm11
416 /* biased exponent in DP format */
417 vextracti32x8 $1, %zmm4, %ymm7
418 vcvtdq2pd %ymm4, %zmm6
419 vpmovqd %zmm10, %ymm4
420 vpmovqd %zmm11, %ymm5
/* Gather from _Log2Rcp_lookup with dword indices; each destination is
   zeroed before the masked gather.  */
421 vpxord %zmm10, %zmm10, %zmm10
422 vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
423 vpternlogd $0xff, %zmm4, %zmm4, %zmm4
424 vpxord %zmm11, %zmm11, %zmm11
425 vcvtdq2pd %ymm7, %zmm7
426 vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
/* Predicate 21 is NLT_UQ: the mask is set where _Threshold is not less
   than the rounded reciprocal.  */
427 vmovups _Threshold(%rax), %zmm5
428 vcmppd $21, %zmm2, %zmm5, %k2
429 vcmppd $21, %zmm3, %zmm5, %k3
430 vmovups _Bias1(%rax), %zmm3
/* Clear the masked lanes (andn of a register with itself is zero), then
   combine with the biases: table 236 gives src2 | (dst & src3), table
   248 gives dst | (src2 & src3).  */
432 vpandnq %zmm5, %zmm5, %zmm2{%k2}
433 vpternlogq $236, _Bias(%rax), %zmm3, %zmm2
435 /* dpP= _dbT+lJ*T_ITEM_GRAN */
437 vpandnq %zmm5, %zmm5, %zmm4{%k3}
438 vpternlogq $248, _Bias(%rax), %zmm4, %zmm3
/* Remove the selected bias from the converted exponents and start the
   polynomial in the reduced arguments (%zmm8, %zmm9).  */
439 vsubpd %zmm2, %zmm6, %zmm4
440 vmovups _poly_coeff_3(%rax), %zmm6
441 vmovups _poly_coeff_4(%rax), %zmm2
442 vsubpd %zmm3, %zmm7, %zmm5
443 vmulpd %zmm8, %zmm8, %zmm7
444 vfmadd213pd %zmm2, %zmm9, %zmm6
447 vmulpd %zmm9, %zmm9, %zmm2
448 vfmadd231pd _poly_coeff_3(%rax), %zmm8, %zmm3
/* Finish as r + r^2 * (inner term) for both halves.  */
451 vfmadd213pd %zmm9, %zmm2, %zmm6
452 vfmadd213pd %zmm8, %zmm7, %zmm3
/* Add the gathered table terms, then exponent * _L2.  */
453 vaddpd %zmm11, %zmm6, %zmm8
454 vaddpd %zmm10, %zmm3, %zmm9
455 vfmadd231pd _L2(%rax), %zmm5, %zmm8
456 vfmadd132pd _L2(%rax), %zmm9, %zmm4
/* Multiply by the widened halves of %zmm1 (%zmm13 high, %zmm12 low),
   then scale by __dbInvLn2.  */
457 vmulpd %zmm13, %zmm8, %zmm13
458 vmulpd %zmm12, %zmm4, %zmm3
459 vmulpd __dbInvLn2(%rax), %zmm13, %zmm10
460 vmulpd __dbInvLn2(%rax), %zmm3, %zmm8
/* High dwords of the unscaled products feed the domain check below.  */
463 vpsrlq $32, %zmm3, %zmm4
464 vpsrlq $32, %zmm13, %zmm13
466 /* to round down; if dR is an integer we will get R = 1, which is ok */
467 vsubpd __dbHALF(%rax), %zmm8, %zmm12
469 vpmovqd %zmm13, %ymm2
470 vsubpd __dbHALF(%rax), %zmm10, %zmm9
/* Adding and then subtracting __dbShifter isolates the rounded part;
   the shifted sums (%zmm7, %zmm9) are kept for the bit extraction
   further down.  */
471 vaddpd __dbShifter(%rax), %zmm12, %zmm7
472 vaddpd __dbShifter(%rax), %zmm9, %zmm9
473 vsubpd __dbShifter(%rax), %zmm7, %zmm11
474 vsubpd __dbShifter(%rax), %zmm9, %zmm12
475 vinserti32x8 $1, %ymm2, %zmm5, %zmm3
477 /* iAbsX = iAbsX&iAbsMask */
478 vpandd __iAbsMask(%rax), %zmm3, %zmm4
480 /* iRangeMask = (iAbsX>iDomainRange) */
/* Implemented inversely: predicate 2 is "less or equal", and the lanes
   that pass are cleared from the all-ones %zmm14.  */
481 vpcmpd $2, __iDomainRange(%rax), %zmm4, %k1
482 vpandnd %zmm4, %zmm4, %zmm14{%k1}
/* Truth table 254 is a three-way OR: merge the domain mask with the two
   masks built earlier in %zmm26 and %zmm15.  */
483 vpternlogd $254, %zmm15, %zmm26, %zmm14
/* Remainders = scaled product - rounded part.  */
486 vsubpd %zmm11, %zmm8, %zmm15
487 vsubpd %zmm12, %zmm10, %zmm26
/* k0 gets one bit per dword lane whose merged mask is non-zero.  */
488 vptestmd %zmm14, %zmm14, %k0
489 vpsrlq $11, %zmm7, %zmm8
490 vpsrlq $11, %zmm9, %zmm10
491 vmulpd __dbC1(%rax), %zmm26, %zmm26
492 vmulpd __dbC1(%rax), %zmm15, %zmm15
494 /* NB : including +/- sign for the exponent!! */
495 vpsllq $52, %zmm10, %zmm13
496 vpsllq $52, %zmm8, %zmm12
/* Low bits of the shifted sums (masked by __lbLOWKBITS) index the table
   at byte offset 13952 from the data base, 8 bytes per entry.  */
500 vpandq __lbLOWKBITS(%rax), %zmm9, %zmm14
501 vpandq __lbLOWKBITS(%rax), %zmm7, %zmm6
502 vpmovqd %zmm14, %ymm7
504 vpxord %zmm2, %zmm2, %zmm2
505 vgatherdpd 13952(%rax,%ymm7,8), %zmm2{%k3}
/* T + T * (__dbC1 * remainder), then an integer add of the bits shifted
   into position 52 and up, then narrow to single precision.  */
506 vfmadd213pd %zmm2, %zmm26, %zmm2
507 vpaddq %zmm13, %zmm2, %zmm2
508 vcvtpd2ps %zmm2, %ymm4
509 vpxord %zmm11, %zmm11, %zmm11
510 vgatherdpd 13952(%rax,%ymm9,8), %zmm11{%k2}
511 vfmadd213pd %zmm11, %zmm15, %zmm11
512 vpaddq %zmm12, %zmm11, %zmm3
513 vcvtpd2ps %zmm3, %ymm5
/* %zmm2 = %ymm5 in the low half, %ymm4 in the high half.  */
514 vinsertf32x8 $1, %ymm4, %zmm5, %zmm2
/* Reload the caller's %zmm26 parked at function entry.  */
520 vmovups 1280(%rsp), %zmm26
523 cfi_def_cfa_register (%rsp)
525 cfi_adjust_cfa_offset (-8)
/* NOTE(review): the code from here on presumably belongs to a
   special-case path selected through k0; the branch and labels are not
   visible in this listing -- confirm.
   Spill both arguments and the vector result so that individual
   elements can be addressed on the stack.  */
531 vmovups %zmm0, 1088(%rsp)
532 vmovups %zmm1, 1152(%rsp)
533 vmovups %zmm2, 1216(%rsp)
/* Save %zmm16-%zmm25 and %zmm27-%zmm31 (%zmm26 was already reloaded
   above), plus %rsi and %r12-%r15, before the scalar calls.  */
542 vmovups %zmm16, 896(%rsp)
543 vmovups %zmm17, 832(%rsp)
544 vmovups %zmm18, 768(%rsp)
545 vmovups %zmm19, 704(%rsp)
546 vmovups %zmm20, 640(%rsp)
547 vmovups %zmm21, 576(%rsp)
548 vmovups %zmm22, 512(%rsp)
549 vmovups %zmm23, 448(%rsp)
550 vmovups %zmm24, 384(%rsp)
551 vmovups %zmm25, 320(%rsp)
552 vmovups %zmm27, 256(%rsp)
553 vmovups %zmm28, 192(%rsp)
554 vmovups %zmm29, 128(%rsp)
555 vmovups %zmm30, 64(%rsp)
556 vmovups %zmm31, (%rsp)
557 movq %rsi, 1000(%rsp)
559 movq %r12, 1032(%rsp)
560 cfi_offset_rel_rsp (12, 1032)
562 movq %r13, 1024(%rsp)
563 cfi_offset_rel_rsp (13, 1024)
565 movq %r14, 1016(%rsp)
566 cfi_offset_rel_rsp (14, 1016)
568 movq %r15, 1008(%rsp)
569 cfi_offset_rel_rsp (15, 1008)
/* Restore everything saved above and reload the (possibly patched)
   result vector from 1216(%rsp) into %zmm2.  */
591 vmovups 896(%rsp), %zmm16
592 vmovups 832(%rsp), %zmm17
593 vmovups 768(%rsp), %zmm18
594 vmovups 704(%rsp), %zmm19
595 vmovups 640(%rsp), %zmm20
596 vmovups 576(%rsp), %zmm21
597 vmovups 512(%rsp), %zmm22
598 vmovups 448(%rsp), %zmm23
599 vmovups 384(%rsp), %zmm24
600 vmovups 320(%rsp), %zmm25
601 vmovups 256(%rsp), %zmm27
602 vmovups 192(%rsp), %zmm28
603 vmovups 128(%rsp), %zmm29
604 vmovups 64(%rsp), %zmm30
605 vmovups (%rsp), %zmm31
606 vmovups 1216(%rsp), %zmm2
607 movq 1000(%rsp), %rsi
609 movq 1032(%rsp), %r12
611 movq 1024(%rsp), %r13
613 movq 1016(%rsp), %r14
615 movq 1008(%rsp), %r15
/* Scalar fallback for the odd element of pair %r15: %xmm0 / %xmm1 come
   from the spilled %zmm0 / %zmm1 images at byte offset 4 + 8*%r15, and
   the powf result overwrites the matching slot of the spilled result.  */
622 vmovss 1156(%rsp,%r15,8), %xmm1
624 vmovss 1092(%rsp,%r15,8), %xmm0
625 call JUMPTARGET(powf)
626 vmovss %xmm0, 1220(%rsp,%r15,8)
/* Same for the even element of pair %r15 (byte offset 8*%r15).  */
631 vmovss 1152(%rsp,%r15,8), %xmm1
633 vmovss 1088(%rsp,%r15,8), %xmm0
634 call JUMPTARGET(powf)
635 vmovss %xmm0, 1216(%rsp,%r15,8)
637 END (_ZGVeN16vv_powf_skx)