]>
Commit | Line | Data |
---|---|---|
c0f36fc3 | 1 | /* Function tanhf vectorized with AVX2. |
581c785b | 2 | Copyright (C) 2021-2022 Free Software Foundation, Inc. |
c0f36fc3 SP |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <https://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* | |
20 | * ALGORITHM DESCRIPTION: | |
21 | * | |
22 | * NOTE: Since the hyperbolic tangent function is odd | |
23 | * (tanh(x) = -tanh(-x)), below algorithm deals with the absolute | |
24 | * value of the argument |x|: tanh(x) = sign(x) * tanh(|x|) | |
25 | * | |
26 | * We use a table lookup method to compute tanh(|x|). | |
27 | * The basic idea is to split the input range into a number of subintervals | |
28 | * and to approximate tanh(.) with a polynomial on each of them. | |
29 | * | |
30 | * IEEE SPECIAL CONDITIONS: | |
2c632117 | 31 | * x = [+, -]0, r = [+, -]0 |
c0f36fc3 SP |
32 | * x = +Inf, r = +1 |
33 | * x = -Inf, r = -1 | |
34 | * x = QNaN, r = QNaN | |
35 | * x = SNaN, r = QNaN | |
36 | * | |
37 | * | |
38 | * ALGORITHM DETAILS | |
39 | * We handle special values in a callout function, aside from main path | |
40 | * computations. "Special" for this algorithm are: | |
41 | * INF, NAN, |x| > HUGE_THRESHOLD | |
42 | * | |
43 | * | |
44 | * Main path computations are organized as follows: | |
45 | * Actually we split the interval [0, SATURATION_THRESHOLD) | |
46 | * into a number of subintervals. On each subinterval we approximate tanh(.) | |
47 | * with a minimax polynomial of pre-defined degree. Polynomial coefficients | |
48 | * are computed beforehand and stored in table. We also use | |
49 | * | |
50 | * y := |x| + B, | |
51 | * | |
52 | * here B depends on subinterval and is used to make argument | |
53 | * closer to zero. | |
54 | * We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD], | |
55 | * where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to | |
56 | * preserve main path computation logic but return 1.0 for all arguments. | |
57 | * | |
58 | * Hence reconstruction looks as follows: | |
59 | * we extract proper polynomial and range reduction coefficients | |
60 | * (Pj and B), corresponding to subinterval, to which |x| belongs, | |
61 | * and return | |
62 | * | |
63 | * r := sign(x) * (P0 + P1 * y + ... + Pn * y^n) | |
64 | * | |
65 | * NOTE: we use multiprecision technique to multiply and sum the first | |
66 | * K terms of the polynomial. So Pj, j = 0..K are stored in | |
67 | * table each as a pair of target precision numbers (Pj and PLj) to | |
68 | * achieve wider than target precision. | |
69 | * | |
70 | * | |
71 | */ | |
72 | ||
c0f36fc3 SP |
73 | #include <sysdep.h> |
74 | ||
bcc41f66 NG |
75 | /* tanhf data tables for avx2 and sse4 implementations defined here.
76 | */ | |
77 | #include "svml_s_tanhf_rodata.S" | |
78 | ||
2c632117 | 79 | .section .text.avx2, "ax", @progbits |
c0f36fc3 | 80 | ENTRY(_ZGVdN8v_tanhf_avx2) |
2c632117 | 81 | /* Here huge arguments, INF and NaNs are filtered out to callout. */ |
bcc41f66 NG |
82 | vpand TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4 |
83 | vpsubd TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2 | |
84 | ||
85 | /* Selection of arguments between [0, 0x04280000] into ymm2. */ | |
86 | vpxor %ymm3, %ymm3, %ymm3 | |
87 | vpmaxsd %ymm3, %ymm2, %ymm2 | |
88 | vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2 | |
c0f36fc3 | 89 | |
2c632117 SP |
90 | /* |
91 | * small table specific variables * | |
92 | * Constant loading | |
93 | */ | |
bcc41f66 NG |
94 | vpsrld $14, %ymm2, %ymm1 |
95 | ||
96 | /* We are splitting xmm1 into 8 GPRs. This may be faster to do with | |
97 | store/load as we can take advantage of store-forwarding. */ | |
98 | vmovq %xmm1, %r8 | |
99 | /* We have eliminated all negative values for ymm1 so no need to sign | |
100 | extend. */ | |
101 | movl %r8d, %r9d | |
102 | shrq $32, %r8 | |
103 | ||
104 | /* Store base of lookup table in rax. */ | |
105 | leaq TANHF_DATA(_lookupTable)(%rip), %rax | |
106 | ||
107 | /* Instead of using cross-lane permutes on ymm vectors, use vpinsertf128 | |
108 | with memory operand. This helps alleviate bottleneck on p5. */ | |
109 | vmovupd 16(%r9, %rax), %xmm5 | |
110 | ||
111 | vpextrq $1, %xmm1, %rsi | |
112 | movl %esi, %edi | |
113 | shrq $32, %rsi | |
114 | ||
115 | vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5 | |
116 | ||
117 | vextracti128 $1, %ymm1, %xmm2 | |
118 | vmovq %xmm2, %rdx | |
119 | movl %edx, %ecx | |
120 | shrq $32, %rdx | |
121 | ||
122 | vmovupd (%rcx, %rax), %xmm6 | |
123 | ||
124 | vpextrq $1, %xmm2, %r10 | |
125 | movl %r10d, %r11d | |
126 | shrq $32, %r10 | |
127 | ||
128 | vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6 | |
129 | ||
130 | vmovupd 16(%r8, %rax), %xmm1 | |
131 | vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1 | |
132 | vmovupd (%rdx, %rax), %xmm3 | |
133 | vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3 | |
134 | ||
135 | vunpcklpd %ymm3, %ymm6, %ymm7 | |
2c632117 | 136 | vunpckhpd %ymm3, %ymm6, %ymm6 |
c0f36fc3 | 137 | |
bcc41f66 NG |
138 | vunpcklpd %ymm1, %ymm5, %ymm3 |
139 | vunpckhpd %ymm1, %ymm5, %ymm1 | |
c0f36fc3 | 140 | |
bcc41f66 NG |
141 | vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11 |
142 | /* Store special cases in ymm15. */ | |
143 | vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15 | |
c0f36fc3 | 144 | |
bcc41f66 | 145 | vandps %ymm11, %ymm0, %ymm4 |
c0f36fc3 | 146 | |
bcc41f66 | 147 | vcvtps2pd %xmm4, %ymm5 |
c0f36fc3 | 148 | |
bcc41f66 NG |
149 | vextractf128 $1, %ymm4, %xmm4 |
150 | vcvtps2pd %xmm4, %ymm4 | |
c0f36fc3 | 151 | |
bcc41f66 NG |
152 | vmovupd 16(%rcx, %rax), %xmm2 |
153 | vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2 | |
c0f36fc3 | 154 | |
bcc41f66 NG |
155 | vfmadd213pd %ymm3, %ymm5, %ymm1 |
156 | ||
157 | vmovupd 16(%rdx, %rax), %xmm3 | |
158 | vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3 | |
159 | ||
160 | vunpcklpd %ymm3, %ymm2, %ymm10 | |
161 | vunpckhpd %ymm3, %ymm2, %ymm2 | |
162 | ||
163 | vfmadd213pd %ymm10, %ymm4, %ymm2 | |
164 | vfmadd213pd %ymm6, %ymm4, %ymm2 | |
165 | vfmadd213pd %ymm7, %ymm4, %ymm2 | |
166 | vcvtpd2ps %ymm2, %xmm2 | |
167 | ||
168 | vmovupd (%r9, %rax), %xmm7 | |
169 | vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7 | |
170 | ||
171 | vmovupd (%r8, %rax), %xmm3 | |
172 | vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3 | |
173 | ||
174 | vunpckhpd %ymm3, %ymm7, %ymm4 | |
175 | vunpcklpd %ymm3, %ymm7, %ymm7 | |
c0f36fc3 | 176 | |
bcc41f66 NG |
177 | vfmadd213pd %ymm4, %ymm5, %ymm1 |
178 | vfmadd213pd %ymm7, %ymm5, %ymm1 | |
179 | ||
180 | ||
181 | vcvtpd2ps %ymm1, %xmm1 | |
182 | vinsertf128 $1, %xmm2, %ymm1, %ymm1 | |
183 | ||
184 | vmovmskps %ymm15, %edx | |
185 | vandnps %ymm0, %ymm11, %ymm2 | |
186 | testl %edx, %edx | |
187 | /* Go to special inputs processing branch */ | |
188 | jne L(SPECIAL_VALUES_BRANCH) | |
189 | # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2 | |
190 | /* Wait until after branch of write over ymm0. */ | |
191 | vorps %ymm2, %ymm1, %ymm0 | |
192 | /* No stack restoration on the fastpath. */ | |
193 | ret | |
c0f36fc3 | 194 | |
c0f36fc3 | 195 | |
bcc41f66 NG |
196 | /* Cold case. edx has 1s where there was a special value that |
197 | needs to be handled by a tanhf call. Optimize for code size | |
198 | more so than speed here. */ | |
199 | L(SPECIAL_VALUES_BRANCH): | |
200 | # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2 | |
201 | /* Use r13 to save/restore the stack. This allows us to use rbp as | |
202 | callee save register saving code size. */ | |
203 | pushq %r13 | |
204 | cfi_adjust_cfa_offset(8) | |
205 | cfi_offset(r13, -16) | |
206 | /* Need to callee save registers to preserve state across tanhf calls. | |
2c632117 | 207 | */ |
bcc41f66 NG |
208 | pushq %rbx |
209 | cfi_adjust_cfa_offset(8) | |
210 | cfi_offset(rbx, -24) | |
211 | pushq %rbp | |
212 | cfi_adjust_cfa_offset(8) | |
213 | cfi_offset(rbp, -32) | |
214 | movq %rsp, %r13 | |
215 | cfi_def_cfa_register(r13) | |
216 | ||
217 | /* Align stack and make room for 2x ymm vectors. */ | |
218 | andq $-32, %rsp | |
219 | addq $-64, %rsp | |
220 | ||
221 | /* Save all already computed inputs. */ | |
222 | vorps %ymm2, %ymm1, %ymm1 | |
223 | vmovaps %ymm1, (%rsp) | |
224 | /* Save original input (ymm0 unchanged up to this point). */ | |
225 | vmovaps %ymm0, 32(%rsp) | |
226 | ||
227 | vzeroupper | |
c0f36fc3 | 228 | |
bcc41f66 NG |
229 | /* edx has 1s where there was a special value that needs to be handled |
230 | by a tanhf call. */ | |
231 | movl %edx, %ebx | |
c0f36fc3 | 232 | L(SPECIAL_VALUES_LOOP): |
bcc41f66 NG |
233 | # LOE rbx rbp r12 r13 r14 r15 |
234 | /* use rbp as index for special value that is saved across calls to | |
235 | tanhf. We technically don't need a callee save register here as offset | |
236 | to rsp is always [0, 28] so we can restore rsp by realigning to 64. | |
237 | Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions | |
238 | in the loop. Realigning also costs more code size. */ | |
239 | xorl %ebp, %ebp | |
240 | tzcntl %ebx, %ebp | |
c0f36fc3 | 241 | |
bcc41f66 | 242 | /* Scalar math function call to process special input. */ |
3079f652 | 243 | vmovss 32(%rsp, %rbp, 4), %xmm0 |
2c632117 | 244 | call tanhf@PLT |
c0f36fc3 | 245 | |
bcc41f66 NG |
246 | /* No good way to avoid the store-forwarding fault this will cause on |
247 | return. `lfence` avoids the SF fault but at greater cost as it | |
248 | serialized stack/callee save restoration. */ | |
3079f652 | 249 | vmovss %xmm0, (%rsp, %rbp, 4) |
bcc41f66 NG |
250 | |
251 | blsrl %ebx, %ebx | |
252 | jnz L(SPECIAL_VALUES_LOOP) | |
253 | # LOE r12 r13 r14 r15 | |
c0f36fc3 | 254 | |
c0f36fc3 | 255 | |
bcc41f66 NG |
256 | /* All results have been written to (%rsp). */ |
257 | vmovups (%rsp), %ymm0 | |
258 | /* Restore rsp. */ | |
259 | movq %r13, %rsp | |
260 | cfi_def_cfa_register(rsp) | |
261 | /* Restore callee save registers. */ | |
262 | popq %rbp | |
263 | cfi_adjust_cfa_offset(-8) | |
264 | cfi_restore(rbp) | |
265 | popq %rbx | |
266 | cfi_adjust_cfa_offset(-8) | |
267 | cfi_restore(rbp) | |
268 | popq %r13 | |
269 | cfi_adjust_cfa_offset(-8) | |
270 | cfi_restore(r13) | |
271 | ret | |
272 | END(_ZGVdN8v_tanhf_avx2) |