/* Function cosh vectorized with SSE4.
   Copyright (C) 2021-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *	Compute cosh(x) as (exp(x)+exp(-x))/2,
 *	where exp is calculated as
 *	exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
 *
 *	Special cases:
 *
 *	cosh(NaN) = quiet NaN, and raises the invalid exception
 *	cosh(+/-INF) = +INF
 *	cosh(0) = 1
 *	cosh(x) overflows when |x| exceeds roughly MAXLOG + log(2)
 *
 */

/* Offsets for data table __svml_dcosh_data_internal.  */
#define _dbT			0
#define _dbInvLn2		2064
#define _dbLn2hi		2080
#define _dbLn2lo		2096
#define _dbShifter		2112
#define _iIndexMask		2128
#define _dPC2			2144
#define _dPC3			2160
#define _dPC4			2176
#define _iMaxIndex		2192
#define _lExpMask		2208
#define _dSign			2224
#define _iDomainRange		2240

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2v_cosh_sse4)
	subq	$72, %rsp
	cfi_def_cfa_offset(80)
	movaps	%xmm0, %xmm4
	movups	_dSign+__svml_dcosh_data_internal(%rip), %xmm2
	lea	_dbT+__svml_dcosh_data_internal(%rip), %r8

	/* Abs argument */
	movaps	%xmm2, %xmm5

	/* dXSign=0x0010000000000000 */
	psrlq	$11, %xmm2

	/*
	 * Load argument
	 * dM = x*2^K/log(2) + RShifter
	 */
	movups	_dbInvLn2+__svml_dcosh_data_internal(%rip), %xmm3
	andnps	%xmm4, %xmm5
	mulpd	%xmm5, %xmm3
	movups	_dbShifter+__svml_dcosh_data_internal(%rip), %xmm1
	addpd	%xmm1, %xmm3

	/*
	 * R
	 * dN = dM - RShifter
	 */
	movaps	%xmm3, %xmm15
	subpd	%xmm1, %xmm15

	/* dR = dX - dN*Log2_hi/2^K */
	movups	_dbLn2hi+__svml_dcosh_data_internal(%rip), %xmm14
	mulpd	%xmm15, %xmm14

	/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
	movups	_dbLn2lo+__svml_dcosh_data_internal(%rip), %xmm1
	mulpd	%xmm15, %xmm1

	/* Check for overflow/underflow.  */
	pshufd	$221, %xmm5, %xmm7
	subpd	%xmm14, %xmm5
	movq	_iIndexMask+__svml_dcosh_data_internal(%rip), %xmm8

	/* Index and lookup */
	pshufd	$136, %xmm3, %xmm9

	/*
	 * G1, G2, G3: dTdif, dTn * 2^N, 2^(-N)
	 * NB: copied from sinh_la - to be optimized!!!!!
	 */
	psllq	$44, %xmm3

	/* trick: iIndex = 256 - iIndex, the table index for 2^(-j/2^K) */
	movq	_iMaxIndex+__svml_dcosh_data_internal(%rip), %xmm12
	pand	%xmm8, %xmm9
	subpd	%xmm1, %xmm5
	psubd	%xmm9, %xmm12

	/* iIndex *= 8 (pslld $3 scales it to a byte offset) */
	movdqa	%xmm9, %xmm10

	/* (iMaxIndex - iIndex) *= 8 likewise */
	pslld	$3, %xmm12
	pslld	$3, %xmm10
	movd	%xmm12, %esi
	pshufd	$1, %xmm12, %xmm13
	movq	_iDomainRange+__svml_dcosh_data_internal(%rip), %xmm6
	movd	%xmm13, %edi
	pcmpgtd	%xmm6, %xmm7
	movmskps %xmm7, %eax

	/* dR2 = dR^2 */
	movaps	%xmm5, %xmm7

	/* lM now is an EXP(2^N) */
	pand	_lExpMask+__svml_dcosh_data_internal(%rip), %xmm3
	pshufd	$1, %xmm10, %xmm11
	movslq	%esi, %rsi
	mulpd	%xmm5, %xmm7
	movd	%xmm10, %edx
	movsd	(%r8, %rsi), %xmm6
	movd	%xmm11, %ecx
	movslq	%edi, %rdi
	movslq	%edx, %rdx
	movslq	%ecx, %rcx
	movhpd	(%r8, %rdi), %xmm6
	psubq	%xmm3, %xmm6

	/* lX- = EXP(1/2) */
	psubq	%xmm2, %xmm6
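	/*
	 * Editorial note, reconstructed from the surrounding comments and
	 * not part of the upstream commentary: writing |x| = a + r with
	 * a = M*ln2 + j*ln2/2^K, the code below evaluates
	 *
	 *   cosh(x) = cosh(a)*cosh(r) + sinh(a)*sinh(r)
	 *
	 * where dTp = dTn*2^N + dTn*2^-N tracks cosh(a) and the matching
	 * difference tracks sinh(a) (up to a factor of 2 folded into the
	 * table), while sinh(r) ~ r + a3*r^3 and
	 * cosh(r) ~ 1 + a2*r^2 + a4*r^4 are short polynomials.
	 */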
	/*
	 * sinh(r) = r + r*r^2*a3 ....
	 * dSinh_r = r^2*a3
	 */
	movups	_dPC3+__svml_dcosh_data_internal(%rip), %xmm2
	mulpd	%xmm7, %xmm2

	/* dSinh_r = r + r*r^2*a3 */
	mulpd	%xmm5, %xmm2
	movsd	(%r8, %rdx), %xmm0
	movhpd	(%r8, %rcx), %xmm0
	paddq	%xmm3, %xmm0
	addpd	%xmm2, %xmm5

	/* dTn = dTn*2^N - dTn*2^-N */
	movaps	%xmm0, %xmm3
	subpd	%xmm6, %xmm3

	/* dTp = dTn*2^N + dTn*2^-N */
	addpd	%xmm6, %xmm0
	mulpd	%xmm5, %xmm3

	/* poly(r) = dTp + dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	movups	_dPC4+__svml_dcosh_data_internal(%rip), %xmm5
	mulpd	%xmm7, %xmm5
	addpd	_dPC2+__svml_dcosh_data_internal(%rip), %xmm5
	mulpd	%xmm5, %xmm7

	/* dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	mulpd	%xmm0, %xmm7
	addpd	%xmm7, %xmm3

	/* _VRES1 = dTp + dTn*sinh(dR) + dTp*dR2*(a2 + a4*dR2) */
	addpd	%xmm3, %xmm0
	andl	$3, %eax

	/* Ret H */

	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm4

	/* Restore registers and exit the function.  */

L(EXIT):
	addq	$72, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(80)

	/* Branch to process special inputs.  */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm4, 32(%rsp)
	movups	%xmm0, 48(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 eax xmm0

	xorl	%edx, %edx
	movq	%r12, 16(%rsp)
	cfi_offset(12, -64)
	movl	%edx, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -72)
	movl	%eax, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -80)
	# LOE rbx rbp r15 r12d r13d

	/* Range mask bits check.  */

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

	/* Call scalar math function.  */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

	/* Special inputs processing loop.  */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$2, %r12d

	/* Check bits in range mask.  */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	48(%rsp), %xmm0

	/* Go to exit.  */
	jmp	L(EXIT)
	cfi_offset(12, -64)
	cfi_offset(13, -72)
	cfi_offset(14, -80)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

	/* Scalar math function call to process special input.  */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movsd	32(%rsp, %r14, 8), %xmm0
	call	cosh@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movsd	%xmm0, 48(%rsp, %r14, 8)

	/* Process special inputs in loop.  */
	jmp	L(SPECIAL_VALUES_LOOP)
	# LOE rbx rbp r15 r12d r13d
END(_ZGVbN2v_cosh_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_dcosh_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _dbT[(1+(1<<8))][2]; // dTpj ONLY!
	__declspec(align(16)) VUINT32 _dbInvLn2[2][2];
	__declspec(align(16)) VUINT32 _dbLn2hi[2][2];
	__declspec(align(16)) VUINT32 _dbLn2lo[2][2];
	__declspec(align(16)) VUINT32 _dbShifter[2][2];
	__declspec(align(16)) VUINT32 _iIndexMask[4][1]; // (1<<K)-1
	__declspec(align(16)) VUINT32 _dPC2[2][2];
	__declspec(align(16)) VUINT32 _dPC3[2][2];
	__declspec(align(16)) VUINT32 _dPC4[2][2];
	__declspec(align(16)) VUINT32 _iMaxIndex[4][1]; // (1<<K)
	__declspec(align(16)) VUINT32 _lExpMask[2][2];
	__declspec(align(16)) VUINT32 _dSign[2][2]; // 0x8000000000000000
	__declspec(align(16)) VUINT32 _iDomainRange[4][1];
} __svml_dcosh_data_internal;
#endif
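/*
 * Editorial sketch, not part of the original file: a minimal scalar C
 * model of the reduction and reconstruction implemented above, assuming
 * K = 8 index bits (matching _iIndexMask = (1<<K)-1) and using exp2()
 * in place of the _dbT table lookup.  Taylor terms stand in for the
 * minimax coefficients _dPC2.._dPC4; the function name is illustrative.
 *
 *   #include <math.h>
 *
 *   #define K 8
 *
 *   double cosh_model(double x)
 *   {
 *       double ax = fabs(x);
 *
 *       // n = round(ax * 2^K / ln2) via the right-shifter trick:
 *       // adding 2^52 + 2^51 forces rounding to an integer in the
 *       // low mantissa bits.
 *       const double shifter = 0x1.8p52;
 *       double dm = ax * ((1 << K) / log(2.0)) + shifter;
 *       double n = dm - shifter;               // integer-valued
 *
 *       // r = ax - n*ln2/2^K; the real code splits ln2 into
 *       // _dbLn2hi/_dbLn2lo for extra accuracy.
 *       double r = ax - n * (log(2.0) / (1 << K));
 *
 *       long k = (long) n;
 *       int j = k & ((1 << K) - 1);            // table index
 *       int m = (int) (k >> K);                // binary exponent
 *
 *       // e^(ax - r) = 2^m * 2^(j/2^K); the real code reads
 *       // 2^(j/2^K) from _dbT and adds m into the exponent field.
 *       double ea = ldexp(exp2((double) j / (1 << K)), m);
 *       double cosh_a = 0.5 * (ea + 1.0 / ea);
 *       double sinh_a = 0.5 * (ea - 1.0 / ea);
 *
 *       // Short polynomials for cosh(r) and sinh(r); |r| is tiny.
 *       double r2 = r * r;
 *       double sinh_r = r + r * r2 * (1.0 / 6.0);
 *       double cosh_r = 1.0 + r2 * (0.5 + r2 * (1.0 / 24.0));
 *
 *       // cosh(|x|) = cosh(a)cosh(r) + sinh(a)sinh(r), a = ax - r.
 *       return cosh_a * cosh_r + sinh_a * sinh_r;
 *   }
 */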