sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S

   1 /* Function tanhf vectorized with SSE4.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *   NOTE: Since the hyperbolic tangent function is odd
  23  *         (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
  24  *         value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
  25  *
  26  *   We use a table lookup method to compute tanh(|x|).
  27  *   The basic idea is to split the input range into a number of subintervals
  28  *   and to approximate tanh(.) with a polynomial on each of them.
  29  *
  30  *   IEEE SPECIAL CONDITIONS:
  31  *   x = [+, -]0, r = [+, -]0
  32  *   x = +Inf,   r = +1
  33  *   x = -Inf,   r = -1
  34  *   x = QNaN,   r = QNaN
  35  *   x = SNaN,   r = QNaN
  36  *
  37  *
  38  *   ALGORITHM DETAILS
  39  *   We handle special values in a callout function, aside from main path
  40  *   computations. "Special" for this algorithm are:
  41  *   INF, NAN, |x| > HUGE_THRESHOLD
  42  *
  43  *
  44  *   Main path computations are organized as follows:
  45  *   Actually we split the interval [0, SATURATION_THRESHOLD)
  46  *   into a number of subintervals.  On each subinterval we approximate tanh(.)
  47  *   with a minimax polynomial of pre-defined degree. Polynomial coefficients
  48  *   are computed beforehand and stored in table. We also use
  49  *
  50  *       y := |x| + B,
  51  *
  52  *   here B depends on subinterval and is used to make argument
  53  *   closer to zero.
  54  *   We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
  55  *   where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
  56  *   preserve main path computation logic but return 1.0 for all arguments.
  57  *
  58  *   Hence reconstruction looks as follows:
  59  *   we extract proper polynomial and range reduction coefficients
  60  *        (Pj and B), corresponding to subinterval, to which |x| belongs,
  61  *        and return
  62  *
  63  *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
  64  *
  65  *   NOTE: we use multiprecision technique to multiply and sum the first
  66  *         K terms of the polynomial. So Pj, j = 0..K are stored in
  67  *         table each as a pair of target precision numbers (Pj and PLj) to
  68  *         achieve wider than target precision.
  69  *
  70  *
  71  */
  72
  73
  74 #include <sysdep.h>
  75
  76 /* tanhf data tables for avx2 and sse4 implementations defined here.
  77  */
  78 #define ONLY_DECL_OFFSET
  79 #include "svml_s_tanhf_rodata.S"
  80
  81         .section .text.sse4, "ax", @progbits
  82 ENTRY(_ZGVbN4v_tanhf_sse4)
  83         /* Save copy of input in xmm12.  */
  84         movaps  %xmm0, %xmm12
  85
  86         /* Here huge arguments, INF and NaNs are filtered out to callout. */
  87         movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
  88         pand    %xmm0, %xmm3
  89
  90
  91         /* Selection of arguments between [0, 0x04280000] into xmm3.  */
  92         pxor    %xmm7, %xmm7
  93         /* Save xmm3 for special values check at end.  */
  94         movdqa  %xmm3, %xmm8
  95         psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
  96         pmaxsd  %xmm7, %xmm3
  97         pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
  98         psrld   $14, %xmm3
  99
 100         movq    %xmm3, %rcx
 101         movl    %ecx, %edx
 102         shrq    $32, %rcx
 103
 104         pshufd  $0x0e, %xmm3, %xmm3
 105         movq    %xmm3, %rdi
 106         movl    %edi, %esi
 107         shrq    $32, %rdi
 108
 109         movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
 110         andps   %xmm1, %xmm0
 111
 112         leaq    TANHF_DATA(_lookupTable)(%rip), %rax
 113         movups  (%rdx, %rax), %xmm2
 114         movups  (%rcx, %rax), %xmm6
 115
 116         /*
 117          *  small table specific variables *
 118          *  Constant loading
 119          */
 120         movaps  %xmm2, %xmm4
 121         movlhps %xmm6, %xmm4
 122         unpckhpd %xmm6, %xmm2
 123
 124         cvtps2pd %xmm0, %xmm6
 125         movhlps %xmm0, %xmm0
 126         cvtps2pd %xmm0, %xmm0
 127
 128         movups  16(%rdx, %rax), %xmm5
 129         movups  16(%rsi, %rax), %xmm13
 130
 131         movaps  %xmm5, %xmm10
 132         movaps  %xmm13, %xmm11
 133
 134         movups  16(%rcx, %rax), %xmm7
 135         movups  16(%rdi, %rax), %xmm3
 136
 137         unpckhpd %xmm7, %xmm5
 138         unpckhpd %xmm3, %xmm13
 139
 140         mulpd   %xmm6, %xmm5
 141         mulpd   %xmm0, %xmm13
 142
 143         movlhps %xmm7, %xmm10
 144         movlhps %xmm3, %xmm11
 145
 146         addpd   %xmm10, %xmm5
 147         addpd   %xmm11, %xmm13
 148
 149         mulpd   %xmm6, %xmm5
 150         mulpd   %xmm0, %xmm13
 151
 152         addpd   %xmm2, %xmm5
 153
 154         movups  (%rsi, %rax), %xmm2
 155         movups  (%rdi, %rax), %xmm7
 156
 157         movaps  %xmm2, %xmm3
 158
 159         unpckhpd %xmm7, %xmm2
 160         movlhps %xmm7, %xmm3
 161
 162         addpd   %xmm13, %xmm2
 163
 164         mulpd   %xmm5, %xmm6
 165         addpd   %xmm4, %xmm6
 166
 167         mulpd   %xmm2, %xmm0
 168         addpd   %xmm3, %xmm0
 169
 170         cvtpd2ps %xmm0, %xmm2
 171         cvtpd2ps %xmm6, %xmm0
 172
 173         movlhps %xmm2, %xmm0
 174         andnps  %xmm12, %xmm1
 175         orps    %xmm1, %xmm0
 176
 177         /* xmm8 contains mask of special values.  */
 178         pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
 179
 180         movmskps %xmm8, %edx
 181         testl   %edx, %edx
 182
 183         /* Go to special inputs processing branch */
 184         jne     L(SPECIAL_VALUES_BRANCH)
 185         # LOE rbx rbp r12 r13 r14 r15 xmm0
 186         /* No stack restoration on the fastpath.  */
 187         ret
 188
 189         /* Cold case. edx has 1s where there was a special value that
 190            needs to be handled by a tanhf call. Optimize for code size
 191            more so than speed here. */
 192 L(SPECIAL_VALUES_BRANCH):
 193         # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm12
 194         /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
 195        call entry will be 16-byte aligned. */
 196         subq    $56, %rsp
 197         cfi_def_cfa_offset(64)
 198         movups  %xmm0, 24(%rsp)
 199         movups  %xmm12, 40(%rsp)
 200
 201         /* Use rbx/rbp for callee save registers as they get short
 202        encoding for many instructions (as compared with r12/r13). */
 203         movq    %rbx, (%rsp)
 204         cfi_offset(rbx, -64)
 205         movq    %rbp, 8(%rsp)
 206         cfi_offset(rbp, -56)
 207         /* edx has 1s where there was a special value that needs to be handled
 208            by a tanhf call.  */
 209         movl    %edx, %ebx
 210 L(SPECIAL_VALUES_LOOP):
 211         # LOE rbx rbp r12 r13 r14 r15
 212         /* use rbp as index for special value that is saved across calls to
 213            tanhf. We technically don't need a callee save register here as offset
 214            to rsp is always [0, 12] so we can restore rsp by realigning to 64.
 215            Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
 216            in the loop.  */
 217         xorl    %ebp, %ebp
 218         bsfl    %ebx, %ebp
 219
 220         /* Scalar math function call to process special input.  */
 221         movss   40(%rsp, %rbp, 4), %xmm0
 222         call    tanhf@PLT
 223         /* No good way to avoid the store-forwarding fault this will cause on
 224            return. `lfence` avoids the SF fault but at greater cost as it
 225            serialized stack/callee save restoration.  */
 226         movss   %xmm0, 24(%rsp, %rbp, 4)
 227
 228         leal    -1(%rbx), %eax
 229         andl    %eax, %ebx
 230         jnz     L(SPECIAL_VALUES_LOOP)
 231         # LOE r12 r13 r14 r15
 232         /* All results have been written to 24(%rsp).  */
 233         movups  24(%rsp), %xmm0
 234         movq    (%rsp), %rbx
 235         cfi_restore(rbx)
 236         movq    8(%rsp), %rbp
 237         cfi_restore(rbp)
 238         addq    $56, %rsp
 239         cfi_def_cfa_offset(8)
 240         ret
 241 END(_ZGVbN4v_tanhf_sse4)