sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S

   1 /* Function tanhf vectorized with AVX2.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *   NOTE: Since the hyperbolic tangent function is odd
  23  *         (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
  24  *         value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
  25  *
  26  *   We use a table lookup method to compute tanh(|x|).
  27  *   The basic idea is to split the input range into a number of subintervals
  28  *   and to approximate tanh(.) with a polynomial on each of them.
  29  *
  30  *   IEEE SPECIAL CONDITIONS:
  31  *   x = [+, -]0, r = [+, -]0
  32  *   x = +Inf,   r = +1
  33  *   x = -Inf,   r = -1
  34  *   x = QNaN,   r = QNaN
  35  *   x = SNaN,   r = QNaN
  36  *
  37  *
  38  *   ALGORITHM DETAILS
  39  *   We handle special values in a callout function, aside from main path
  40  *   computations. "Special" for this algorithm are:
  41  *   INF, NAN, |x| > HUGE_THRESHOLD
  42  *
  43  *
  44  *   Main path computations are organized as follows:
  45  *   Actually we split the interval [0, SATURATION_THRESHOLD)
  46  *   into a number of subintervals.  On each subinterval we approximate tanh(.)
  47  *   with a minimax polynomial of pre-defined degree. Polynomial coefficients
  48  *   are computed beforehand and stored in table. We also use
  49  *
  50  *       y := |x| + B,
  51  *
  52  *   here B depends on subinterval and is used to make argument
  53  *   closer to zero.
  54  *   We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
  55  *   where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
  56  *   preserve main path computation logic but return 1.0 for all arguments.
  57  *
  58  *   Hence reconstruction looks as follows:
  59  *   we extract proper polynomial and range reduction coefficients
  60  *        (Pj and B), corresponding to subinterval, to which |x| belongs,
  61  *        and return
  62  *
  63  *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
  64  *
  65  *   NOTE: we use multiprecision technique to multiply and sum the first
  66  *         K terms of the polynomial. So Pj, j = 0..K are stored in
  67  *         table each as a pair of target precision numbers (Pj and PLj) to
  68  *         achieve wider than target precision.
  69  *
  70  *
  71  */
  72
  73 #include <sysdep.h>
  74
  75 /* tanhf data tables for avx2 and sse4 implementations defined here.
  76  */
  77 #include "svml_s_tanhf_rodata.S"
  78
  79         .section .text.avx2, "ax", @progbits
  80 ENTRY(_ZGVdN8v_tanhf_avx2)
  81         /* Here huge arguments, INF and NaNs are filtered out to callout. */
  82         vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
  83         vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
  84
  85         /* Selection of arguments between [0, 0x04280000] into ymm2.  */
  86         vpxor   %ymm3, %ymm3, %ymm3
  87         vpmaxsd %ymm3, %ymm2, %ymm2
  88         vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
  89
  90         /*
  91          *  small table specific variables *
  92          *  Constant loading
  93          */
  94         vpsrld  $14, %ymm2, %ymm1
  95
  96         /* We are splitting xmm1 into 8 GPRs. This may be faster to do with
  97            store/load as we can take advantage of store-forwarding.  */
  98         vmovq   %xmm1, %r8
  99         /* We have eliminated all negative values for ymm1 so no need to sign
 100            extend.  */
 101         movl    %r8d, %r9d
 102         shrq    $32, %r8
 103
 104         /* Store base of lookup table in rax.  */
 105         leaq    TANHF_DATA(_lookupTable)(%rip), %rax
 106
 107         /* Instead of using cross-lane permutes on ymm vectors, use vpinsertf128
 108            with memory operand. This helps alleviate bottleneck on p5.  */
 109         vmovupd 16(%r9, %rax), %xmm5
 110
 111         vpextrq $1, %xmm1, %rsi
 112         movl    %esi, %edi
 113         shrq    $32, %rsi
 114
 115         vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
 116
 117         vextracti128 $1, %ymm1, %xmm2
 118         vmovq   %xmm2, %rdx
 119         movl    %edx, %ecx
 120         shrq    $32, %rdx
 121
 122         vmovupd (%rcx, %rax), %xmm6
 123
 124         vpextrq $1, %xmm2, %r10
 125         movl    %r10d, %r11d
 126         shrq    $32, %r10
 127
 128         vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
 129
 130         vmovupd 16(%r8, %rax), %xmm1
 131         vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
 132         vmovupd (%rdx, %rax), %xmm3
 133         vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
 134
 135         vunpcklpd %ymm3, %ymm6, %ymm7
 136         vunpckhpd %ymm3, %ymm6, %ymm6
 137
 138         vunpcklpd %ymm1, %ymm5, %ymm3
 139         vunpckhpd %ymm1, %ymm5, %ymm1
 140
 141         vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
 142         /* Store special cases in ymm15.  */
 143         vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 144
 145         vandps  %ymm11, %ymm0, %ymm4
 146
 147         vcvtps2pd %xmm4, %ymm5
 148
 149         vextractf128 $1, %ymm4, %xmm4
 150         vcvtps2pd %xmm4, %ymm4
 151
 152         vmovupd 16(%rcx, %rax), %xmm2
 153         vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
 154
 155         vfmadd213pd %ymm3, %ymm5, %ymm1
 156
 157         vmovupd 16(%rdx, %rax), %xmm3
 158         vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
 159
 160         vunpcklpd %ymm3, %ymm2, %ymm10
 161         vunpckhpd %ymm3, %ymm2, %ymm2
 162
 163         vfmadd213pd %ymm10, %ymm4, %ymm2
 164         vfmadd213pd %ymm6, %ymm4, %ymm2
 165         vfmadd213pd %ymm7, %ymm4, %ymm2
 166         vcvtpd2ps %ymm2, %xmm2
 167
 168         vmovupd (%r9, %rax), %xmm7
 169         vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
 170
 171         vmovupd (%r8, %rax), %xmm3
 172         vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
 173
 174         vunpckhpd %ymm3, %ymm7, %ymm4
 175         vunpcklpd %ymm3, %ymm7, %ymm7
 176
 177         vfmadd213pd %ymm4, %ymm5, %ymm1
 178         vfmadd213pd %ymm7, %ymm5, %ymm1
 179
 180
 181         vcvtpd2ps %ymm1, %xmm1
 182         vinsertf128 $1, %xmm2, %ymm1, %ymm1
 183
 184         vmovmskps %ymm15, %edx
 185         vandnps %ymm0, %ymm11, %ymm2
 186         testl   %edx, %edx
 187         /* Go to special inputs processing branch */
 188         jne     L(SPECIAL_VALUES_BRANCH)
 189         # LOE rbx r12 r13 r14 r15 ymm0 ymm1 ymm2
 190         /* Wait until after branch of write over ymm0.  */
 191         vorps   %ymm2, %ymm1, %ymm0
 192         /* No stack restoration on the fastpath.  */
 193         ret
 194
 195
 196         /* Cold case. edx has 1s where there was a special value that
 197            needs to be handled by a tanhf call. Optimize for code size
 198            more so than speed here. */
 199 L(SPECIAL_VALUES_BRANCH):
 200         # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm1 ymm2
 201     /* Use r13 to save/restore the stack. This allows us to use rbp as
 202        callee save register saving code size. */
 203         pushq   %r13
 204         cfi_adjust_cfa_offset(8)
 205         cfi_offset(r13, -16)
 206         /* Need to callee save registers to preserve state across tanhf calls.
 207          */
 208         pushq   %rbx
 209         cfi_adjust_cfa_offset(8)
 210         cfi_offset(rbx, -24)
 211         pushq   %rbp
 212         cfi_adjust_cfa_offset(8)
 213         cfi_offset(rbp, -32)
 214         movq    %rsp, %r13
 215         cfi_def_cfa_register(r13)
 216
 217         /* Align stack and make room for 2x ymm vectors.  */
 218         andq    $-32, %rsp
 219         addq    $-64, %rsp
 220
 221         /* Save all already computed inputs.  */
 222         vorps   %ymm2, %ymm1, %ymm1
 223         vmovaps %ymm1, (%rsp)
 224         /* Save original input (ymm0 unchanged up to this point).  */
 225         vmovaps %ymm0, 32(%rsp)
 226
 227         vzeroupper
 228
 229         /* edx has 1s where there was a special value that needs to be handled
 230            by a tanhf call.  */
 231         movl    %edx, %ebx
 232 L(SPECIAL_VALUES_LOOP):
 233         # LOE rbx rbp r12 r13 r14 r15
 234         /* use rbp as index for special value that is saved across calls to
 235            tanhf. We technically don't need a callee save register here as offset
 236            to rsp is always [0, 28] so we can restore rsp by realigning to 64.
 237            Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
 238            in the loop. Realigning also costs more code size.  */
 239         xorl    %ebp, %ebp
 240         tzcntl  %ebx, %ebp
 241
 242         /* Scalar math function call to process special input.  */
 243         vmovss  32(%rsp, %rbp, 4), %xmm0
 244         call    tanhf@PLT
 245
 246         /* No good way to avoid the store-forwarding fault this will cause on
 247            return. `lfence` avoids the SF fault but at greater cost as it
 248            serialized stack/callee save restoration.  */
 249         vmovss  %xmm0, (%rsp, %rbp, 4)
 250
 251         blsrl   %ebx, %ebx
 252         jnz     L(SPECIAL_VALUES_LOOP)
 253         # LOE r12 r13 r14 r15
 254
 255
 256         /* All results have been written to (%rsp).  */
 257         vmovups (%rsp), %ymm0
 258         /* Restore rsp.  */
 259         movq    %r13, %rsp
 260         cfi_def_cfa_register(rsp)
 261         /* Restore callee save registers.  */
 262         popq    %rbp
 263         cfi_adjust_cfa_offset(-8)
 264         cfi_restore(rbp)
 265         popq    %rbx
 266         cfi_adjust_cfa_offset(-8)
 267         cfi_restore(rbp)
 268         popq    %r13
 269         cfi_adjust_cfa_offset(-8)
 270         cfi_restore(r13)
 271         ret
 272 END(_ZGVdN8v_tanhf_avx2)