sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core_avx2.S

   1 /* Function sinf vectorized with AVX2.
   2    Copyright (C) 2014-2019 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_trig_data.h"
  21
  22         .text
  23 ENTRY(_ZGVdN8v_sinf_avx2)
  24 /*
  25    ALGORITHM DESCRIPTION:
  26
  27    1) Range reduction to [-Pi/2; +Pi/2] interval
  28       a) Grab sign from source argument and save it.
  29       b) Remove sign using AND operation
  30       c) Getting octant Y by 1/Pi multiplication
  31       d) Add "Right Shifter" value
  32       e) Treat obtained value as integer for destination sign setting.
  33          Shift first bit of this value to the last (sign) position
  34       f) Change destination sign if source sign is negative
  35          using XOR operation.
  36       g) Subtract "Right Shifter" value
  37       h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  38          X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
  39    2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
  40       a) Calculate X^2 = X * X
  41       b) Calculate polynomial:
  42          R = X + X * X^2 * (A3 + x^2 * (A5 + ......
  43    3) Destination sign setting
  44       a) Set shifted destination sign using XOR operation:
  45          R = XOR( R, S );
  46  */
  47         pushq     %rbp
  48         cfi_adjust_cfa_offset (8)
  49         cfi_rel_offset (%rbp, 0)
  50         movq      %rsp, %rbp
  51         cfi_def_cfa_register (%rbp)
  52         andq      $-64, %rsp
  53         subq      $448, %rsp
  54         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
  55         vmovdqa   %ymm0, %ymm5
  56         vmovups __sAbsMask(%rax), %ymm3
  57         vmovups __sInvPI(%rax), %ymm7
  58         vmovups __sRShifter(%rax), %ymm0
  59         vmovups __sPI1_FMA(%rax), %ymm1
  60
  61 /* b) Remove sign using AND operation */
  62         vandps    %ymm3, %ymm5, %ymm4
  63
  64 /*
  65   c) Getting octant Y by 1/Pi multiplication
  66   d) Add "Right Shifter" value
  67  */
  68         vfmadd213ps %ymm0, %ymm4, %ymm7
  69
  70 /* g) Subtract "Right Shifter" value */
  71         vsubps    %ymm0, %ymm7, %ymm2
  72
  73 /*
  74   e) Treat obtained value as integer for destination sign setting.
  75   Shift first bit of this value to the last (sign) position
  76  */
  77         vpslld    $31, %ymm7, %ymm6
  78
  79 /*
  80   h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  81   X = X - Y*PI1 - Y*PI2 - Y*PI3;
  82  */
  83         vmovdqa   %ymm4, %ymm0
  84         vfnmadd231ps %ymm2, %ymm1, %ymm0
  85
  86 /* Check for large and special values */
  87         vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
  88         vfnmadd231ps __sPI2_FMA(%rax), %ymm2, %ymm0
  89         vfnmadd132ps __sPI3_FMA(%rax), %ymm0, %ymm2
  90
  91 /*
  92   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
  93   a) Calculate X^2 = X * X
  94   b) Calculate polynomial:
  95   R = X + X * X^2 * (A3 + x^2 * (A5 + ......
  96  */
  97         vmulps    %ymm2, %ymm2, %ymm1
  98
  99 /*
 100   f) Change destination sign if source sign is negative
 101   using XOR operation.
 102  */
 103         vandnps   %ymm5, %ymm3, %ymm0
 104         vxorps    %ymm6, %ymm2, %ymm3
 105         vmovups __sA9(%rax), %ymm2
 106         vfmadd213ps __sA7(%rax), %ymm1, %ymm2
 107         vfmadd213ps __sA5(%rax), %ymm1, %ymm2
 108         vfmadd213ps __sA3(%rax), %ymm1, %ymm2
 109         vmulps    %ymm1, %ymm2, %ymm6
 110         vfmadd213ps %ymm3, %ymm3, %ymm6
 111         vmovmskps %ymm4, %ecx
 112
 113 /*
 114   3) Destination sign setting
 115   a) Set shifted destination sign using XOR operation:
 116   R = XOR( R, S );
 117  */
 118         vxorps    %ymm0, %ymm6, %ymm0
 119         testl     %ecx, %ecx
 120         jne       .LBL_1_3
 121
 122 .LBL_1_2:
 123         cfi_remember_state
 124         movq      %rbp, %rsp
 125         cfi_def_cfa_register (%rsp)
 126         popq      %rbp
 127         cfi_adjust_cfa_offset (-8)
 128         cfi_restore (%rbp)
 129         ret
 130
 131 .LBL_1_3:
 132         cfi_restore_state
 133         vmovups   %ymm5, 320(%rsp)
 134         vmovups   %ymm0, 384(%rsp)
 135         je        .LBL_1_2
 136
 137         xorb      %dl, %dl
 138         xorl      %eax, %eax
 139         vmovups   %ymm8, 224(%rsp)
 140         vmovups   %ymm9, 192(%rsp)
 141         vmovups   %ymm10, 160(%rsp)
 142         vmovups   %ymm11, 128(%rsp)
 143         vmovups   %ymm12, 96(%rsp)
 144         vmovups   %ymm13, 64(%rsp)
 145         vmovups   %ymm14, 32(%rsp)
 146         vmovups   %ymm15, (%rsp)
 147         movq      %rsi, 264(%rsp)
 148         movq      %rdi, 256(%rsp)
 149         movq      %r12, 296(%rsp)
 150         cfi_offset_rel_rsp (12, 296)
 151         movb      %dl, %r12b
 152         movq      %r13, 288(%rsp)
 153         cfi_offset_rel_rsp (13, 288)
 154         movl      %ecx, %r13d
 155         movq      %r14, 280(%rsp)
 156         cfi_offset_rel_rsp (14, 280)
 157         movl      %eax, %r14d
 158         movq      %r15, 272(%rsp)
 159         cfi_offset_rel_rsp (15, 272)
 160         cfi_remember_state
 161
 162 .LBL_1_6:
 163         btl       %r14d, %r13d
 164         jc        .LBL_1_12
 165
 166 .LBL_1_7:
 167         lea       1(%r14), %esi
 168         btl       %esi, %r13d
 169         jc        .LBL_1_10
 170
 171 .LBL_1_8:
 172         incb      %r12b
 173         addl      $2, %r14d
 174         cmpb      $16, %r12b
 175         jb        .LBL_1_6
 176
 177         vmovups   224(%rsp), %ymm8
 178         vmovups   192(%rsp), %ymm9
 179         vmovups   160(%rsp), %ymm10
 180         vmovups   128(%rsp), %ymm11
 181         vmovups   96(%rsp), %ymm12
 182         vmovups   64(%rsp), %ymm13
 183         vmovups   32(%rsp), %ymm14
 184         vmovups   (%rsp), %ymm15
 185         vmovups   384(%rsp), %ymm0
 186         movq      264(%rsp), %rsi
 187         movq      256(%rsp), %rdi
 188         movq      296(%rsp), %r12
 189         cfi_restore (%r12)
 190         movq      288(%rsp), %r13
 191         cfi_restore (%r13)
 192         movq      280(%rsp), %r14
 193         cfi_restore (%r14)
 194         movq      272(%rsp), %r15
 195         cfi_restore (%r15)
 196         jmp       .LBL_1_2
 197
 198 .LBL_1_10:
 199         cfi_restore_state
 200         movzbl    %r12b, %r15d
 201         vmovss    324(%rsp,%r15,8), %xmm0
 202         vzeroupper
 203
 204         call      JUMPTARGET(sinf)
 205
 206         vmovss    %xmm0, 388(%rsp,%r15,8)
 207         jmp       .LBL_1_8
 208
 209 .LBL_1_12:
 210         movzbl    %r12b, %r15d
 211         vmovss    320(%rsp,%r15,8), %xmm0
 212         vzeroupper
 213
 214         call      JUMPTARGET(sinf)
 215
 216         vmovss    %xmm0, 384(%rsp,%r15,8)
 217         jmp       .LBL_1_7
 218
 219 END(_ZGVdN8v_sinf_avx2)