/* Function sincosf vectorized with AVX2.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_trig_data.h"

	.text
ENTRY (_ZGVdN8vl4l4_sincosf_avx2)
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/4; +Pi/4] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 2/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer S for destination sign setting.
         SS = ((S-S&1)&2)<<30; For sin part
         SC = ((S+S&1)&2)<<30; For cos part
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" (0x4B000000) value
      h) Subtract Y*(PI/2) from X argument, where PI/2 is divided into 4 parts:
         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
   2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
      a) Calculate X^2 = X * X
      b) Calculate 2 polynomials for sin and cos:
         RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
         RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
      c) Swap RS & RC if the first bit of the obtained value after
         Right Shifting is set to 1, using And, Andnot & Or operations.
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R1 = XOR( RS, SS );
         R2 = XOR( RC, SC ).  */
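
/* Illustrative scalar sketch of the steps above (not part of the original
   source; PI1..PI3, A3..A9 and RShifter stand for the __sPI*_FMA, __sA*
   and __sRShifter entries of svml_s_trig_data.h, everything else is a
   placeholder name):

     y  = x * (2/Pi) + RShifter;               // octant, biased into mantissa
     n  = y - RShifter;                        // octant as a float
     r  = x - n * PI1 - n * PI2 - n * PI3;     // reduced argument for sin
     nc = n + copysignf (0.5f, r);             // half-step shift for cos
     rc = x - nc * PI1 - nc * PI2 - nc * PI3;  // reduced argument for cos
     p (t) = t + t * t^2 * (A3 + t^2 * (A5 + t^2 * (A7 + t^2 * A9)));
     sin(x) ~ sign_fixup (p (r));              // XOR in octant/source sign
     cos(x) ~ sign_fixup (p (rc));  */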

	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$448, %rsp
	movq	__svml_s_trig_data@GOTPCREL(%rip), %rax
	vmovdqa	%ymm0, %ymm5
	vmovups	%ymm13, 352(%rsp)
	vmovups	__sAbsMask(%rax), %ymm2
	vmovups	__sInvPI(%rax), %ymm1
	vmovups	__sPI1_FMA(%rax), %ymm13
	vmovups	%ymm15, 288(%rsp)

/* Absolute argument computation */
	vandps	%ymm2, %ymm5, %ymm4

/* c) Getting octant Y by 2/Pi multiplication
   d) Add "Right Shifter" value */
	vfmadd213ps __sRShifter(%rax), %ymm4, %ymm1
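
/* Adding the large __sRShifter constant (0x4B000000, i.e. 2^23) forces the
   octant to be rounded to the nearest integer in the low mantissa bits of
   the sum, so the same value can feed the sign-bit extraction below and,
   after the constant is subtracted again, serve as the octant count in
   floating point.  */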

/* e) Treat obtained value as integer S for destination sign setting */
	vpslld	$31, %ymm1, %ymm0

/* g) Subtract "Right Shifter" (0x4B000000) value */
	vsubps	__sRShifter(%rax), %ymm1, %ymm1

/* h) Subtract Y*(PI/2) from X argument, where PI/2 is divided into 3 parts:
   X = X - Y*PI1 - Y*PI2 - Y*PI3 */
	vmovdqa	%ymm4, %ymm7
	vfnmadd231ps %ymm1, %ymm13, %ymm7
	vfnmadd231ps __sPI2_FMA(%rax), %ymm1, %ymm7
	vandps	__sSignMask(%rax), %ymm7, %ymm15
	vxorps	__sOneHalf(%rax), %ymm15, %ymm6

/* Add correction term 0.5 for cos() part */
	vaddps	%ymm6, %ymm1, %ymm6
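
/* cos(x) = sin(x + Pi/2): the cos() path reuses the sin reduction and
   polynomial on an octant shifted by half a step; the 0.5 correction takes
   the sign of the partially reduced argument via the __sSignMask/__sOneHalf
   operations above.  */
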
	vmovdqa	%ymm4, %ymm3
	vfnmadd231ps %ymm6, %ymm13, %ymm3
	vmovups	__sPI3_FMA(%rax), %ymm13
	vcmpnle_uqps __sRangeReductionVal(%rax), %ymm4, %ymm4
	vfnmadd231ps __sPI2_FMA(%rax), %ymm6, %ymm3
	vfnmadd213ps %ymm7, %ymm13, %ymm1
	vfnmadd213ps %ymm3, %ymm13, %ymm6

/* Result sign calculations */
	vxorps	__sSignMask(%rax), %ymm15, %ymm3
	vxorps	%ymm0, %ymm3, %ymm7
	vxorps	%ymm7, %ymm6, %ymm3
	vxorps	%ymm0, %ymm1, %ymm15
	vandnps	%ymm5, %ymm2, %ymm6
	vmovups	__sA7_FMA(%rax), %ymm2
	vmulps	%ymm15, %ymm15, %ymm13
	vmovups	__sA9_FMA(%rax), %ymm7
	vmulps	%ymm3, %ymm3, %ymm1

/* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
   a) Calculate X^2 = X * X
   b) Calculate 2 polynomials for sin and cos:
      RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
      RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
	vmovdqa	%ymm2, %ymm0
	vfmadd231ps __sA9_FMA(%rax), %ymm13, %ymm0
	vfmadd213ps %ymm2, %ymm1, %ymm7
	vfmadd213ps __sA5_FMA(%rax), %ymm13, %ymm0
	vfmadd213ps __sA5_FMA(%rax), %ymm1, %ymm7
	vfmadd213ps __sA3(%rax), %ymm13, %ymm0
	vfmadd213ps __sA3(%rax), %ymm1, %ymm7
	vmulps	%ymm13, %ymm0, %ymm13
	vmulps	%ymm1, %ymm7, %ymm1
	vfmadd213ps %ymm15, %ymm15, %ymm13
	vfmadd213ps %ymm3, %ymm3, %ymm1
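
/* The two FMA chains above evaluate the same odd minimax polynomial in
   Horner form on both reduced arguments (%ymm15 for sin, %ymm3 for the
   shifted cos argument), roughly
     R = r + r * r^2 * (A3 + r^2 * (A5 + r^2 * (A7 + r^2 * A9)))
   with the coefficients taken from the __sA* table entries.  */
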
	vmovmskps %ymm4, %ecx
	vxorps	%ymm6, %ymm13, %ymm0
	testl	%ecx, %ecx
	jne	.LBL_1_3
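
/* %ymm4 holds the vcmpnle_uqps result: all-ones in lanes whose |x| exceeds
   __sRangeReductionVal (or is NaN).  vmovmskps collects one bit per lane
   into %ecx; a nonzero mask sends those lanes through the scalar sinf/cosf
   fallback below.  */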

.LBL_1_2:
	cfi_remember_state
	vmovups	352(%rsp), %ymm13
	vmovups	288(%rsp), %ymm15
	vmovups	%ymm0, (%rdi)
	vmovups	%ymm1, (%rsi)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret

.LBL_1_3:
	cfi_restore_state
	vmovups	%ymm5, 256(%rsp)
	vmovups	%ymm0, 320(%rsp)
	vmovups	%ymm1, 384(%rsp)
	je	.LBL_1_2

	xorb	%dl, %dl
	xorl	%eax, %eax
	vmovups	%ymm8, 160(%rsp)
	vmovups	%ymm9, 128(%rsp)
	vmovups	%ymm10, 96(%rsp)
	vmovups	%ymm11, 64(%rsp)
	vmovups	%ymm12, 32(%rsp)
	vmovups	%ymm14, (%rsp)
	movq	%rsi, 192(%rsp)
	movq	%r12, 232(%rsp)
	cfi_offset_rel_rsp (12, 232)
	movb	%dl, %r12b
	movq	%r13, 224(%rsp)
	cfi_offset_rel_rsp (13, 224)
	movl	%eax, %r13d
	movq	%r14, 216(%rsp)
	cfi_offset_rel_rsp (14, 216)
	movl	%ecx, %r14d
	movq	%r15, 208(%rsp)
	cfi_offset_rel_rsp (15, 208)
	movq	%rbx, 200(%rsp)
	movq	%rdi, %rbx
	cfi_remember_state

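/* Scalar fallback: walk the lane mask in %r14d two bits at a time (%r13d
   is the bit index, %r12b the pair counter).  For every set bit the
   corresponding input lane saved at 256(%rsp) is recomputed with scalar
   sinf/cosf and patched into the saved results at 320(%rsp) (sin) and
   384(%rsp) (cos).  */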
.LBL_1_6:
	btl	%r13d, %r14d
	jc	.LBL_1_13

.LBL_1_7:
	lea	1(%r13), %esi
	btl	%esi, %r14d
	jc	.LBL_1_10

.LBL_1_8:
	incb	%r12b
	addl	$2, %r13d
	cmpb	$16, %r12b
	jb	.LBL_1_6

	vmovups	160(%rsp), %ymm8
	movq	%rbx, %rdi
	vmovups	128(%rsp), %ymm9
	vmovups	96(%rsp), %ymm10
	vmovups	64(%rsp), %ymm11
	vmovups	32(%rsp), %ymm12
	vmovups	(%rsp), %ymm14
	vmovups	320(%rsp), %ymm0
	vmovups	384(%rsp), %ymm1
	movq	192(%rsp), %rsi
	movq	232(%rsp), %r12
	cfi_restore (%r12)
	movq	224(%rsp), %r13
	cfi_restore (%r13)
	movq	216(%rsp), %r14
	cfi_restore (%r14)
	movq	208(%rsp), %r15
	cfi_restore (%r15)
	movq	200(%rsp), %rbx
	jmp	.LBL_1_2

.LBL_1_10:
	cfi_restore_state
	movzbl	%r12b, %r15d
	vmovss	260(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 324(%rsp,%r15,8)
	vmovss	260(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 388(%rsp,%r15,8)
	jmp	.LBL_1_8

.LBL_1_13:
	movzbl	%r12b, %r15d
	vmovss	256(%rsp,%r15,8), %xmm0
	vzeroupper

	call	JUMPTARGET(sinf)

	vmovss	%xmm0, 320(%rsp,%r15,8)
	vmovss	256(%rsp,%r15,8), %xmm0

	call	JUMPTARGET(cosf)

	vmovss	%xmm0, 384(%rsp,%r15,8)
	jmp	.LBL_1_7

END (_ZGVdN8vl4l4_sincosf_avx2)
libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2)

/* vvv version implemented with wrapper to vl4l4 variant.  */
ENTRY (_ZGVdN8vvv_sincosf_avx2)
#ifndef __ILP32__
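/* LP64: the 8 sin pointers arrive in %ymm1/%ymm2 and the 8 cos pointers in
   %ymm3/%ymm4 (four 64-bit pointers per register).  The wrapper spills the
   pointer vectors, has the vl4l4 variant write its results into two
   contiguous 32-byte buffers at (%rsp) and 32(%rsp), and then scatters the
   individual floats through the saved pointers.  */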
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-32, %rsp
	subq	$192, %rsp
	vmovdqu	%ymm1, 64(%rsp)
	lea	(%rsp), %rdi
	vmovdqu	%ymm2, 96(%rdi)
	vmovdqu	%ymm3, 128(%rdi)
	vmovdqu	%ymm4, 160(%rdi)
	lea	32(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2)
	movq	64(%rsp), %rdx
	movq	72(%rsp), %rsi
	movq	80(%rsp), %r8
	movq	88(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	96(%rsp), %rax
	movq	104(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	112(%rsp), %rdi
	movq	120(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	128(%rsp), %r11
	movq	136(%rsp), %rdx
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	144(%rsp), %rsi
	movq	152(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	160(%rsp), %r10
	movq	168(%rsp), %rax
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movq	176(%rsp), %rcx
	movq	184(%rsp), %rdi
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
#else
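/* x32: pointers are 32 bits wide, so all 8 sin pointers fit in %ymm1 and
   all 8 cos pointers in %ymm2, packed two per 64-bit lane.  The wrapper
   calls the vl4l4 variant with buffers at -80(%ebp) (sin) and -48(%ebp)
   (cos), then unpacks each 64-bit lane with shrq to reach both pointers.  */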
	leal	8(%rsp), %r10d
	.cfi_def_cfa 10, 0
	andl	$-32, %esp
	pushq	-8(%r10d)
	pushq	%rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	movl	%esp, %ebp
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	leal	-48(%rbp), %esi
	leal	-80(%rbp), %edi
	subl	$136, %esp
	vmovdqa	%ymm1, -112(%ebp)
	vmovdqa	%ymm2, -144(%ebp)
	call	HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2)
	vmovdqa	-112(%ebp), %xmm0
	vmovq	%xmm0, %rax
	vmovss	-80(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-76(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-104(%ebp), %rax
	vmovss	-72(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-68(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-96(%ebp), %rax
	vmovss	-64(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-60(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-88(%ebp), %rax
	vmovss	-56(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-52(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	vmovdqa	-144(%ebp), %xmm0
	vmovq	%xmm0, %rax
	vmovss	-48(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-44(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-136(%ebp), %rax
	vmovss	-40(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-36(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-128(%ebp), %rax
	vmovss	-32(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-28(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	movq	-120(%ebp), %rax
	vmovss	-24(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	vmovss	-20(%ebp), %xmm0
	shrq	$32, %rax
	vmovss	%xmm0, (%eax)
	addl	$136, %esp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leal	-8(%r10), %esp
	.cfi_def_cfa 7, 8
	ret
#endif
END (_ZGVdN8vvv_sincosf_avx2)