/* Function atan2 vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *      For 0.0       <= x <= 7.0/16.0:  atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For 7.0/16.0  <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <= inf      : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */
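
/* NOTE (illustrative, not part of the original sources): the vector code
   below reduces atan2(y, x) to one atan evaluation per lane.  A rough scalar
   C sketch of that reduction, with hypothetical names and with atan()
   standing in for the dA19..dA01 polynomial stored in the table at the end
   of this file (zero/NaN inputs go to the special-value paths instead),
   could look like:

     #include <math.h>

     static double
     atan2_reduction_sketch (double y, double x)
     {
       double ay = fabs (y), ax = fabs (x);
       double r, base;
       if (ay < ax)
         {
           r = ay / ax;                // 1) |y| <  |x|: base angle 0
           base = 0.0;
         }
       else
         {
           r = -ax / ay;               // 2) |y| >= |x|: base angle pi/2
           base = M_PI_2;
         }
       double res = base + atan (r);   // kernel: r + r*r^2*Poly(r^2)
       if (x < 0.0)
         res = M_PI - res;             // quadrant correction for x < 0
       return copysign (res, y);       // restore the sign of y
     }
 */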

/* Offsets for data table __svml_datan2_data_internal.  */
#define dPI			0
#define dPIO2			16
#define dA19			32
#define dA18			48
#define dA17			64
#define dA16			80
#define dA15			96
#define dA14			112
#define dA13			128
#define dA12			144
#define dA11			160
#define dA10			176
#define dA09			192
#define dA08			208
#define dA07			224
#define dA06			240
#define dA05			256
#define dA04			272
#define dA03			288
#define dA02			304
#define dA01			320
#define dA00			336
#define dSIGN_MASK		352
#define iCHK_WORK_SUB		368
#define iCHK_WORK_CMP		384
#define dABS_MASK		400
#define dZERO			416

#include <sysdep.h>

	.text
	.section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN2vv_atan2_sse4)
	subq	$88, %rsp
	cfi_def_cfa_offset(96)
	movaps	%xmm0, %xmm8

/*
 * #define NO_VECTOR_ZERO_ATAN2_ARGS
 *  Declarations
 *  Variables
 *  Constants
 *  The end of declarations
 *  Implementation
 *  Get r0~=1/B
 *  Cannot be replaced by VQRCP(D, dR0, dB);
 *  Argument Absolute values
 */
	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
	movaps	%xmm1, %xmm9
	movaps	%xmm4, %xmm1
	andps	%xmm8, %xmm4
	andps	%xmm9, %xmm1
	movaps	%xmm4, %xmm2
	cmpnltpd %xmm1, %xmm2

/* Argument signs */
	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
	movaps	%xmm2, %xmm0
	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm5
	movaps	%xmm3, %xmm7
	movaps	%xmm3, %xmm6

/*
 * 1) If y<x then a=y, b=x, PIO2=0
 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
 */
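/* (A hedged reading of the mask blends below: after cmpnltpd, %xmm2 holds the
   |y| >= |x| mask, so the and/andn/or sequence computes
       a    = (|y| >= |x|) ? -|x| : |y|
       b    = (|y| >= |x|) ?  |y| : |x|
       PIO2 = (|y| >= |x|) ?  Pi/2 : 0
   and the divpd below forms the reduced argument r = a / b.)  */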
	orps	%xmm1, %xmm3
	movaps	%xmm2, %xmm10
	andps	%xmm2, %xmm5
	andnps	%xmm4, %xmm0
	andps	%xmm2, %xmm3
	andnps	%xmm1, %xmm10
	andps	%xmm4, %xmm2
	orps	%xmm3, %xmm0
	orps	%xmm2, %xmm10
	divpd	%xmm10, %xmm0
	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11

/* if x<0, dPI = Pi, else dPI = 0 */
	movaps	%xmm9, %xmm3

/* Check if y and x are on main path. */
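/* (A hedged note on the check: the high 32 bits of |y| and |x| are biased by
   subtracting iCHK_WORK_SUB and compared against iCHK_WORK_CMP, so lanes with
   very small (zero/subnormal), very large, Inf or NaN inputs raise the
   callout mask and are reprocessed on the special-values path.)  */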
	pshufd	$221, %xmm1, %xmm12
	andps	%xmm9, %xmm7
	psubd	%xmm11, %xmm12
	andps	%xmm8, %xmm6
	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
	xorl	%edx, %edx
	movups	%xmm4, 16(%rsp)
	xorl	%eax, %eax
	pshufd	$221, %xmm4, %xmm14
	movdqa	%xmm12, %xmm4
	pcmpgtd	%xmm13, %xmm4
	pcmpeqd	%xmm13, %xmm12
	por	%xmm12, %xmm4

/* Polynomial. */
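/* (The polynomial in r^2 with coefficients dA19..dA01 is evaluated as four
   interleaved Horner strands stepping by r^8: r^2 is built in %xmm12, r^4 in
   %xmm14 and r^8 in %xmm15; the strands are recombined before the
   reconstruction step, and dA00 = 1.0 is folded into the final R + R*dP.)  */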
	movaps	%xmm0, %xmm12
	mulpd	%xmm0, %xmm12
	cmplepd	dZERO+__svml_datan2_data_internal(%rip), %xmm3
	psubd	%xmm11, %xmm14
	movdqa	%xmm14, %xmm15
	pcmpeqd	%xmm13, %xmm14
	pcmpgtd	%xmm13, %xmm15
	por	%xmm14, %xmm15
	movaps	%xmm12, %xmm14
	mulpd	%xmm12, %xmm14
	por	%xmm15, %xmm4
	movaps	%xmm14, %xmm15
	mulpd	%xmm14, %xmm15
	movmskps %xmm4, %ecx
	movups	%xmm10, (%rsp)
	movups	dA19+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm10
	movups	dA18+__svml_datan2_data_internal(%rip), %xmm13
	movups	dA17+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm15, %xmm10
	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm11
	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm10
	mulpd	%xmm15, %xmm13
	mulpd	%xmm15, %xmm11
	mulpd	%xmm12, %xmm10
	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm13
	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm11
	addpd	%xmm10, %xmm13
	mulpd	%xmm11, %xmm12
	mulpd	%xmm13, %xmm14
	movups	dA16+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm2
	mulpd	%xmm15, %xmm2
	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm2

/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
	mulpd	%xmm2, %xmm15
	addpd	%xmm12, %xmm15
	addpd	%xmm14, %xmm15

/*
 * Reconstruction.
 * dP=(R+R*dP) + dPIO2
 */
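/* (A hedged summary of the rest of the reconstruction: the sign of x is ORed
   into the nonnegative partial result, Pi is added for negative x via the
   dPI term selected in %xmm3, and the sign of y is ORed in last, i.e.
   res = copysign ((x < 0) ? Pi - v : v, y) with v the value built above;
   zero/NaN lanes are fixed up on the special paths.)  */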
	mulpd	%xmm0, %xmm15
	addpd	%xmm15, %xmm0
	addpd	%xmm5, %xmm0
	andps	__svml_datan2_data_internal(%rip), %xmm3
	orps	%xmm7, %xmm0
	addpd	%xmm3, %xmm0

/* Special branch for fast (vector) processing of zero arguments */
	movups	16(%rsp), %xmm11
	orps	%xmm6, %xmm0
	testb	$3, %cl

/* Go to auxiliary branch */
	jne	L(AUX_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11

/* Return from auxiliary branch
 * for out of main path inputs
 */

L(AUX_BRANCH_RETURN):
/*
 * Special branch for fast (vector) processing of zero arguments
 * The end of implementation
 */
	testl	%edx, %edx

/* Go to special inputs processing branch */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9

/* Restore registers
 * and exit the function
 */

L(EXIT):
	addq	$88, %rsp
	cfi_def_cfa_offset(8)
	ret
	cfi_def_cfa_offset(96)

/* Branch to process
 * special inputs
 */

L(SPECIAL_VALUES_BRANCH):
	movups	%xmm8, 32(%rsp)
	movups	%xmm9, 48(%rsp)
	movups	%xmm0, 64(%rsp)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0

	movq	%r12, 16(%rsp)
	cfi_offset(12, -80)
	movl	%eax, %r12d
	movq	%r13, 8(%rsp)
	cfi_offset(13, -88)
	movl	%edx, %r13d
	movq	%r14, (%rsp)
	cfi_offset(14, -96)
	# LOE rbx rbp r15 r12d r13d

/* Range mask
 * bits check
 */
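/* (Each set bit in the range mask (%r13d) marks a lane whose original y and
   x are reloaded from the stack slots saved above and handed to the scalar
   atan2; the loop below walks both lanes of the vector.)  */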

L(RANGEMASK_CHECK):
	btl	%r12d, %r13d

/* Call scalar math function */
	jc	L(SCALAR_MATH_CALL)
	# LOE rbx rbp r15 r12d r13d

/* Special inputs
 * processing loop
 */

L(SPECIAL_VALUES_LOOP):
	incl	%r12d
	cmpl	$2, %r12d

/* Check bits in range mask */
	jl	L(RANGEMASK_CHECK)
	# LOE rbx rbp r15 r12d r13d

	movq	16(%rsp), %r12
	cfi_restore(12)
	movq	8(%rsp), %r13
	cfi_restore(13)
	movq	(%rsp), %r14
	cfi_restore(14)
	movups	64(%rsp), %xmm0

/* Go to exit */
	jmp	L(EXIT)
	cfi_offset(12, -80)
	cfi_offset(13, -88)
	cfi_offset(14, -96)
	# LOE rbx rbp r12 r13 r14 r15 xmm0

/* Scalar math function call
 * to process special input
 */

L(SCALAR_MATH_CALL):
	movl	%r12d, %r14d
	movsd	32(%rsp, %r14, 8), %xmm0
	movsd	48(%rsp, %r14, 8), %xmm1
	call	atan2@PLT
	# LOE rbx rbp r14 r15 r12d r13d xmm0

	movsd	%xmm0, 64(%rsp, %r14, 8)

/* Process special inputs in loop */
	jmp	L(SPECIAL_VALUES_LOOP)
	cfi_restore(12)
	cfi_restore(13)
	cfi_restore(14)
	# LOE rbx rbp r15 r12d r13d

/* Auxiliary branch
 * for out of main path inputs
 */

L(AUX_BRANCH):
/* Check if at least one of X or Y is zero: iAXAYZERO */
	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm2

/* Check if both X & Y are not NaNs: iXYnotNAN */
	movaps	%xmm9, %xmm12
	movaps	%xmm8, %xmm10
	cmpordpd %xmm9, %xmm12
	cmpordpd %xmm8, %xmm10
	cmpeqpd	%xmm2, %xmm1
	cmpeqpd	%xmm2, %xmm11
	andps	%xmm10, %xmm12
	orps	%xmm11, %xmm1
	pshufd	$221, %xmm1, %xmm1
	pshufd	$221, %xmm12, %xmm11

/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
	pand	%xmm11, %xmm1

/* Exclude from previous callout mask zero (and not NaN) arguments */
	movdqa	%xmm1, %xmm13
	pandn	%xmm4, %xmm13

/*
 * Path for zero arguments (at least one of both)
 * Check if both args are zeros (den. is zero)
 */
	movups	(%rsp), %xmm4
	cmpeqpd	%xmm2, %xmm4

/* Go to callout */
	movmskps %xmm13, %edx

/* Set sPIO2 to zero if den. is zero */
	movaps	%xmm4, %xmm15
	andps	%xmm2, %xmm4
	andnps	%xmm5, %xmm15
	andl	$3, %edx
	orps	%xmm4, %xmm15
	pshufd	$221, %xmm9, %xmm5
	orps	%xmm7, %xmm15

/* Res = sign(Y) * ((X<0) ? (PIO2+PI) : PIO2) */
	pshufd	$221, %xmm2, %xmm7
	pcmpgtd	%xmm5, %xmm7
	pshufd	$80, %xmm7, %xmm14
	andps	%xmm3, %xmm14
	addpd	%xmm14, %xmm15

/* Merge results from main and spec path */
	pshufd	$80, %xmm1, %xmm3
	orps	%xmm6, %xmm15
	movdqa	%xmm3, %xmm6
	andps	%xmm3, %xmm15
	andnps	%xmm0, %xmm6
	movaps	%xmm6, %xmm0
	orps	%xmm15, %xmm0

/* Return to main vector processing path */
	jmp	L(AUX_BRANCH_RETURN)
	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
END(_ZGVbN2vv_atan2_sse4)

	.section .rodata, "a"
	.align	16

#ifdef __svml_datan2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 dPI[2][2];
	__declspec(align(16)) VUINT32 dPIO2[2][2];
	__declspec(align(16)) VUINT32 dA19[2][2];
	__declspec(align(16)) VUINT32 dA18[2][2];
	__declspec(align(16)) VUINT32 dA17[2][2];
	__declspec(align(16)) VUINT32 dA16[2][2];
	__declspec(align(16)) VUINT32 dA15[2][2];
	__declspec(align(16)) VUINT32 dA14[2][2];
	__declspec(align(16)) VUINT32 dA13[2][2];
	__declspec(align(16)) VUINT32 dA12[2][2];
	__declspec(align(16)) VUINT32 dA11[2][2];
	__declspec(align(16)) VUINT32 dA10[2][2];
	__declspec(align(16)) VUINT32 dA09[2][2];
	__declspec(align(16)) VUINT32 dA08[2][2];
	__declspec(align(16)) VUINT32 dA07[2][2];
	__declspec(align(16)) VUINT32 dA06[2][2];
	__declspec(align(16)) VUINT32 dA05[2][2];
	__declspec(align(16)) VUINT32 dA04[2][2];
	__declspec(align(16)) VUINT32 dA03[2][2];
	__declspec(align(16)) VUINT32 dA02[2][2];
	__declspec(align(16)) VUINT32 dA01[2][2];
	__declspec(align(16)) VUINT32 dA00[2][2];
	__declspec(align(16)) VUINT32 dSIGN_MASK[2][2];
	__declspec(align(16)) VUINT32 iCHK_WORK_SUB[4][1];
	__declspec(align(16)) VUINT32 iCHK_WORK_CMP[4][1];
	__declspec(align(16)) VUINT32 dABS_MASK[2][2];
	__declspec(align(16)) VUINT32 dZERO[2][2];
} __svml_datan2_data_internal;
#endif
__svml_datan2_data_internal:
	.quad	0x400921FB54442D18, 0x400921FB54442D18 // dPI
	.align	16
	.quad	0x3FF921FB54442D18, 0x3FF921FB54442D18 // dPIO2
	.align	16
	.quad	0xBEF4FDB537ABC7A3, 0xBEF4FDB537ABC7A3 // dA19
	.align	16
	.quad	0x3F2CED0A36665209, 0x3F2CED0A36665209 // dA18
	.align	16
	.quad	0xBF52E67C93954C23, 0xBF52E67C93954C23 // dA17
	.align	16
	.quad	0x3F6F5A1DAE82AFB3, 0x3F6F5A1DAE82AFB3 // dA16
	.align	16
	.quad	0xBF82B2EC618E4BAD, 0xBF82B2EC618E4BAD // dA15
	.align	16
	.quad	0x3F914F4C661116A5, 0x3F914F4C661116A5 // dA14
	.align	16
	.quad	0xBF9A5E83B081F69C, 0xBF9A5E83B081F69C // dA13
	.align	16
	.quad	0x3FA169980CB6AD4F, 0x3FA169980CB6AD4F // dA12
	.align	16
	.quad	0xBFA4EFA2E563C1BC, 0xBFA4EFA2E563C1BC // dA11
	.align	16
	.quad	0x3FA7EC0FBC50683B, 0x3FA7EC0FBC50683B // dA10
	.align	16
	.quad	0xBFAAD261EAA09954, 0xBFAAD261EAA09954 // dA09
	.align	16
	.quad	0x3FAE1749BD612DCF, 0x3FAE1749BD612DCF // dA08
	.align	16
	.quad	0xBFB11084009435E0, 0xBFB11084009435E0 // dA07
	.align	16
	.quad	0x3FB3B12A49295651, 0x3FB3B12A49295651 // dA06
	.align	16
	.quad	0xBFB745D009BADA94, 0xBFB745D009BADA94 // dA05
	.align	16
	.quad	0x3FBC71C707F7D5B5, 0x3FBC71C707F7D5B5 // dA04
	.align	16
	.quad	0xBFC2492491EE55C7, 0xBFC2492491EE55C7 // dA03
	.align	16
	.quad	0x3FC999999997EE34, 0x3FC999999997EE34 // dA02
	.align	16
	.quad	0xBFD55555555553C5, 0xBFD55555555553C5 // dA01
	.align	16
	.quad	0x3FF0000000000000, 0x3FF0000000000000 // dA00
	.align	16
	.quad	0x8000000000000000, 0x8000000000000000 // dSIGN_MASK
	.align	16
	.long	0x80300000, 0x80300000, 0x80300000, 0x80300000 // iCHK_WORK_SUB
	.align	16
	.long	0xfdd00000, 0xfdd00000, 0xfdd00000, 0xfdd00000 // iCHK_WORK_CMP
	.align	16
	.quad	0x7fffffffffffffff, 0x7fffffffffffffff // dABS_MASK
	.align	16
	.quad	0x0000000000000000, 0x0000000000000000 // dZERO
	.align	16
	.type	__svml_datan2_data_internal, @object
	.size	__svml_datan2_data_internal, .-__svml_datan2_data_internal