sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S

   1 /* Function hypot vectorized with AVX2.
   2    Copyright (C) 2021-2022 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *      HIGH LEVEL OVERVIEW
  23  *
  24  *      Calculate z = (x*x+y*y)
  25  *      Calculate reciplicle sqrt (z)
  26  *      Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
  27  *      Calculate fixing part p with polynom
  28  *      Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
  29  *
  30  *      ALGORITHM DETAILS
  31  *
  32  *    Multiprecision branch for _HA_ only
  33  *      Remove sigm from both arguments
  34  *      Find maximum (_x) and minimum (_y) (by abs value) between arguments
  35  *      Split _x int _a and _b for multiprecision
  36  *      If _x >> _y we will we will not split _y for multiprecision
  37  *      all _y will be put into lower part (_d) and higher part (_c = 0)
  38  *      Fixing _hilo_mask for the case _x >> _y
  39  *      Split _y into _c and _d for multiprecision with fixed mask
  40  *
  41  *      compute Hi and Lo parts of _z = _x*_x + _y*_y
  42  *
  43  *      _zHi = _a*_a + _c*_c
  44  *      _zLo = (_x + _a)*_b + _d*_y + _d*_c
  45  *      _z = _zHi + _zLo
  46  *
  47  *    No multiprecision branch for _LA_ and _EP_
  48  *      _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
  49  *
  50  *    Check _z exponent to be withing borders [3BC ; 441] else goto Callout
  51  *
  52  *    _s  ~ 1.0/sqrt(_z)
  53  *    _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
  54  *    _e[rror]  =  (1.0/_z + O) * _z - 1.0
  55  *    calculate fixing part _p
  56  *    _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
  57  *    some parts of polynom are skipped for lower flav
  58  *
  59  *    result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
  60  *
  61  *
  62  */
  63
  64 /* Offsets for data table __svml_dhypot_data_internal
  65  */
  66 #define _dHiLoMask                      0
  67 #define _dAbsMask                       32
  68 #define _dOne                           64
  69 #define _POLY_C5                        96
  70 #define _POLY_C4                        128
  71 #define _POLY_C3                        160
  72 #define _POLY_C2                        192
  73 #define _POLY_C1                        224
  74 #define _LowBoundary                    256
  75 #define _HighBoundary                   288
  76
  77 #include <sysdep.h>
  78
  79         .section .text.avx2, "ax", @progbits
  80 ENTRY(_ZGVdN4vv_hypot_avx2)
  81         pushq   %rbp
  82         cfi_def_cfa_offset(16)
  83         movq    %rsp, %rbp
  84         cfi_def_cfa(6, 16)
  85         cfi_offset(6, -16)
  86         andq    $-32, %rsp
  87         subq    $128, %rsp
  88         vmovapd %ymm1, %ymm2
  89         vmovapd %ymm0, %ymm1
  90
  91         /*
  92          *  Defines
  93          *  Implementation
  94          * Multiprecision branch for _HA_ only
  95          * _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
  96          */
  97         vmulpd  %ymm1, %ymm1, %ymm0
  98
  99         /*
 100          * calculate fixing part _p
 101          * _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
 102          * some parts of polynom are skipped for lower flav
 103          */
 104         vmovupd _POLY_C4+__svml_dhypot_data_internal(%rip), %ymm15
 105         vmovups _LowBoundary+__svml_dhypot_data_internal(%rip), %xmm4
 106         vfmadd231pd %ymm2, %ymm2, %ymm0
 107
 108         /*
 109          * _s  ~ 1.0/sqrt(_z)
 110          * _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z
 111          */
 112         vcvtpd2ps %ymm0, %xmm12
 113
 114         /* Check _z exponent to be withing borders [3BC ; 441] else goto Callout */
 115         vextractf128 $1, %ymm0, %xmm3
 116         vrsqrtps %xmm12, %xmm13
 117         vshufps $221, %xmm3, %xmm0, %xmm5
 118         vcvtps2pd %xmm13, %ymm3
 119         vpcmpgtd %xmm5, %xmm4, %xmm6
 120         vpcmpgtd _HighBoundary+__svml_dhypot_data_internal(%rip), %xmm5, %xmm7
 121         vpor    %xmm7, %xmm6, %xmm9
 122         vpshufd $80, %xmm9, %xmm8
 123         vmulpd  %ymm3, %ymm3, %ymm14
 124         vpshufd $250, %xmm9, %xmm10
 125
 126         /* _e[rror]  ~  (1.0/_z + O) * _z - 1.0 */
 127         vfmsub213pd _dOne+__svml_dhypot_data_internal(%rip), %ymm0, %ymm14
 128         vfmadd213pd _POLY_C3+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
 129         vfmadd213pd _POLY_C2+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
 130         vfmadd213pd _POLY_C1+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
 131
 132         /* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z */
 133         vmulpd  %ymm15, %ymm14, %ymm14
 134         vmulpd  %ymm14, %ymm3, %ymm15
 135         vmulpd  %ymm15, %ymm0, %ymm4
 136         vfmadd213pd %ymm4, %ymm3, %ymm0
 137         vinsertf128 $1, %xmm10, %ymm8, %ymm11
 138         vmovmskpd %ymm11, %edx
 139
 140         /*  The end of implementation  */
 141         testl   %edx, %edx
 142
 143         /* Go to special inputs processing branch */
 144         jne     L(SPECIAL_VALUES_BRANCH)
 145         # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2
 146
 147         /* Restore registers
 148          * and exit the function
 149          */
 150
 151 L(EXIT):
 152         movq    %rbp, %rsp
 153         popq    %rbp
 154         cfi_def_cfa(7, 8)
 155         cfi_restore(6)
 156         ret
 157         cfi_def_cfa(6, 16)
 158         cfi_offset(6, -16)
 159
 160         /* Branch to process
 161          * special inputs
 162          */
 163
 164 L(SPECIAL_VALUES_BRANCH):
 165         vmovupd %ymm1, 32(%rsp)
 166         vmovupd %ymm2, 64(%rsp)
 167         vmovupd %ymm0, 96(%rsp)
 168         # LOE rbx r12 r13 r14 r15 edx ymm0
 169
 170         xorl    %eax, %eax
 171         # LOE rbx r12 r13 r14 r15 eax edx
 172
 173         vzeroupper
 174         movq    %r12, 16(%rsp)
 175         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
 176         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
 177         movl    %eax, %r12d
 178         movq    %r13, 8(%rsp)
 179         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
 180         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
 181         movl    %edx, %r13d
 182         movq    %r14, (%rsp)
 183         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
 184         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
 185         # LOE rbx r15 r12d r13d
 186
 187         /* Range mask
 188          * bits check
 189          */
 190
 191 L(RANGEMASK_CHECK):
 192         btl     %r12d, %r13d
 193
 194         /* Call scalar math function */
 195         jc      L(SCALAR_MATH_CALL)
 196         # LOE rbx r15 r12d r13d
 197
 198         /* Special inputs
 199          * processing loop
 200          */
 201
 202 L(SPECIAL_VALUES_LOOP):
 203         incl    %r12d
 204         cmpl    $4, %r12d
 205
 206         /* Check bits in range mask */
 207         jl      L(RANGEMASK_CHECK)
 208         # LOE rbx r15 r12d r13d
 209
 210         movq    16(%rsp), %r12
 211         cfi_restore(12)
 212         movq    8(%rsp), %r13
 213         cfi_restore(13)
 214         movq    (%rsp), %r14
 215         cfi_restore(14)
 216         vmovupd 96(%rsp), %ymm0
 217
 218         /* Go to exit */
 219         jmp     L(EXIT)
 220         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus)  */
 221         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xff, 0xff, 0xff, 0x22
 222         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
 223         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
 224         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
 225         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
 226         # LOE rbx r12 r13 r14 r15 ymm0
 227
 228         /* Scalar math fucntion call
 229          * to process special input
 230          */
 231
 232 L(SCALAR_MATH_CALL):
 233         movl    %r12d, %r14d
 234         movsd   32(%rsp, %r14, 8), %xmm0
 235         movsd   64(%rsp, %r14, 8), %xmm1
 236         call    hypot@PLT
 237         # LOE rbx r14 r15 r12d r13d xmm0
 238
 239         movsd   %xmm0, 96(%rsp, %r14, 8)
 240
 241         /* Process special inputs in loop */
 242         jmp     L(SPECIAL_VALUES_LOOP)
 243         # LOE rbx r15 r12d r13d
 244 END(_ZGVdN4vv_hypot_avx2)
 245
 246         .section .rodata, "a"
 247         .align  32
 248
 249 #ifdef __svml_dhypot_data_internal_typedef
 250 typedef unsigned int VUINT32;
 251 typedef struct {
 252         __declspec(align(32)) VUINT32 _dHiLoMask[4][2];
 253         __declspec(align(32)) VUINT32 _dAbsMask[4][2];
 254         __declspec(align(32)) VUINT32 _dOne[4][2];
 255         __declspec(align(32)) VUINT32 _POLY_C5[4][2];
 256         __declspec(align(32)) VUINT32 _POLY_C4[4][2];
 257         __declspec(align(32)) VUINT32 _POLY_C3[4][2];
 258         __declspec(align(32)) VUINT32 _POLY_C2[4][2];
 259         __declspec(align(32)) VUINT32 _POLY_C1[4][2];
 260         __declspec(align(32)) VUINT32 _LowBoundary[8][1];
 261         __declspec(align(32)) VUINT32 _HighBoundary[8][1];
 262 } __svml_dhypot_data_internal;
 263 #endif
 264 __svml_dhypot_data_internal:
 265         /* legacy algorithm */
 266         .quad   0xffffc00000000000, 0xffffc00000000000, 0xffffc00000000000, 0xffffc00000000000 /* _dHiLoMask */
 267         .align  32
 268         .quad   0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dAbsMask */
 269         .align  32
 270         .quad   0x3FF0000000000000, 0x3FF0000000000000, 0x3FF0000000000000, 0x3FF0000000000000 /* _dOne */
 271         .align  32
 272         .quad   0xBFCF800000000000, 0xBFCF800000000000, 0xBFCF800000000000, 0xBFCF800000000000 /* _POLY_C5 */
 273         .align  32
 274         .quad   0x3FD1800000000000, 0x3FD1800000000000, 0x3FD1800000000000, 0x3FD1800000000000 /* _POLY_C4 */
 275         .align  32
 276         .quad   0xBFD4000000000000, 0xBFD4000000000000, 0xBFD4000000000000, 0xBFD4000000000000 /* _POLY_C3 */
 277         .align  32
 278         .quad   0x3FD8000000000000, 0x3FD8000000000000, 0x3FD8000000000000, 0x3FD8000000000000 /* _POLY_C2 */
 279         .align  32
 280         .quad   0xBFE0000000000000, 0xBFE0000000000000, 0xBFE0000000000000, 0xBFE0000000000000 /* _POLY_C1 */
 281         .align  32
 282         .long   0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000, 0x3BC00000 /* _LowBoundary */
 283         .align  32
 284         .long   0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000, 0x44100000 /* _HighBoundary */
 285         .align  32
 286         .type   __svml_dhypot_data_internal, @object
 287         .size   __svml_dhypot_data_internal, .-__svml_dhypot_data_internal