sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S

   1 /* Function hypotf vectorized with AVX-512.
   2    Copyright (C) 2021-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *      HIGH LEVEL OVERVIEW
  23  *
  24  *      Calculate z = (x*x+y*y)
  25  *      Calculate reciprocal sqrt (z)
  26  *      Make two NR iterations
  27  *
  28  *      ALGORITHM DETAILS
  29  *
  30  *    Multiprecision branch for _HA_ only
  31  *      Remove sign from both arguments
  32  *      Find maximum (_x) and minimum (_y) (by abs value) between arguments
  33  *      Split _x into _a and _b for multiprecision
  34  *      If _x >> _y we will not split _y for multiprecision
  35  *      all _y will be put into lower part (_d) and higher part (_c = 0)
  36  *      Fixing _hilo_mask for the case _x >> _y
  37  *      Split _y into _c and _d for multiprecision with fixed mask
  38  *
  39  *      compute Hi and Lo parts of _z = _x*_x + _y*_y
  40  *
  41  *      _zHi = _a*_a + _c*_c
  42  *      _zLo = (_x + _a)*_b + _d*_y + _d*_c
  43  *      _z = _zHi + _zLo
  44  *
  45  *    No multiprecision branch for _LA_ and _EP_
  46  *      _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
  47  *
  48  *    Check _z exponent to be within borders [1E3 ; 60A] else goto Callout
  49  *
  50  *    Compute reciprocal sqrt s0 ~ 1.0/sqrt(_z),
  51  *      that multiplied by _z, is final result for _EP_ version.
  52  *
  53  *    First iteration (or zero iteration):
  54  *       s =  z * s0
  55  *       h = .5 * s0
  56  *       d =  s *  h - .5
  57  *
  58  *    Second iteration:
  59  *       h = d * h + h
  60  *       s = s * d + s
  61  *       d = s * s - z (in multiprecision for _HA_)
  62  *
  63  *    result = s - h * d
  64  *
  65  *    EP version of the function can be implemented as y[i]=sqrt(a[i]^2+b[i]^2)
  66  *    with all intermediate operations done in target precision for i=1, .., n.
  67  *    It can return result y[i]=0 in case a[i]^2 and b[i]^2 underflow in target
  68  *    precision (for some i). It can return result y[i]=NAN in case
  69  *    a[i]^2+b[i]^2 overflow in target precision, for some i. It can return
  70  *    result y[i]=NAN in case a[i] or b[i] is infinite, for some i.
  71  *
  72  *
  73  */
  74
  75 /* Byte offsets of the fields of data table __svml_shypot_data_internal;
  76    each field is one 64-byte row holding a 32-bit value replicated 16 times.  */
  77 #define _sAbsMask                       0
  78 #define _sHalf                          64
  79 #define _iExpBound                      128
  80
  81 #include <sysdep.h>
  82
  83         .section .text.evex512, "ax", @progbits
  84 ENTRY(_ZGVeN16vv_hypotf_skx) /* In: zmm0 = x, zmm1 = y (16 floats each).  Out: zmm0 = per-lane hypot result.  */
  85         pushq   %rbp
  86         cfi_def_cfa_offset(16)
  87         movq    %rsp, %rbp
  88         cfi_def_cfa(6, 16)
  89         cfi_offset(6, -16)
  90         andq    $-64, %rsp /* Align the frame to 64 bytes.  */
  91         subq    $256, %rsp /* Frame: 0/8/16 = saved r14/r13/r12, 64 = x, 128 = y, 192 = result (special path only).  */
  92         vgetexpps {sae}, %zmm0, %zmm2 /* zmm2 = exponent of x, as a float.  */
  93         vgetexpps {sae}, %zmm1, %zmm3 /* zmm3 = exponent of y, as a float.  */
  94         vmovups _sHalf+__svml_shypot_data_internal(%rip), %zmm6 /* zmm6 = 0.5 in every lane.  */
  95         vmaxps  {sae}, %zmm3, %zmm2, %zmm4 /* zmm4 = per-lane max of the two exponent values.  */
  96         vmulps  {rn-sae}, %zmm0, %zmm0, %zmm2 /* zmm2 = x*x  */
  97         vandps  _sAbsMask+__svml_shypot_data_internal(%rip), %zmm4, %zmm5 /* zmm5 = zmm4 with the sign bit cleared.  */
  98         vfmadd231ps {rn-sae}, %zmm1, %zmm1, %zmm2 /* zmm2 = z = x*x + y*y  */
  99         vpcmpd  $5, _iExpBound+__svml_shypot_data_internal(%rip), %zmm5, %k0 /* k0 bit set where zmm5 is not less than _iExpBound (predicate 5, signed dword compare).  */
 100         vrsqrt14ps %zmm2, %zmm7 /* zmm7 = s0 ~ 1/sqrt(z)  */
 101         kmovw   %k0, %edx /* edx = 16-bit mask of lanes that need the scalar fallback.  */
 102         vmulps  {rn-sae}, %zmm7, %zmm2, %zmm9 /* zmm9 = s = z * s0  */
 103         vmulps  {rn-sae}, %zmm7, %zmm6, %zmm8 /* zmm8 = h = 0.5 * s0  */
 104         vfnmadd231ps {rn-sae}, %zmm9, %zmm9, %zmm2 /* zmm2 = z - s*s  */
 105         vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm2 /* zmm2 = h * (z - s*s) + s, the vector result.  */
 106
 107         /*
 108          * End of the main-path computation; zmm2 holds the result.
 109          * NOTE(review): this comment used to name VSCALEF, but no scaling instruction is emitted here -- confirm.
 110          */
 111         testl   %edx, %edx
 112
 113         /* Go to special inputs processing branch if any mask bit is set */
 114         jne     L(SPECIAL_VALUES_BRANCH)
 115         # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2
 116
 117         /* Restore registers
 118          * and exit the function
 119          */
 120
 121 L(EXIT):
 122         vmovaps %zmm2, %zmm0 /* Move the result into the return register.  */
 123         movq    %rbp, %rsp /* Drop the aligned frame.  */
 124         popq    %rbp
 125         cfi_def_cfa(7, 8)
 126         cfi_restore(6)
 127         ret
 128         cfi_def_cfa(6, 16) /* CFI state for the code below, which still runs on the rbp-based frame.  */
 129         cfi_offset(6, -16)
 130
 131         /* Branch to process special inputs: spill x, y and the vector
 132          * result, then recompute each flagged lane with scalar hypotf.
 133          */
 134
 135 L(SPECIAL_VALUES_BRANCH):
 136         vmovups %zmm0, 64(%rsp) /* Spill x.  */
 137         vmovups %zmm1, 128(%rsp) /* Spill y.  */
 138         vmovups %zmm2, 192(%rsp) /* Spill the vector result; flagged lanes are overwritten below.  */
 139         # LOE rbx r12 r13 r14 r15 edx zmm2
 140
 141         xorl    %eax, %eax
 142         # LOE rbx r12 r13 r14 r15 eax edx
 143
 144         vzeroupper
 145         movq    %r12, 16(%rsp) /* Save r12; it becomes the lane index.  */
 146         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
 147         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
 148         movl    %eax, %r12d /* r12d = lane index, starting at 0.  */
 149         movq    %r13, 8(%rsp) /* Save r13; it becomes the lane mask.  */
 150         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
 151         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
 152         movl    %edx, %r13d /* r13d = mask of lanes to recompute.  */
 153         movq    %r14, (%rsp) /* Save r14; it is used to index the spilled vectors.  */
 154         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
 155         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
 156         # LOE rbx r15 r12d r13d
 157
 158         /* Test bit r12d of the mask in r13d;
 159          * carry set means this lane needs the scalar call.
 160          */
 161
 162 L(RANGEMASK_CHECK):
 163         btl     %r12d, %r13d
 164
 165         /* Call scalar math function */
 166         jc      L(SCALAR_MATH_CALL)
 167         # LOE rbx r15 r12d r13d
 168
 169         /* Advance to the next lane and
 170          * keep looping while the index is below 16.
 171          */
 172
 173 L(SPECIAL_VALUES_LOOP):
 174         incl    %r12d
 175         cmpl    $16, %r12d
 176
 177         /* Check bits in range mask */
 178         jl      L(RANGEMASK_CHECK)
 179         # LOE rbx r15 r12d r13d
 180
 181         movq    16(%rsp), %r12
 182         cfi_restore(12)
 183         movq    8(%rsp), %r13
 184         cfi_restore(13)
 185         movq    (%rsp), %r14
 186         cfi_restore(14)
 187         vmovups 192(%rsp), %zmm2 /* Reload the result, now including the scalar-computed lanes.  */
 188
 189         /* Go to exit */
 190         jmp     L(EXIT)
 191         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
 192         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
 193         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
 194         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
 195         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
 196         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
 197         # LOE rbx r12 r13 r14 r15 zmm2
 198
 199         /* Scalar math function call
 200          * to process special input: one lane per call
 201          */
 202
 203 L(SCALAR_MATH_CALL):
 204         movl    %r12d, %r14d /* r14 = zero-extended lane index, used for addressing.  */
 205         vmovss  64(%rsp, %r14, 4), %xmm0 /* xmm0 = x[lane]  */
 206         vmovss  128(%rsp, %r14, 4), %xmm1 /* xmm1 = y[lane]  */
 207         call    hypotf@PLT
 208         # LOE rbx r14 r15 r12d r13d xmm0
 209
 210         vmovss  %xmm0, 192(%rsp, %r14, 4) /* result[lane] = scalar hypotf return value.  */
 211
 212         /* Process special inputs in loop */
 213         jmp     L(SPECIAL_VALUES_LOOP)
 214         # LOE rbx r15 r12d r13d
 215 END(_ZGVeN16vv_hypotf_skx)
 216
 217         .section .rodata, "a"
 218         .align  64
 219         /* Constant table: three 64-byte rows, each a 32-bit value replicated 16 times.  The C-style layout below is only seen when __svml_shypot_data_internal_typedef is defined.  */
 220 #ifdef __svml_shypot_data_internal_typedef
 221 typedef unsigned int VUINT32;
 222 typedef struct {
 223         __declspec(align(64)) VUINT32 _sAbsMask[16][1];
 224         __declspec(align(64)) VUINT32 _sHalf[16][1];
 225         __declspec(align(64)) VUINT32 _iExpBound[16][1];
 226 } __svml_shypot_data_internal;
 227 #endif
 228 __svml_shypot_data_internal:
 229         .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _sAbsMask: every bit except the sign bit; ANDed with the max exponent value */
 230         .align  64
 231         .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sHalf: 0.5f, multiplied with the reciprocal-sqrt estimate */
 232         /* fma based algorithm*/
 233         .align  64
 234         .long   0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000, 0x427C0000 /* _iExpBound: bit pattern of 63.0f; lanes whose masked exponent value is not below it go to the scalar path */
 235         .align  64
 236         .type   __svml_shypot_data_internal, @object
 237         .size   __svml_shypot_data_internal, .-__svml_shypot_data_internal