/* Function log2f vectorized with SSE4.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    Get short reciprocal approximation Rcp ~ 1/mantissa(x)
 *    R = Rcp*x - 1.0
 *    log2(x) = k - log2(Rcp) + poly_approximation(R)
 *       log2(Rcp) is tabulated
 *
 */
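
/* For reference only (this block is a comment, not part of the build): a
   scalar C sketch, under the assumption that it mirrors the reduction and
   polynomial evaluation performed by the vector code below on in-range
   inputs (out-of-range lanes fall back to scalar log2f).  Function and
   helper names are hypothetical; the vector code evaluates the same
   polynomial in a different (Estrin-like) order.

     #include <stdint.h>
     #include <string.h>

     static inline float
     asfloat (uint32_t u)
     {
       float f;
       memcpy (&f, &u, sizeof (f));
       return f;
     }

     static float
     log2f_sketch (float x)
     {
       uint32_t ix;
       memcpy (&ix, &x, sizeof (ix));
       // iBrkValue = 0x3f2aaaab (~2/3): write x = 2^k * m with m in [2/3, 4/3).
       uint32_t off = ix - 0x3f2aaaab;
       int32_t k = (int32_t) off >> 23;
       float m = asfloat ((off & 0x007fffff) + 0x3f2aaaab);
       float r = m - 1.0f;                        // r in [-1/3, 1/3)
       // Degree-9 polynomial from sPoly below; coeff1 is log2(e), so
       // r * P(r) approximates log2(1 + r).
       float p = asfloat (0x3e554012);            // coeff9
       p = p * r + asfloat (0xbe638E14);          // coeff8
       p = p * r + asfloat (0x3e4D660B);          // coeff7
       p = p * r + asfloat (0xbe727824);          // coeff6
       p = p * r + asfloat (0x3e93DD07);          // coeff5
       p = p * r + asfloat (0xbeB8B969);          // coeff4
       p = p * r + asfloat (0x3eF637C0);          // coeff3
       p = p * r + asfloat (0xbf38AA2B);          // coeff2
       p = p * r + asfloat (0x3fB8AA3B);          // coeff1 = log2(e)
       return (float) k + r * p;
     }
 */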

/* Offsets for data table __svml_slog2_data_internal
 */
#define MinNorm                         0
#define MaxNorm                         16
#define iBrkValue                       32
#define iOffExpoMask                    48
#define One                             64
#define sPoly                           80

#include <sysdep.h>

        .text
        .section .text.sse4,"ax",@progbits
ENTRY(_ZGVbN4v_log2f_sse4)
        subq    $72, %rsp
        cfi_def_cfa_offset(80)
        movaps  %xmm0, %xmm1

        /* reduction: compute r,n */
        movdqu  iBrkValue+__svml_slog2_data_internal(%rip), %xmm2
        movaps  %xmm0, %xmm4
        movdqu  iOffExpoMask+__svml_slog2_data_internal(%rip), %xmm10
        psubd   %xmm2, %xmm1
        pand    %xmm1, %xmm10
        movaps  %xmm0, %xmm3
        paddd   %xmm2, %xmm10
        psrad   $23, %xmm1
        movups  sPoly+__svml_slog2_data_internal(%rip), %xmm5
        movups  sPoly+32+__svml_slog2_data_internal(%rip), %xmm6
        movups  sPoly+64+__svml_slog2_data_internal(%rip), %xmm7
        movups  sPoly+96+__svml_slog2_data_internal(%rip), %xmm9
        cmpltps MinNorm+__svml_slog2_data_internal(%rip), %xmm4
        cmpnleps MaxNorm+__svml_slog2_data_internal(%rip), %xmm3
        cvtdq2ps %xmm1, %xmm1
        subps   One+__svml_slog2_data_internal(%rip), %xmm10
        mulps   %xmm10, %xmm5
        movaps  %xmm10, %xmm8
        mulps   %xmm10, %xmm6
        mulps   %xmm10, %xmm8
        addps   sPoly+16+__svml_slog2_data_internal(%rip), %xmm5
        mulps   %xmm10, %xmm7
        addps   sPoly+48+__svml_slog2_data_internal(%rip), %xmm6
        mulps   %xmm10, %xmm9
        mulps   %xmm8, %xmm5
        addps   sPoly+80+__svml_slog2_data_internal(%rip), %xmm7
        addps   sPoly+112+__svml_slog2_data_internal(%rip), %xmm9
        addps   %xmm5, %xmm6
        mulps   %xmm8, %xmm6
        orps    %xmm3, %xmm4

        /* combine and get argument value range mask */
        movmskps %xmm4, %edx
        addps   %xmm6, %xmm7
        mulps   %xmm7, %xmm8
        addps   %xmm8, %xmm9
        mulps   %xmm10, %xmm9
        addps   sPoly+128+__svml_slog2_data_internal(%rip), %xmm9
        mulps   %xmm9, %xmm10
        addps   %xmm10, %xmm1
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movaps  %xmm1, %xmm0
        addq    $72, %rsp
        cfi_def_cfa_offset(8)
        ret
        cfi_def_cfa_offset(80)

        /* Branch to process
         * special inputs
         */

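/* Illustration only (comment, not assembled): under the assumption that the
   mask in %edx carries one bit per lane, the special-value path below behaves
   like this hypothetical C loop, with xmm0 spilled to 32(%rsp) as the input
   lanes and xmm1 to 48(%rsp) as the result lanes:

     for (int i = 0; i < 4; i++)
       if (mask & (1 << i))          // lane flagged by the range check
         result[i] = log2f (input[i]);
 */
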
L(SPECIAL_VALUES_BRANCH):
        movups  %xmm0, 32(%rsp)
        movups  %xmm1, 48(%rsp)
        # LOE rbx rbp r12 r13 r14 r15 edx

        xorl    %eax, %eax
        movq    %r12, 16(%rsp)
        cfi_offset(12, -64)
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        cfi_offset(13, -72)
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        cfi_offset(14, -80)
        # LOE rbx rbp r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx rbp r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $4, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx rbp r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        movups  48(%rsp), %xmm1

        /* Go to exit */
        jmp     L(EXIT)
        cfi_offset(12, -64)
        cfi_offset(13, -72)
        cfi_offset(14, -80)
        # LOE rbx rbp r12 r13 r14 r15 xmm1

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        movss   32(%rsp,%r14,4), %xmm0
        call    log2f@PLT
        # LOE rbx rbp r14 r15 r12d r13d xmm0

        movss   %xmm0, 48(%rsp,%r14,4)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx rbp r15 r12d r13d
END(_ZGVbN4v_log2f_sse4)

        .section .rodata, "a"
        .align 16

#ifdef __svml_slog2_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(16)) VUINT32 MinNorm[4][1];
        __declspec(align(16)) VUINT32 MaxNorm[4][1];
        __declspec(align(16)) VUINT32 iBrkValue[4][1];
        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
        __declspec(align(16)) VUINT32 One[4][1];
        __declspec(align(16)) VUINT32 sPoly[9][4][1];
} __svml_slog2_data_internal;
#endif
__svml_slog2_data_internal:
        /*== MinNorm ==*/
        .long 0x00800000, 0x00800000, 0x00800000, 0x00800000
        /*== MaxNorm ==*/
        .align 16
        .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
        /*== iBrkValue = SP 2/3 ==*/
        .align 16
        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
        /*== iOffExpoMask = SP significand mask ==*/
        .align 16
        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
        /*== sOne = SP 1.0 ==*/
        .align 16
        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
        /*== sPoly[9] ==*/
        .align 16
        .long 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
        .long 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
        .long 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
        .long 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
        .long 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
        .long 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
        .long 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
        .long 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
        .long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
        .align 16
        .type __svml_slog2_data_internal,@object
        .size __svml_slog2_data_internal,.-__svml_slog2_data_internal