src/patches/glibc/glibc-rh849651.patch

   1 diff -Nrup a/sysdeps/x86_64/fpu/e_expf.S b/sysdeps/x86_64/fpu/e_expf.S
   2 --- a/sysdeps/x86_64/fpu/e_expf.S       1969-12-31 17:00:00.000000000 -0700
   3 +++ b/sysdeps/x86_64/fpu/e_expf.S       2012-08-20 09:47:15.551971545 -0600
   4 @@ -0,0 +1,339 @@
   5 +/* Optimized __ieee754_expf function.
   6 +   Copyright (C) 2012 Free Software Foundation, Inc.
   7 +   Contributed by Intel Corporation.
   8 +   This file is part of the GNU C Library.
   9 +
  10 +   The GNU C Library is free software; you can redistribute it and/or
  11 +   modify it under the terms of the GNU Lesser General Public
  12 +   License as published by the Free Software Foundation; either
  13 +   version 2.1 of the License, or (at your option) any later version.
  14 +
  15 +   The GNU C Library is distributed in the hope that it will be useful,
  16 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 +   Lesser General Public License for more details.
  19 +
  20 +   You should have received a copy of the GNU Lesser General Public
  21 +   License along with the GNU C Library; if not, see
  22 +   <http://www.gnu.org/licenses/>.  */
  23 +
  24 +#include <sysdep.h>
  25 +
  26 +/* Short algorithm description:
  27 + *
  28 + *  Let K = 64 (table size).
  29 + *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
  30 + *  where
  31 + *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
  32 + *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
  33 + *       values of 2^(j/K) are tabulated as T[j].
  34 + *
  35 + *       P(y) is a minimax polynomial approximation of expf(x)-1
  36 + *       on small interval [0.0..log(2)/K].
  37 + *
  38 + *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
  39 + *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
  40 + *
  41 + * Special cases:
  42 + *  expf(NaN) = NaN
  43 + *  expf(+INF) = +INF
  44 + *  expf(-INF) = 0
  45 + *  expf(x) = 1 for subnormals
  46 + *  for finite argument, only expf(0)=1 is exact
  47 + *  expf(x) overflows if x>88.7228317260742190
  48 + *  expf(x) underflows if x<-103.972076416015620
  49 + */
  50 +
  51 +       .text
  52 +ENTRY(__ieee754_expf)
  53 +       /* Input: single precision x in %xmm0 */
  54 +       cvtss2sd        %xmm0, %xmm1    /* Convert x to double precision */
  55 +       movd    %xmm0, %ecx             /* Copy x */
  56 +       movsd   L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */
  57 +       movsd   L(DP_P2)(%rip), %xmm3   /* DP P2 */
  58 +       movl    %ecx, %eax              /* x */
  59 +       mulsd   %xmm1, %xmm2            /* DP x*K/log(2) */
  60 +       andl    $0x7fffffff, %ecx       /* |x| */
  61 +       lea     L(DP_T)(%rip), %rsi     /* address of table T[j] */
  62 +       cmpl    $0x42ad496b, %ecx       /* |x|<125*log(2) ? */
  63 +       movsd   L(DP_P3)(%rip), %xmm4   /* DP P3 */
  64 +       addsd   L(DP_RS)(%rip), %xmm2   /* DP x*K/log(2)+RS */
  65 +       jae     L(special_paths)
  66 +
  67 +       /* Here if |x|<125*log(2) */
  68 +       cmpl    $0x31800000, %ecx       /* |x|<2^(-28) ? */
  69 +       jb      L(small_arg)
  70 +
  71 +       /* Main path: here if 2^(-28)<=|x|<125*log(2) */
  72 +       cvtsd2ss        %xmm2, %xmm2    /* SP x*K/log(2)+RS */
  73 +       movd    %xmm2, %eax             /* bits of n*K+j with trash */
  74 +       subss   L(SP_RS)(%rip), %xmm2   /* SP t=round(x*K/log(2)) */
  75 +       movl    %eax, %edx              /* n*K+j with trash */
  76 +       cvtss2sd        %xmm2, %xmm2    /* DP t */
  77 +       andl    $0x3f, %eax             /* bits of j */
  78 +       mulsd   L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
  79 +       andl    $0xffffffc0, %edx       /* bits of n */
  80 +#ifdef __AVX__
  81 +       vaddsd  %xmm1, %xmm2, %xmm0     /* DP y=x-t*log(2)/K */
  82 +       vmulsd  %xmm0, %xmm0, %xmm2     /* DP z=y*y */
  83 +#else
  84 +       addsd   %xmm1, %xmm2            /* DP y=x-t*log(2)/K */
  85 +       movaps  %xmm2, %xmm0            /* DP y */
  86 +       mulsd   %xmm2, %xmm2            /* DP z=y*y */
  87 +#endif
  88 +       mulsd   %xmm2, %xmm4            /* DP P3*z */
  89 +       addl    $0x1fc0, %edx           /* bits of n + SP exponent bias */
  90 +       mulsd   %xmm2, %xmm3            /* DP P2*z */
  91 +       shll    $17, %edx               /* SP 2^n */
  92 +       addsd   L(DP_P1)(%rip), %xmm4   /* DP P3*z+P1 */
  93 +       addsd   L(DP_P0)(%rip), %xmm3   /* DP P2*z+P0 */
  94 +       movd    %edx, %xmm1             /* SP 2^n */
  95 +       mulsd   %xmm2, %xmm4            /* DP (P3*z+P1)*z */
  96 +       mulsd   %xmm3, %xmm0            /* DP (P2*z+P0)*y */
  97 +       addsd   %xmm4, %xmm0            /* DP P(y) */
  98 +       mulsd   (%rsi,%rax,8), %xmm0    /* DP P(y)*T[j] */
  99 +       addsd   (%rsi,%rax,8), %xmm0    /* DP T[j]*(P(y)+1) */
 100 +       cvtsd2ss        %xmm0, %xmm0    /* SP T[j]*(P(y)+1) */
 101 +       mulss   %xmm1, %xmm0            /* SP result=2^n*(T[j]*(P(y)+1)) */
 102 +       ret
 103 +
 104 +       .p2align        4
 105 +L(small_arg):
 106 +       /* Here if 0<=|x|<2^(-28) */
 107 +       addss   L(SP_ONE)(%rip), %xmm0  /* 1.0 + x */
 108 +       /* Return 1.0 with inexact raised, except for x==0 */
 109 +       ret
 110 +
 111 +       .p2align        4
 112 +L(special_paths):
 113 +       /* Here if 125*log(2)<=|x| */
 114 +       shrl    $31, %eax               /* Get sign bit of x, and depending on it: */
 115 +       lea     L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */
 116 +       cmpl    (%rdx,%rax,4), %ecx     /* |x|<under/overflow bound ? */
 117 +       jbe     L(near_under_or_overflow)
 118 +
 119 +       /* Here if |x|>under/overflow bound */
 120 +       cmpl    $0x7f800000, %ecx       /* |x| is finite ? */
 121 +       jae     L(arg_inf_or_nan)
 122 +
 123 +       /* Here if |x|>under/overflow bound, and x is finite */
 124 +       testq   %rax, %rax              /* sign of x nonzero ? */
 125 +       je      L(res_overflow)
 126 +
 127 +       /* Here if -inf<x<underflow bound (x<0) */
 128 +       movss   L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
 129 +       mulss   %xmm0, %xmm0            /* Return underflowed result (zero or subnormal) */
 130 +       ret
 131 +
 132 +       .p2align        4
 133 +L(res_overflow):
 134 +       /* Here if overflow bound<x<inf (x>0) */
 135 +       movss   L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
 136 +       mulss   %xmm0, %xmm0            /* Return overflowed result (Inf or max normal) */
 137 +       ret
 138 +
 139 +       .p2align        4
 140 +L(arg_inf_or_nan):
 141 +       /* Here if |x| is Inf or NAN */
 142 +       jne     L(arg_nan)      /* |x| is Inf ? */
 143 +
 144 +       /* Here if |x| is Inf */
 145 +       lea     L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */
 146 +       movss   (%rdx,%rax,4), %xmm0    /* return zero or Inf */
 147 +       ret
 148 +
 149 +       .p2align        4
 150 +L(arg_nan):
 151 +       /* Here if |x| is NaN */
 152 +       addss   %xmm0, %xmm0            /* Return x+x (raise invalid) */
 153 +       ret
 154 +
 155 +       .p2align        4
 156 +L(near_under_or_overflow):
 157 +       /* Here if 125*log(2)<=|x|<under/overflow bound */
 158 +       cvtsd2ss        %xmm2, %xmm2    /* SP x*K/log(2)+RS */
 159 +       movd    %xmm2, %eax             /* bits of n*K+j with trash */
 160 +       subss   L(SP_RS)(%rip), %xmm2   /* SP t=round(x*K/log(2)) */
 161 +       movl    %eax, %edx              /* n*K+j with trash */
 162 +       cvtss2sd        %xmm2, %xmm2    /* DP t */
 163 +       andl    $0x3f, %eax             /* bits of j */
 164 +       mulsd   L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
 165 +       andl    $0xffffffc0, %edx       /* bits of n */
 166 +#ifdef __AVX__
 167 +       vaddsd  %xmm1, %xmm2, %xmm0     /* DP y=x-t*log(2)/K */
 168 +       vmulsd  %xmm0, %xmm0, %xmm2     /* DP z=y*y */
 169 +#else
 170 +       addsd   %xmm1, %xmm2            /* DP y=x-t*log(2)/K */
 171 +       movaps  %xmm2, %xmm0            /* DP y */
 172 +       mulsd   %xmm2, %xmm2            /* DP z=y*y */
 173 +#endif
 174 +       mulsd   %xmm2, %xmm4            /* DP P3*z */
 175 +       addl    $0xffc0, %edx           /* bits of n + DP exponent bias */
 176 +       mulsd   %xmm2, %xmm3            /* DP P2*z */
 177 +       shlq    $46, %rdx               /* DP 2^n */
 178 +       addsd   L(DP_P1)(%rip), %xmm4   /* DP P3*z+P1 */
 179 +       addsd   L(DP_P0)(%rip), %xmm3   /* DP P2*z+P0 */
 180 +       movd    %rdx, %xmm1             /* DP 2^n */
 181 +       mulsd   %xmm2, %xmm4            /* DP (P3*z+P1)*z */
 182 +       mulsd   %xmm3, %xmm0            /* DP (P2*z+P0)*y */
 183 +       addsd   %xmm4, %xmm0            /* DP P(y) */
 184 +       mulsd   (%rsi,%rax,8), %xmm0    /* DP P(y)*T[j] */
 185 +       addsd   (%rsi,%rax,8), %xmm0    /* DP T[j]*(P(y)+1) */
 186 +       mulsd   %xmm1, %xmm0            /* DP result=2^n*(T[j]*(P(y)+1)) */
 187 +       cvtsd2ss        %xmm0, %xmm0    /* convert result to single precision */
 188 +       ret
 189 +END(__ieee754_expf)
 190 +
 191 +       .section .rodata, "a"
 192 +       .p2align 3
 193 +L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
 194 +       .long   0x00000000, 0x3ff00000
 195 +       .long   0x3e778061, 0x3ff02c9a
 196 +       .long   0xd3158574, 0x3ff059b0
 197 +       .long   0x18759bc8, 0x3ff08745
 198 +       .long   0x6cf9890f, 0x3ff0b558
 199 +       .long   0x32d3d1a2, 0x3ff0e3ec
 200 +       .long   0xd0125b51, 0x3ff11301
 201 +       .long   0xaea92de0, 0x3ff1429a
 202 +       .long   0x3c7d517b, 0x3ff172b8
 203 +       .long   0xeb6fcb75, 0x3ff1a35b
 204 +       .long   0x3168b9aa, 0x3ff1d487
 205 +       .long   0x88628cd6, 0x3ff2063b
 206 +       .long   0x6e756238, 0x3ff2387a
 207 +       .long   0x65e27cdd, 0x3ff26b45
 208 +       .long   0xf51fdee1, 0x3ff29e9d
 209 +       .long   0xa6e4030b, 0x3ff2d285
 210 +       .long   0x0a31b715, 0x3ff306fe
 211 +       .long   0xb26416ff, 0x3ff33c08
 212 +       .long   0x373aa9cb, 0x3ff371a7
 213 +       .long   0x34e59ff7, 0x3ff3a7db
 214 +       .long   0x4c123422, 0x3ff3dea6
 215 +       .long   0x21f72e2a, 0x3ff4160a
 216 +       .long   0x6061892d, 0x3ff44e08
 217 +       .long   0xb5c13cd0, 0x3ff486a2
 218 +       .long   0xd5362a27, 0x3ff4bfda
 219 +       .long   0x769d2ca7, 0x3ff4f9b2
 220 +       .long   0x569d4f82, 0x3ff5342b
 221 +       .long   0x36b527da, 0x3ff56f47
 222 +       .long   0xdd485429, 0x3ff5ab07
 223 +       .long   0x15ad2148, 0x3ff5e76f
 224 +       .long   0xb03a5585, 0x3ff6247e
 225 +       .long   0x82552225, 0x3ff66238
 226 +       .long   0x667f3bcd, 0x3ff6a09e
 227 +       .long   0x3c651a2f, 0x3ff6dfb2
 228 +       .long   0xe8ec5f74, 0x3ff71f75
 229 +       .long   0x564267c9, 0x3ff75feb
 230 +       .long   0x73eb0187, 0x3ff7a114
 231 +       .long   0x36cf4e62, 0x3ff7e2f3
 232 +       .long   0x994cce13, 0x3ff82589
 233 +       .long   0x9b4492ed, 0x3ff868d9
 234 +       .long   0x422aa0db, 0x3ff8ace5
 235 +       .long   0x99157736, 0x3ff8f1ae
 236 +       .long   0xb0cdc5e5, 0x3ff93737
 237 +       .long   0x9fde4e50, 0x3ff97d82
 238 +       .long   0x82a3f090, 0x3ff9c491
 239 +       .long   0x7b5de565, 0x3ffa0c66
 240 +       .long   0xb23e255d, 0x3ffa5503
 241 +       .long   0x5579fdbf, 0x3ffa9e6b
 242 +       .long   0x995ad3ad, 0x3ffae89f
 243 +       .long   0xb84f15fb, 0x3ffb33a2
 244 +       .long   0xf2fb5e47, 0x3ffb7f76
 245 +       .long   0x904bc1d2, 0x3ffbcc1e
 246 +       .long   0xdd85529c, 0x3ffc199b
 247 +       .long   0x2e57d14b, 0x3ffc67f1
 248 +       .long   0xdcef9069, 0x3ffcb720
 249 +       .long   0x4a07897c, 0x3ffd072d
 250 +       .long   0xdcfba487, 0x3ffd5818
 251 +       .long   0x03db3285, 0x3ffda9e6
 252 +       .long   0x337b9b5f, 0x3ffdfc97
 253 +       .long   0xe78b3ff6, 0x3ffe502e
 254 +       .long   0xa2a490da, 0x3ffea4af
 255 +       .long   0xee615a27, 0x3ffefa1b
 256 +       .long   0x5b6e4540, 0x3fff5076
 257 +       .long   0x819e90d8, 0x3fffa7c1
 258 +       .type L(DP_T), @object
 259 +       ASM_SIZE_DIRECTIVE(L(DP_T))
 260 +
 261 +       .section .rodata.cst8,"aM",@progbits,8
 262 +       .p2align 3
 263 +L(DP_KLN2): /* double precision K/log(2) */
 264 +       .long   0x652b82fe, 0x40571547
 265 +       .type L(DP_KLN2), @object
 266 +       ASM_SIZE_DIRECTIVE(L(DP_KLN2))
 267 +
 268 +       .p2align 3
 269 +L(DP_NLN2K): /* double precision -log(2)/K */
 270 +       .long   0xfefa39ef, 0xbf862e42
 271 +       .type L(DP_NLN2K), @object
 272 +       ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
 273 +
 274 +       .p2align 3
 275 +L(DP_RS): /* double precision 2^23+2^22 */
 276 +       .long   0x00000000, 0x41680000
 277 +       .type L(DP_RS), @object
 278 +       ASM_SIZE_DIRECTIVE(L(DP_RS))
 279 +
 280 +       .p2align 3
 281 +L(DP_P3): /* double precision polynomial coefficient P3 */
 282 +       .long   0xeb78fa85, 0x3fa56420
 283 +       .type L(DP_P3), @object
 284 +       ASM_SIZE_DIRECTIVE(L(DP_P3))
 285 +
 286 +       .p2align 3
 287 +L(DP_P1): /* double precision polynomial coefficient P1 */
 288 +       .long   0x008d6118, 0x3fe00000
 289 +       .type L(DP_P1), @object
 290 +       ASM_SIZE_DIRECTIVE(L(DP_P1))
 291 +
 292 +       .p2align 3
 293 +L(DP_P2): /* double precision polynomial coefficient P2 */
 294 +       .long   0xda752d4f, 0x3fc55550
 295 +       .type L(DP_P2), @object
 296 +       ASM_SIZE_DIRECTIVE(L(DP_P2))
 297 +
 298 +       .p2align 3
 299 +L(DP_P0): /* double precision polynomial coefficient P0 */
 300 +       .long   0xffffe7c6, 0x3fefffff
 301 +       .type L(DP_P0), @object
 302 +       ASM_SIZE_DIRECTIVE(L(DP_P0))
 303 +
 304 +       .p2align 2
 305 +L(SP_RANGE): /* single precision overflow/underflow bounds */
 306 +       .long   0x42b17217      /* if x>this bound, then result overflows */
 307 +       .long   0x42cff1b4      /* if x<this bound, then result underflows */
 308 +       .type L(SP_RANGE), @object
 309 +       ASM_SIZE_DIRECTIVE(L(SP_RANGE))
 310 +
 311 +       .p2align 2
 312 +L(SP_INF_0):
 313 +       .long   0x7f800000      /* single precision Inf */
 314 +       .long   0               /* single precision zero */
 315 +       .type L(SP_INF_0), @object
 316 +       ASM_SIZE_DIRECTIVE(L(SP_INF_0))
 317 +
 318 +       .section .rodata.cst4,"aM",@progbits,4
 319 +       .p2align 2
 320 +L(SP_RS): /* single precision 2^23+2^22 */
 321 +       .long   0x4b400000
 322 +       .type L(SP_RS), @object
 323 +       ASM_SIZE_DIRECTIVE(L(SP_RS))
 324 +
 325 +       .p2align 2
 326 +L(SP_SMALL): /* single precision small value 2^(-100) */
 327 +       .long   0x0d800000
 328 +       .type L(SP_SMALL), @object
 329 +       ASM_SIZE_DIRECTIVE(L(SP_SMALL))
 330 +
 331 +       .p2align 2
 332 +L(SP_LARGE): /* single precision large value 2^100 */
 333 +       .long   0x71800000
 334 +       .type L(SP_LARGE), @object
 335 +       ASM_SIZE_DIRECTIVE(L(SP_LARGE))
 336 +
 337 +       .p2align 2
 338 +L(SP_ONE): /* single precision 1.0 */
 339 +       .long   0x3f800000
 340 +       .type L(SP_ONE), @object
 341 +       ASM_SIZE_DIRECTIVE(L(SP_ONE))
 342 +
 343 +strong_alias (__ieee754_expf, __expf_finite)