sysdeps/powerpc/power4/fpu/mpa.c

   1
   2 /*
   3  * IBM Accurate Mathematical Library
   4  * written by International Business Machines Corp.
   5  * Copyright (C) 2001-2013 Free Software Foundation, Inc.
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU Lesser General Public License as published by
   9  * the Free Software Foundation; either version 2.1 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public License
  18  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  19  */
  20
  21 /* Define __mul and __sqr and use the rest from generic code.  */
  22 #define NO__MUL
  23 #define NO__SQR
  24
  25 #include <sysdeps/ieee754/dbl-64/mpa.c>
  26
  27 /* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
  28    and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
  29    digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
  30 void
  31 __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
  32 {
  33   long i, i1, i2, j, k, k2;
  34   long p2 = p;
  35   double u, zk, zk2;
  36
  37   /* Is z=0?  */
  38   if (__glibc_unlikely (X[0] * Y[0] == 0))
  39     {
  40       Z[0] = 0;
  41       return;
  42     }
  43
  44   /* Multiply, add and carry */
  45   k2 = (p2 < 3) ? p2 + p2 : p2 + 3;
  46   zk = Z[k2] = 0;
  47   for (k = k2; k > 1;)
  48     {
  49       if (k > p2)
  50         {
  51           i1 = k - p2;
  52           i2 = p2 + 1;
  53         }
  54       else
  55         {
  56           i1 = 1;
  57           i2 = k;
  58         }
  59 #if 1
  60       /* Rearrange this inner loop to allow the fmadd instructions to be
  61          independent and execute in parallel on processors that have
  62          dual symmetrical FP pipelines.  */
  63       if (i1 < (i2 - 1))
  64         {
  65           /* Make sure we have at least 2 iterations.  */
  66           if (((i2 - i1) & 1L) == 1L)
  67             {
  68               /* Handle the odd iterations case.  */
  69               zk2 = x->d[i2 - 1] * y->d[i1];
  70             }
  71           else
  72             zk2 = 0.0;
  73           /* Do two multiply/adds per loop iteration, using independent
  74              accumulators; zk and zk2.  */
  75           for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2)
  76             {
  77               zk += x->d[i] * y->d[j];
  78               zk2 += x->d[i + 1] * y->d[j - 1];
  79             }
  80           zk += zk2;            /* Final sum.  */
  81         }
  82       else
  83         {
  84           /* Special case when iterations is 1.  */
  85           zk += x->d[i1] * y->d[i1];
  86         }
  87 #else
  88       /* The original code.  */
  89       for (i = i1, j = i2 - 1; i < i2; i++, j--)
  90         zk += X[i] * Y[j];
  91 #endif
  92
  93       u = (zk + CUTTER) - CUTTER;
  94       if (u > zk)
  95         u -= RADIX;
  96       Z[k] = zk - u;
  97       zk = u * RADIXI;
  98       --k;
  99     }
 100   Z[k] = zk;
 101
 102   int e = EX + EY;
 103   /* Is there a carry beyond the most significant digit?  */
 104   if (Z[1] == 0)
 105     {
 106       for (i = 1; i <= p2; i++)
 107         Z[i] = Z[i + 1];
 108       e--;
 109     }
 110
 111   EZ = e;
 112   Z[0] = X[0] * Y[0];
 113 }
 114
 115 /* Square *X and store result in *Y.  X and Y may not overlap.  For P in
 116    [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
 117    error is bounded by 1.001 ULP.  This is a faster special case of
 118    multiplication.  */
 119 void
 120 __sqr (const mp_no *x, mp_no *y, int p)
 121 {
 122   long i, j, k, ip;
 123   double u, yk;
 124
 125   /* Is z=0?  */
 126   if (__glibc_unlikely (X[0] == 0))
 127     {
 128       Y[0] = 0;
 129       return;
 130     }
 131
 132   /* We need not iterate through all X's since it's pointless to
 133      multiply zeroes.  */
 134   for (ip = p; ip > 0; ip--)
 135     if (X[ip] != 0)
 136       break;
 137
 138   k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
 139
 140   while (k > 2 * ip + 1)
 141     Y[k--] = 0;
 142
 143   yk = 0;
 144
 145   while (k > p)
 146     {
 147       double yk2 = 0.0;
 148       long lim = k / 2;
 149
 150       if (k % 2 == 0)
 151         {
 152           yk += X[lim] * X[lim];
 153           lim--;
 154         }
 155
 156       /* In __mul, this loop (and the one within the next while loop) run
 157          between a range to calculate the mantissa as follows:
 158
 159          Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
 160                 + X[n] * Y[k]
 161
 162          For X == Y, we can get away with summing halfway and doubling the
 163          result.  For cases where the range size is even, the mid-point needs
 164          to be added separately (above).  */
 165       for (i = k - p, j = p; i <= lim; i++, j--)
 166         yk2 += X[i] * X[j];
 167
 168       yk += 2.0 * yk2;
 169
 170       u = (yk + CUTTER) - CUTTER;
 171       if (u > yk)
 172         u -= RADIX;
 173       Y[k--] = yk - u;
 174       yk = u * RADIXI;
 175     }
 176
 177   while (k > 1)
 178     {
 179       double yk2 = 0.0;
 180       long lim = k / 2;
 181
 182       if (k % 2 == 0)
 183         {
 184           yk += X[lim] * X[lim];
 185           lim--;
 186         }
 187
 188       /* Likewise for this loop.  */
 189       for (i = 1, j = k - 1; i <= lim; i++, j--)
 190         yk2 += X[i] * X[j];
 191
 192       yk += 2.0 * yk2;
 193
 194       u = (yk + CUTTER) - CUTTER;
 195       if (u > yk)
 196         u -= RADIX;
 197       Y[k--] = yk - u;
 198       yk = u * RADIXI;
 199     }
 200   Y[k] = yk;
 201
 202   /* Squares are always positive.  */
 203   Y[0] = 1.0;
 204
 205   int e = EX * 2;
 206   /* Is there a carry beyond the most significant digit?  */
 207   if (__glibc_unlikely (Y[1] == 0))
 208     {
 209       for (i = 1; i <= p; i++)
 210         Y[i] = Y[i + 1];
 211       e--;
 212     }
 213   EY = e;
 214 }