[thirdparty/glibc.git] / sysdeps / powerpc / power4 / fpu / mpa.c


/*
 * IBM Accurate Mathematical Library
 * written by International Business Machines Corp.
 * Copyright (C) 2001-2019 Free Software Foundation, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/* Define __mul and __sqr and use the rest from generic code.  */
#define NO__MUL
#define NO__SQR

#include <sysdeps/ieee754/dbl-64/mpa.c>

/* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
   and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
   digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
void
__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
  long i, i1, i2, j, k, k2;
  long p2 = p;
  double u, zk, zk2;

  /* Is z=0?  */
  if (__glibc_unlikely (X[0] * Y[0] == 0))
    {
      Z[0] = 0;
      return;
    }

  /* Multiply, add and carry */
  k2 = (p2 < 3) ? p2 + p2 : p2 + 3;
  zk = Z[k2] = 0;
  for (k = k2; k > 1;)
    {
      if (k > p2)
	{
	  i1 = k - p2;
	  i2 = p2 + 1;
	}
      else
	{
	  i1 = 1;
	  i2 = k;
	}
#if 1
      /* Rearrange this inner loop to allow the fmadd instructions to be
         independent and execute in parallel on processors that have
         dual symmetrical FP pipelines.  */
      if (i1 < (i2 - 1))
	{
	  /* Make sure we have at least 2 iterations.  */
	  if (((i2 - i1) & 1L) == 1L)
	    {
	      /* Handle the odd iterations case.  */
	      zk2 = x->d[i2 - 1] * y->d[i1];
	    }
	  else
	    zk2 = 0.0;
	  /* Do two multiply/adds per loop iteration, using independent
	     accumulators; zk and zk2.  */
	  for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2)
	    {
	      zk += x->d[i] * y->d[j];
	      zk2 += x->d[i + 1] * y->d[j - 1];
	    }
	  zk += zk2;		/* Final sum.  */
	}
      else
	{
	  /* Special case when iterations is 1.  */
	  zk += x->d[i1] * y->d[i1];
	}
#else
      /* The original code.  */
      for (i = i1, j = i2 - 1; i < i2; i++, j--)
	zk += X[i] * Y[j];
#endif

      u = (zk + CUTTER) - CUTTER;
      if (u > zk)
	u -= RADIX;
      Z[k] = zk - u;
      zk = u * RADIXI;
      --k;
    }
  Z[k] = zk;

  int e = EX + EY;
  /* Is there a carry beyond the most significant digit?  */
  if (Z[1] == 0)
    {
      for (i = 1; i <= p2; i++)
	Z[i] = Z[i + 1];
      e--;
    }

  EZ = e;
  Z[0] = X[0] * Y[0];
}

/* Square *X and store result in *Y.  X and Y may not overlap.  For P in
   [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
   error is bounded by 1.001 ULP.  This is a faster special case of
   multiplication.  */
void
__sqr (const mp_no *x, mp_no *y, int p)
{
  long i, j, k, ip;
  double u, yk;

  /* Is z=0?  */
  if (__glibc_unlikely (X[0] == 0))
    {
      Y[0] = 0;
      return;
    }

  /* We need not iterate through all X's since it's pointless to
     multiply zeroes.  */
  for (ip = p; ip > 0; ip--)
    if (X[ip] != 0)
      break;

  k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;

  while (k > 2 * ip + 1)
    Y[k--] = 0;

  yk = 0;

  while (k > p)
    {
      double yk2 = 0.0;
      long lim = k / 2;

      if (k % 2 == 0)
        {
	  yk += X[lim] * X[lim];
	  lim--;
	}

      /* In __mul, this loop (and the one within the next while loop) run
         between a range to calculate the mantissa as follows:

         Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
		+ X[n] * Y[k]

         For X == Y, we can get away with summing halfway and doubling the
	 result.  For cases where the range size is even, the mid-point needs
	 to be added separately (above).  */
      for (i = k - p, j = p; i <= lim; i++, j--)
	yk2 += X[i] * X[j];

      yk += 2.0 * yk2;

      u = (yk + CUTTER) - CUTTER;
      if (u > yk)
	u -= RADIX;
      Y[k--] = yk - u;
      yk = u * RADIXI;
    }

  while (k > 1)
    {
      double yk2 = 0.0;
      long lim = k / 2;

      if (k % 2 == 0)
        {
	  yk += X[lim] * X[lim];
	  lim--;
	}

      /* Likewise for this loop.  */
      for (i = 1, j = k - 1; i <= lim; i++, j--)
	yk2 += X[i] * X[j];

      yk += 2.0 * yk2;

      u = (yk + CUTTER) - CUTTER;
      if (u > yk)
	u -= RADIX;
      Y[k--] = yk - u;
      yk = u * RADIXI;
    }
  Y[k] = yk;

  /* Squares are always positive.  */
  Y[0] = 1.0;

  int e = EX * 2;
  /* Is there a carry beyond the most significant digit?  */
  if (__glibc_unlikely (Y[1] == 0))
    {
      for (i = 1; i <= p; i++)
	Y[i] = Y[i + 1];
      e--;
    }
  EY = e;
}
Commit	Line	Data
04067002 UD	1
	2	/*
	3	* IBM Accurate Mathematical Library
	4	* written by International Business Machines Corp.
04277e02	5	* Copyright (C) 2001-2019 Free Software Foundation, Inc.
04067002 UD	6	*
	7	* This program is free software; you can redistribute it and/or modify
	8	* it under the terms of the GNU Lesser General Public License as published by
	9	* the Free Software Foundation; either version 2.1 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* This program is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public License
59ba27a6	18	* along with this program; if not, see <http://www.gnu.org/licenses/>.
04067002	19	*/
04067002	20
82a9811d SP	21	/* Define __mul and __sqr and use the rest from generic code. */
	22	#define NO__MUL
	23	#define NO__SQR
04067002	24
82a9811d	25	#include <sysdeps/ieee754/dbl-64/mpa.c>
04067002	26
950c99ca SP	27	/* Multiply X and Y and store result in *Z. X and Y may overlap but not X
	28	and Z or Y and Z. For P in [1, 2, 3], the exact result is truncated to P
	29	digits. In case P > 3 the error is bounded by 1.001 ULP. */
1066a534 SP	30	void
	31	__mul (const mp_no x, const mp_no y, mp_no *z, int p)
	32	{
04067002 UD	33	long i, i1, i2, j, k, k2;
	34	long p2 = p;
	35	double u, zk, zk2;
	36
20cd7fb3	37	/* Is z=0? */
a64d7e0e	38	if (__glibc_unlikely (X[0] * Y[0] == 0))
1066a534	39	{
a64d7e0e	40	Z[0] = 0;
1066a534 SP	41	return;
1066a534 SP	42	}
04067002	43
1066a534 SP	44	/* Multiply, add and carry */
1066a534 SP	45	k2 = (p2 < 3) ? p2 + p2 : p2 + 3;
a64d7e0e	46	zk = Z[k2] = 0;
1066a534	47	for (k = k2; k > 1;)
04067002	48	{
1066a534	49	if (k > p2)
04067002	50	{
1066a534 SP	51	i1 = k - p2;
1066a534 SP	52	i2 = p2 + 1;
04067002	53	}
1066a534	54	else
04067002	55	{
1066a534 SP	56	i1 = 1;
	57	i2 = k;
	58	}
	59	#if 1
	60	/* Rearrange this inner loop to allow the fmadd instructions to be
	61	independent and execute in parallel on processors that have
	62	dual symmetrical FP pipelines. */
	63	if (i1 < (i2 - 1))
	64	{
	65	/* Make sure we have at least 2 iterations. */
	66	if (((i2 - i1) & 1L) == 1L)
	67	{
	68	/* Handle the odd iterations case. */
	69	zk2 = x->d[i2 - 1] * y->d[i1];
	70	}
	71	else
	72	zk2 = 0.0;
	73	/* Do two multiply/adds per loop iteration, using independent
	74	accumulators; zk and zk2. */
	75	for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2)
	76	{
	77	zk += x->d[i] * y->d[j];
	78	zk2 += x->d[i + 1] * y->d[j - 1];
	79	}
	80	zk += zk2; /* Final sum. */
	81	}
	82	else
	83	{
	84	/* Special case when iterations is 1. */
	85	zk += x->d[i1] * y->d[i1];
04067002	86	}
04067002	87	#else
1066a534 SP	88	/* The original code. */
	89	for (i = i1, j = i2 - 1; i < i2; i++, j--)
	90	zk += X[i] * Y[j];
04067002 UD	91	#endif
04067002 UD	92
1066a534 SP	93	u = (zk + CUTTER) - CUTTER;
	94	if (u > zk)
	95	u -= RADIX;
	96	Z[k] = zk - u;
	97	zk = u * RADIXI;
	98	--k;
	99	}
04067002 UD	100	Z[k] = zk;
04067002 UD	101
e6ebd4a7	102	int e = EX + EY;
950c99ca	103	/* Is there a carry beyond the most significant digit? */
a64d7e0e	104	if (Z[1] == 0)
1066a534 SP	105	{
	106	for (i = 1; i <= p2; i++)
	107	Z[i] = Z[i + 1];
e6ebd4a7	108	e--;
1066a534	109	}
04067002	110
e6ebd4a7	111	EZ = e;
04067002	112	Z[0] = X[0] * Y[0];
04067002 UD	113	}
04067002 UD	114
d6752ccd SP	115	/* Square X and store result in Y. X and Y may not overlap. For P in
	116	[1, 2, 3], the exact result is truncated to P digits. In case P > 3 the
	117	error is bounded by 1.001 ULP. This is a faster special case of
	118	multiplication. */
	119	void
	120	__sqr (const mp_no x, mp_no y, int p)
	121	{
	122	long i, j, k, ip;
	123	double u, yk;
	124
	125	/* Is z=0? */
a64d7e0e	126	if (__glibc_unlikely (X[0] == 0))
d6752ccd	127	{
a64d7e0e	128	Y[0] = 0;
d6752ccd SP	129	return;
	130	}
	131
	132	/* We need not iterate through all X's since it's pointless to
	133	multiply zeroes. */
	134	for (ip = p; ip > 0; ip--)
a64d7e0e	135	if (X[ip] != 0)
d6752ccd SP	136	break;
	137
	138	k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
	139
	140	while (k > 2 * ip + 1)
a64d7e0e	141	Y[k--] = 0;
d6752ccd	142
a64d7e0e	143	yk = 0;
d6752ccd SP	144
	145	while (k > p)
	146	{
	147	double yk2 = 0.0;
	148	long lim = k / 2;
	149
	150	if (k % 2 == 0)
	151	{
	152	yk += X[lim] * X[lim];
	153	lim--;
	154	}
	155
	156	/* In __mul, this loop (and the one within the next while loop) run
	157	between a range to calculate the mantissa as follows:
	158
	159	Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
	160	+ X[n] * Y[k]
	161
	162	For X == Y, we can get away with summing halfway and doubling the
	163	result. For cases where the range size is even, the mid-point needs
	164	to be added separately (above). */
	165	for (i = k - p, j = p; i <= lim; i++, j--)
	166	yk2 += X[i] * X[j];
	167
	168	yk += 2.0 * yk2;
	169
	170	u = (yk + CUTTER) - CUTTER;
	171	if (u > yk)
	172	u -= RADIX;
	173	Y[k--] = yk - u;
	174	yk = u * RADIXI;
	175	}
	176
	177	while (k > 1)
	178	{
	179	double yk2 = 0.0;
	180	long lim = k / 2;
	181
	182	if (k % 2 == 0)
	183	{
	184	yk += X[lim] * X[lim];
	185	lim--;
	186	}
	187
	188	/* Likewise for this loop. */
	189	for (i = 1, j = k - 1; i <= lim; i++, j--)
	190	yk2 += X[i] * X[j];
	191
	192	yk += 2.0 * yk2;
	193
	194	u = (yk + CUTTER) - CUTTER;
	195	if (u > yk)
	196	u -= RADIX;
	197	Y[k--] = yk - u;
	198	yk = u * RADIXI;
	199	}
	200	Y[k] = yk;
	201
	202	/* Squares are always positive. */
	203	Y[0] = 1.0;
	204
e6ebd4a7	205	int e = EX * 2;
d6752ccd	206	/* Is there a carry beyond the most significant digit? */
a64d7e0e	207	if (__glibc_unlikely (Y[1] == 0))
d6752ccd SP	208	{
	209	for (i = 1; i <= p; i++)
	210	Y[i] = Y[i + 1];
e6ebd4a7	211	e--;
d6752ccd	212	}
e6ebd4a7	213	EY = e;
d6752ccd	214	}