[thirdparty/gcc.git] / libgfortran / generated / matmul_r16.c

/* Implementation of the MATMUL intrinsic
   Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran 95 runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with libgfortran; see the file COPYING.  If not,
write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.  */

#include "config.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "libgfortran.h"

#if defined (HAVE_GFC_REAL_16)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
                          const int *, const GFC_REAL_16 *, const int *,
                          const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_r16 (gfc_array_r16 * const restrict retarray, 
	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_r16);

void
matmul_r16 (gfc_array_r16 * const restrict retarray, 
	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_REAL_16 * restrict abase;
  const GFC_REAL_16 * restrict bbase;
  GFC_REAL_16 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
  */

  if (retarray->data == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          retarray->dim[0].lbound = 0;
          retarray->dim[0].ubound = b->dim[1].ubound - b->dim[1].lbound;
          retarray->dim[0].stride = 1;
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          retarray->dim[0].lbound = 0;
          retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
          retarray->dim[0].stride = 1;
        }
      else
        {
          retarray->dim[0].lbound = 0;
          retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
          retarray->dim[0].stride = 1;

          retarray->dim[1].lbound = 0;
          retarray->dim[1].ubound = b->dim[1].ubound - b->dim[1].lbound;
          retarray->dim[1].stride = retarray->dim[0].ubound+1;
        }

      retarray->data
	= internal_malloc_size (sizeof (GFC_REAL_16) * size0 ((array_t *) retarray));
      retarray->offset = 0;
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
	 either as a row or a column matrix. We want both cases to
	 work. */
      rxstride = rystride = retarray->dim[0].stride;
    }
  else
    {
      rxstride = retarray->dim[0].stride;
      rystride = retarray->dim[1].stride;
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a a row matrix A[1,count]. */
      axstride = a->dim[0].stride;
      aystride = 1;

      xcount = 1;
      count = a->dim[0].ubound + 1 - a->dim[0].lbound;
    }
  else
    {
      axstride = a->dim[0].stride;
      aystride = a->dim[1].stride;

      count = a->dim[1].ubound + 1 - a->dim[1].lbound;
      xcount = a->dim[0].ubound + 1 - a->dim[0].lbound;
    }

  if (count != b->dim[0].ubound + 1 - b->dim[0].lbound)
    runtime_error ("dimension of array B incorrect in MATMUL intrinsic");

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1] */
      bxstride = b->dim[0].stride;

      /* bystride should never be used for 1-dimensional b.
	 in case it is we want it to cause a segfault, rather than
	 an incorrect result. */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = b->dim[0].stride;
      bystride = b->dim[1].stride;
      ycount = b->dim[1].ubound + 1 - b->dim[1].lbound;
    }

  abase = a->data;
  bbase = b->data;
  dest = retarray->data;


  /* Now that everything is set up, we're performing the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_REAL_16 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;

    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }

  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_REAL_16 * restrict bbase_y;
      GFC_REAL_16 * restrict dest_y;
      const GFC_REAL_16 * restrict abase_n;
      GFC_REAL_16 bbase_yn;

      if (rystride == xcount)
	memset (dest, 0, (sizeof (GFC_REAL_16) * xcount * ycount));
      else
	{
	  for (y = 0; y < ycount; y++)
	    for (x = 0; x < xcount; x++)
	      dest[x + y*rystride] = (GFC_REAL_16)0;
	}

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = bbase + y*bystride;
	  dest_y = dest + y*rystride;
	  for (n = 0; n < count; n++)
	    {
	      abase_n = abase + n*aystride;
	      bbase_yn = bbase_y[n];
	      for (x = 0; x < xcount; x++)
		{
		  dest_y[x] += abase_n[x] * bbase_yn;
		}
	    }
	}
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
	{
	  const GFC_REAL_16 *restrict abase_x;
	  const GFC_REAL_16 *restrict bbase_y;
	  GFC_REAL_16 *restrict dest_y;
	  GFC_REAL_16 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      dest_y = &dest[y*rystride];
	      for (x = 0; x < xcount; x++)
		{
		  abase_x = &abase[x*axstride];
		  s = (GFC_REAL_16) 0;
		  for (n = 0; n < count; n++)
		    s += abase_x[n] * bbase_y[n];
		  dest_y[x] = s;
		}
	    }
	}
      else
	{
	  const GFC_REAL_16 *restrict bbase_y;
	  GFC_REAL_16 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      s = (GFC_REAL_16) 0;
	      for (n = 0; n < count; n++)
		s += abase[n*axstride] * bbase_y[n];
	      dest[y*rystride] = s;
	    }
	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
	for (x = 0; x < xcount; x++)
	  dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;

      for (y = 0; y < ycount; y++)
	for (n = 0; n < count; n++)
	  for (x = 0; x < xcount; x++)
	    /* dest[x,y] += a[x,n] * b[n,y] */
	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_16 *restrict bbase_y;
      GFC_REAL_16 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  s = (GFC_REAL_16) 0;
	  for (n = 0; n < count; n++)
	    s += abase[n*axstride] * bbase_y[n*bxstride];
	  dest[y*rxstride] = s;
	}
    }
  else
    {
      const GFC_REAL_16 *restrict abase_x;
      const GFC_REAL_16 *restrict bbase_y;
      GFC_REAL_16 *restrict dest_y;
      GFC_REAL_16 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  dest_y = &dest[y*rystride];
	  for (x = 0; x < xcount; x++)
	    {
	      abase_x = &abase[x*axstride];
	      s = (GFC_REAL_16) 0;
	      for (n = 0; n < count; n++)
		s += abase_x[n*aystride] * bbase_y[n*bxstride];
	      dest_y[x*rxstride] = s;
	    }
	}
    }
}

#endif
Commit	Line	Data
644cb69f	1	/* Implementation of the MATMUL intrinsic
6ff24d45	2	Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
644cb69f FXC	3	Contributed by Paul Brook <paul@nowt.org>
	4
	5	This file is part of the GNU Fortran 95 runtime library (libgfortran).
	6
	7	Libgfortran is free software; you can redistribute it and/or
	8	modify it under the terms of the GNU General Public
	9	License as published by the Free Software Foundation; either
	10	version 2 of the License, or (at your option) any later version.
	11
	12	In addition to the permissions in the GNU General Public License, the
	13	Free Software Foundation gives you unlimited permission to link the
	14	compiled version of this file into combinations with other programs,
	15	and to distribute those combinations without any restriction coming
	16	from the use of this file. (The General Public License restrictions
	17	do apply in other respects; for example, they cover modification of
	18	the file, and distribution when not linked into a combine
	19	executable.)
	20
	21	Libgfortran is distributed in the hope that it will be useful,
	22	but WITHOUT ANY WARRANTY; without even the implied warranty of
	23	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	24	GNU General Public License for more details.
	25
	26	You should have received a copy of the GNU General Public
	27	License along with libgfortran; see the file COPYING. If not,
	28	write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	29	Boston, MA 02110-1301, USA. */
	30
	31	#include "config.h"
	32	#include <stdlib.h>
	33	#include <string.h>
	34	#include <assert.h>
	35	#include "libgfortran.h"
	36
	37	#if defined (HAVE_GFC_REAL_16)
	38
5a0aad31 FXC	39	/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
	40	passed to us by the front-end, in which case we'll call it for large
	41	matrices. */
	42
	43	typedef void (blas_call)(const char , const char , const int , const int *,
	44	const int , const GFC_REAL_16 , const GFC_REAL_16 *,
	45	const int , const GFC_REAL_16 , const int *,
	46	const GFC_REAL_16 , GFC_REAL_16 , const int *,
	47	int, int);
	48
1524f80b RS	49	/* The order of loops is different in the case of plain matrix
	50	multiplication C=MATMUL(A,B), and in the frequent special case where
	51	the argument A is the temporary result of a TRANSPOSE intrinsic:
	52	C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
	53	looking at their strides.
	54
	55	The equivalent Fortran pseudo-code is:
644cb69f FXC	56
644cb69f FXC	57	DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
1524f80b RS	58	IF (.NOT.IS_TRANSPOSED(A)) THEN
	59	C = 0
	60	DO J=1,N
	61	DO K=1,COUNT
	62	DO I=1,M
	63	C(I,J) = C(I,J)+A(I,K)*B(K,J)
	64	ELSE
	65	DO J=1,N
644cb69f	66	DO I=1,M
1524f80b RS	67	S = 0
1524f80b RS	68	DO K=1,COUNT
5a0aad31	69	S = S+A(I,K)*B(K,J)
1524f80b RS	70	C(I,J) = S
1524f80b RS	71	ENDIF
644cb69f FXC	72	*/
644cb69f FXC	73
5a0aad31 FXC	74	/* If try_blas is set to a nonzero value, then the matmul function will
	75	see if there is a way to perform the matrix multiplication by a call
	76	to the BLAS gemm function. */
	77
85206901	78	extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
5a0aad31 FXC	79	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
5a0aad31 FXC	80	int blas_limit, blas_call gemm);
644cb69f FXC	81	export_proto(matmul_r16);
	82
	83	void
85206901	84	matmul_r16 (gfc_array_r16 * const restrict retarray,
5a0aad31 FXC	85	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
5a0aad31 FXC	86	int blas_limit, blas_call gemm)
644cb69f	87	{
85206901 JB	88	const GFC_REAL_16 * restrict abase;
	89	const GFC_REAL_16 * restrict bbase;
	90	GFC_REAL_16 * restrict dest;
644cb69f FXC	91
	92	index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
	93	index_type x, y, n, count, xcount, ycount;
	94
	95	assert (GFC_DESCRIPTOR_RANK (a) == 2
	96	\|\| GFC_DESCRIPTOR_RANK (b) == 2);
	97
	98	/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
	99
	100	Either A or B (but not both) can be rank 1:
	101
	102	o One-dimensional argument A is implicitly treated as a row matrix
	103	dimensioned [1,count], so xcount=1.
	104
	105	o One-dimensional argument B is implicitly treated as a column matrix
	106	dimensioned [count, 1], so ycount=1.
	107	*/
	108
	109	if (retarray->data == NULL)
	110	{
	111	if (GFC_DESCRIPTOR_RANK (a) == 1)
	112	{
	113	retarray->dim[0].lbound = 0;
	114	retarray->dim[0].ubound = b->dim[1].ubound - b->dim[1].lbound;
	115	retarray->dim[0].stride = 1;
	116	}
	117	else if (GFC_DESCRIPTOR_RANK (b) == 1)
	118	{
	119	retarray->dim[0].lbound = 0;
	120	retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
	121	retarray->dim[0].stride = 1;
	122	}
	123	else
	124	{
	125	retarray->dim[0].lbound = 0;
	126	retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
	127	retarray->dim[0].stride = 1;
	128
	129	retarray->dim[1].lbound = 0;
	130	retarray->dim[1].ubound = b->dim[1].ubound - b->dim[1].lbound;
	131	retarray->dim[1].stride = retarray->dim[0].ubound+1;
	132	}
	133
	134	retarray->data
	135	= internal_malloc_size (sizeof (GFC_REAL_16) * size0 ((array_t *) retarray));
	136	retarray->offset = 0;
	137	}
	138
644cb69f FXC	139
	140	if (GFC_DESCRIPTOR_RANK (retarray) == 1)
	141	{
	142	/* One-dimensional result may be addressed in the code below
	143	either as a row or a column matrix. We want both cases to
	144	work. */
	145	rxstride = rystride = retarray->dim[0].stride;
	146	}
	147	else
	148	{
	149	rxstride = retarray->dim[0].stride;
	150	rystride = retarray->dim[1].stride;
	151	}
	152
	153
	154	if (GFC_DESCRIPTOR_RANK (a) == 1)
	155	{
	156	/* Treat it as a a row matrix A[1,count]. */
	157	axstride = a->dim[0].stride;
	158	aystride = 1;
	159
	160	xcount = 1;
	161	count = a->dim[0].ubound + 1 - a->dim[0].lbound;
	162	}
	163	else
	164	{
	165	axstride = a->dim[0].stride;
	166	aystride = a->dim[1].stride;
	167
	168	count = a->dim[1].ubound + 1 - a->dim[1].lbound;
	169	xcount = a->dim[0].ubound + 1 - a->dim[0].lbound;
	170	}
	171
18c492a9 TK	172	if (count != b->dim[0].ubound + 1 - b->dim[0].lbound)
18c492a9 TK	173	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
644cb69f FXC	174
	175	if (GFC_DESCRIPTOR_RANK (b) == 1)
	176	{
	177	/* Treat it as a column matrix B[count,1] */
	178	bxstride = b->dim[0].stride;
	179
	180	/* bystride should never be used for 1-dimensional b.
	181	in case it is we want it to cause a segfault, rather than
	182	an incorrect result. */
	183	bystride = 0xDEADBEEF;
	184	ycount = 1;
	185	}
	186	else
	187	{
	188	bxstride = b->dim[0].stride;
	189	bystride = b->dim[1].stride;
	190	ycount = b->dim[1].ubound + 1 - b->dim[1].lbound;
	191	}
	192
	193	abase = a->data;
	194	bbase = b->data;
	195	dest = retarray->data;
	196
5a0aad31 FXC	197
	198	/* Now that everything is set up, we're performing the multiplication
	199	itself. */
	200
	201	#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
	202
	203	if (try_blas && rxstride == 1 && (axstride == 1 \|\| aystride == 1)
	204	&& (bxstride == 1 \|\| bystride == 1)
	205	&& (((float) xcount) * ((float) ycount) * ((float) count)
	206	> POW3(blas_limit)))
	207	{
	208	const int m = xcount, n = ycount, k = count, ldc = rystride;
	209	const GFC_REAL_16 one = 1, zero = 0;
	210	const int lda = (axstride == 1) ? aystride : axstride,
	211	ldb = (bxstride == 1) ? bystride : bxstride;
	212
	213	if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
	214	{
	215	assert (gemm != NULL);
	216	gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
	217	&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
	218	return;
	219	}
	220	}
	221
644cb69f FXC	222	if (rxstride == 1 && axstride == 1 && bxstride == 1)
644cb69f FXC	223	{
85206901 JB	224	const GFC_REAL_16 * restrict bbase_y;
	225	GFC_REAL_16 * restrict dest_y;
	226	const GFC_REAL_16 * restrict abase_n;
644cb69f FXC	227	GFC_REAL_16 bbase_yn;
644cb69f FXC	228
1633cb7c FXC	229	if (rystride == xcount)
1633cb7c FXC	230	memset (dest, 0, (sizeof (GFC_REAL_16) * xcount * ycount));
644cb69f FXC	231	else
	232	{
	233	for (y = 0; y < ycount; y++)
	234	for (x = 0; x < xcount; x++)
	235	dest[x + y*rystride] = (GFC_REAL_16)0;
	236	}
	237
	238	for (y = 0; y < ycount; y++)
	239	{
	240	bbase_y = bbase + y*bystride;
	241	dest_y = dest + y*rystride;
	242	for (n = 0; n < count; n++)
	243	{
	244	abase_n = abase + n*aystride;
	245	bbase_yn = bbase_y[n];
	246	for (x = 0; x < xcount; x++)
	247	{
	248	dest_y[x] += abase_n[x] * bbase_yn;
	249	}
	250	}
	251	}
	252	}
1524f80b RS	253	else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1524f80b RS	254	{
a4a11197 PT	255	if (GFC_DESCRIPTOR_RANK (a) != 1)
	256	{
	257	const GFC_REAL_16 *restrict abase_x;
	258	const GFC_REAL_16 *restrict bbase_y;
	259	GFC_REAL_16 *restrict dest_y;
	260	GFC_REAL_16 s;
1524f80b	261
a4a11197 PT	262	for (y = 0; y < ycount; y++)
	263	{
	264	bbase_y = &bbase[y*bystride];
	265	dest_y = &dest[y*rystride];
	266	for (x = 0; x < xcount; x++)
	267	{
	268	abase_x = &abase[x*axstride];
	269	s = (GFC_REAL_16) 0;
	270	for (n = 0; n < count; n++)
	271	s += abase_x[n] * bbase_y[n];
	272	dest_y[x] = s;
	273	}
	274	}
	275	}
	276	else
1524f80b	277	{
a4a11197 PT	278	const GFC_REAL_16 *restrict bbase_y;
	279	GFC_REAL_16 s;
	280
	281	for (y = 0; y < ycount; y++)
1524f80b	282	{
a4a11197	283	bbase_y = &bbase[y*bystride];
1524f80b RS	284	s = (GFC_REAL_16) 0;
1524f80b RS	285	for (n = 0; n < count; n++)
a4a11197 PT	286	s += abase[naxstride] bbase_y[n];
a4a11197 PT	287	dest[y*rystride] = s;
1524f80b RS	288	}
	289	}
	290	}
	291	else if (axstride < aystride)
644cb69f FXC	292	{
	293	for (y = 0; y < ycount; y++)
	294	for (x = 0; x < xcount; x++)
	295	dest[xrxstride + yrystride] = (GFC_REAL_16)0;
	296
	297	for (y = 0; y < ycount; y++)
	298	for (n = 0; n < count; n++)
	299	for (x = 0; x < xcount; x++)
	300	/* dest[x,y] += a[x,n] * b[n,y] */
	301	dest[xrxstride + yrystride] += abase[xaxstride + naystride] * bbase[nbxstride + ybystride];
	302	}
f0e871d6 PT	303	else if (GFC_DESCRIPTOR_RANK (a) == 1)
	304	{
	305	const GFC_REAL_16 *restrict bbase_y;
	306	GFC_REAL_16 s;
	307
	308	for (y = 0; y < ycount; y++)
	309	{
	310	bbase_y = &bbase[y*bystride];
	311	s = (GFC_REAL_16) 0;
	312	for (n = 0; n < count; n++)
	313	s += abase[naxstride] bbase_y[n*bxstride];
	314	dest[y*rxstride] = s;
	315	}
	316	}
1524f80b RS	317	else
	318	{
	319	const GFC_REAL_16 *restrict abase_x;
	320	const GFC_REAL_16 *restrict bbase_y;
	321	GFC_REAL_16 *restrict dest_y;
	322	GFC_REAL_16 s;
	323
	324	for (y = 0; y < ycount; y++)
	325	{
	326	bbase_y = &bbase[y*bystride];
	327	dest_y = &dest[y*rystride];
	328	for (x = 0; x < xcount; x++)
	329	{
	330	abase_x = &abase[x*axstride];
	331	s = (GFC_REAL_16) 0;
	332	for (n = 0; n < count; n++)
	333	s += abase_x[naystride] bbase_y[n*bxstride];
	334	dest_y[x*rxstride] = s;
	335	}
	336	}
	337	}
644cb69f FXC	338	}
	339
	340	#endif