[thirdparty/gcc.git] / libgfortran / generated / matmul_i1.c

/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2016 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_INTEGER_1)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
                          const int *, const GFC_INTEGER_1 *, const int *,
                          const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
                          int, int);

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_i1 (gfc_array_i1 * const restrict retarray, 
	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
export_proto(matmul_i1);

void
matmul_i1 (gfc_array_i1 * const restrict retarray, 
	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_1 * restrict abase;
  const GFC_INTEGER_1 * restrict bbase;
  GFC_INTEGER_1 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
*/

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
	  GFC_DIMENSION_SET(retarray->dim[0], 0,
	                    GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
	                    GFC_DESCRIPTOR_EXTENT(b,1) - 1,
			    GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Incorrect extent in return array in"
			   " MATMUL intrinsic: is %ld, should be %ld",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Incorrect extent in return array in"
			   " MATMUL intrinsic: is %ld, should be %ld",
			   (long int) ret_extent, (long int) arg_extent);
	}
      else
	{
	  arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	  if (arg_extent != ret_extent)
	    runtime_error ("Incorrect extent in return array in"
			   " MATMUL intrinsic for dimension 1:"
			   " is %ld, should be %ld",
			   (long int) ret_extent, (long int) arg_extent);

	  arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	  ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
	  if (arg_extent != ret_extent)
	    runtime_error ("Incorrect extent in return array in"
			   " MATMUL intrinsic for dimension 2:"
			   " is %ld, should be %ld",
			   (long int) ret_extent, (long int) arg_extent);
	}
    }


  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
	 either as a row or a column matrix. We want both cases to
	 work. */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a a row matrix A[1,count]. */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1] */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
	 in case it is we want it to cause a segfault, rather than
	 an incorrect result. */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_INTEGER_1 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
		ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
	{
	  assert (gemm != NULL);
	  gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
		&n, &k,	&one, abase, &lda, bbase, &ldb, &zero, dest,
		&ldc, 1, 1);
	  return;
	}
    }

  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
               Department of Computing Science
               Umea University
               S-901 87 Umea, Sweden

	 from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_INTEGER_1 *a, *b;
      GFC_INTEGER_1 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
		 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_INTEGER_1 t1[65536], /* was [256][256] */
		 f11, f12, f21, f22, f31, f32, f41, f42,
		 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
	return;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
	for (i=1; i<=m; i++)
	  c[i + j * c_dim1] = (GFC_INTEGER_1)0;

      /* Start turning the crank. */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
	{
	  /* Computing MIN */
	  i2 = 512;
	  i3 = n - jj + 1;
	  jsec = min(i2,i3);
	  ujsec = jsec - jsec % 4;
	  i2 = k;
	  for (ll = 1; ll <= i2; ll += 256)
	    {
	      /* Computing MIN */
	      i3 = 256;
	      i4 = k - ll + 1;
	      lsec = min(i3,i4);
	      ulsec = lsec - lsec % 2;

	      i3 = m;
	      for (ii = 1; ii <= i3; ii += 256)
		{
		  /* Computing MIN */
		  i4 = 256;
		  i5 = m - ii + 1;
		  isec = min(i4,i5);
		  uisec = isec - isec % 2;
		  i4 = ll + ulsec - 1;
		  for (l = ll; l <= i4; l += 2)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 2)
			{
			  t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
					a[i + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
					a[i + (l + 1) * a_dim1];
			  t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
					a[i + 1 + l * a_dim1];
			  t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
					a[i + 1 + (l + 1) * a_dim1];
			}
		      if (uisec < isec)
			{
			  t1[l - ll + 1 + (isec << 8) - 257] =
				    a[ii + isec - 1 + l * a_dim1];
			  t1[l - ll + 2 + (isec << 8) - 257] =
				    a[ii + isec - 1 + (l + 1) * a_dim1];
			}
		    }
		  if (ulsec < lsec)
		    {
		      i4 = ii + isec - 1;
		      for (i = ii; i<= i4; ++i)
			{
			  t1[lsec + ((i - ii + 1) << 8) - 257] =
				    a[i + (ll + lsec - 1) * a_dim1];
			}
		    }

		  uisec = isec - isec % 4;
		  i4 = jj + ujsec - 1;
		  for (j = jj; j <= i4; j += 4)
		    {
		      i5 = ii + uisec - 1;
		      for (i = ii; i <= i5; i += 4)
			{
			  f11 = c[i + j * c_dim1];
			  f21 = c[i + 1 + j * c_dim1];
			  f12 = c[i + (j + 1) * c_dim1];
			  f22 = c[i + 1 + (j + 1) * c_dim1];
			  f13 = c[i + (j + 2) * c_dim1];
			  f23 = c[i + 1 + (j + 2) * c_dim1];
			  f14 = c[i + (j + 3) * c_dim1];
			  f24 = c[i + 1 + (j + 3) * c_dim1];
			  f31 = c[i + 2 + j * c_dim1];
			  f41 = c[i + 3 + j * c_dim1];
			  f32 = c[i + 2 + (j + 1) * c_dim1];
			  f42 = c[i + 3 + (j + 1) * c_dim1];
			  f33 = c[i + 2 + (j + 2) * c_dim1];
			  f43 = c[i + 3 + (j + 2) * c_dim1];
			  f34 = c[i + 2 + (j + 3) * c_dim1];
			  f44 = c[i + 3 + (j + 3) * c_dim1];
			  i6 = ll + lsec - 1;
			  for (l = ll; l <= i6; ++l)
			    {
			      f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + j * b_dim1];
			      f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + j * b_dim1];
			      f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + j * b_dim1];
			      f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + j * b_dim1];
			      f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 1) * b_dim1];
			      f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 2) * b_dim1];
			      f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			      f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
				      * b[l + (j + 3) * b_dim1];
			    }
			  c[i + j * c_dim1] = f11;
			  c[i + 1 + j * c_dim1] = f21;
			  c[i + (j + 1) * c_dim1] = f12;
			  c[i + 1 + (j + 1) * c_dim1] = f22;
			  c[i + (j + 2) * c_dim1] = f13;
			  c[i + 1 + (j + 2) * c_dim1] = f23;
			  c[i + (j + 3) * c_dim1] = f14;
			  c[i + 1 + (j + 3) * c_dim1] = f24;
			  c[i + 2 + j * c_dim1] = f31;
			  c[i + 3 + j * c_dim1] = f41;
			  c[i + 2 + (j + 1) * c_dim1] = f32;
			  c[i + 3 + (j + 1) * c_dim1] = f42;
			  c[i + 2 + (j + 2) * c_dim1] = f33;
			  c[i + 3 + (j + 2) * c_dim1] = f43;
			  c[i + 2 + (j + 3) * c_dim1] = f34;
			  c[i + 3 + (j + 3) * c_dim1] = f44;
			}
		      if (uisec < isec)
			{
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      f12 = c[i + (j + 1) * c_dim1];
			      f13 = c[i + (j + 2) * c_dim1];
			      f14 = c[i + (j + 3) * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 1) * b_dim1];
				  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 2) * b_dim1];
				  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + (j + 3) * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + (j + 1) * c_dim1] = f12;
			      c[i + (j + 2) * c_dim1] = f13;
			      c[i + (j + 3) * c_dim1] = f14;
			    }
			}
		    }
		  if (ujsec < jsec)
		    {
		      i4 = jj + jsec - 1;
		      for (j = jj + ujsec; j <= i4; ++j)
			{
			  i5 = ii + uisec - 1;
			  for (i = ii; i <= i5; i += 4)
			    {
			      f11 = c[i + j * c_dim1];
			      f21 = c[i + 1 + j * c_dim1];
			      f31 = c[i + 2 + j * c_dim1];
			      f41 = c[i + 3 + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
					  257] * b[l + j * b_dim1];
				  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
					  257] * b[l + j * b_dim1];
				  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			      c[i + 1 + j * c_dim1] = f21;
			      c[i + 2 + j * c_dim1] = f31;
			      c[i + 3 + j * c_dim1] = f41;
			    }
			  i5 = ii + isec - 1;
			  for (i = ii + uisec; i <= i5; ++i)
			    {
			      f11 = c[i + j * c_dim1];
			      i6 = ll + lsec - 1;
			      for (l = ll; l <= i6; ++l)
				{
				  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
					  257] * b[l + j * b_dim1];
				}
			      c[i + j * c_dim1] = f11;
			    }
			}
		    }
		}
	    }
	}
      return;
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
	{
	  const GFC_INTEGER_1 *restrict abase_x;
	  const GFC_INTEGER_1 *restrict bbase_y;
	  GFC_INTEGER_1 *restrict dest_y;
	  GFC_INTEGER_1 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      dest_y = &dest[y*rystride];
	      for (x = 0; x < xcount; x++)
		{
		  abase_x = &abase[x*axstride];
		  s = (GFC_INTEGER_1) 0;
		  for (n = 0; n < count; n++)
		    s += abase_x[n] * bbase_y[n];
		  dest_y[x] = s;
		}
	    }
	}
      else
	{
	  const GFC_INTEGER_1 *restrict bbase_y;
	  GFC_INTEGER_1 s;

	  for (y = 0; y < ycount; y++)
	    {
	      bbase_y = &bbase[y*bystride];
	      s = (GFC_INTEGER_1) 0;
	      for (n = 0; n < count; n++)
		s += abase[n*axstride] * bbase_y[n];
	      dest[y*rystride] = s;
	    }
	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
	for (x = 0; x < xcount; x++)
	  dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;

      for (y = 0; y < ycount; y++)
	for (n = 0; n < count; n++)
	  for (x = 0; x < xcount; x++)
	    /* dest[x,y] += a[x,n] * b[n,y] */
	    dest[x*rxstride + y*rystride] +=
					abase[x*axstride + n*aystride] *
					bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_INTEGER_1 *restrict bbase_y;
      GFC_INTEGER_1 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  s = (GFC_INTEGER_1) 0;
	  for (n = 0; n < count; n++)
	    s += abase[n*axstride] * bbase_y[n*bxstride];
	  dest[y*rxstride] = s;
	}
    }
  else
    {
      const GFC_INTEGER_1 *restrict abase_x;
      const GFC_INTEGER_1 *restrict bbase_y;
      GFC_INTEGER_1 *restrict dest_y;
      GFC_INTEGER_1 s;

      for (y = 0; y < ycount; y++)
	{
	  bbase_y = &bbase[y*bystride];
	  dest_y = &dest[y*rystride];
	  for (x = 0; x < xcount; x++)
	    {
	      abase_x = &abase[x*axstride];
	      s = (GFC_INTEGER_1) 0;
	      for (n = 0; n < count; n++)
		s += abase_x[n*aystride] * bbase_y[n*bxstride];
	      dest_y[x*rxstride] = s;
	    }
	}
    }
}
#endif
Commit	Line	Data
567c915b	1	/* Implementation of the MATMUL intrinsic
818ab71a	2	Copyright (C) 2002-2016 Free Software Foundation, Inc.
567c915b TK	3	Contributed by Paul Brook <paul@nowt.org>
567c915b TK	4
21d1335b	5	This file is part of the GNU Fortran runtime library (libgfortran).
567c915b TK	6
	7	Libgfortran is free software; you can redistribute it and/or
	8	modify it under the terms of the GNU General Public
	9	License as published by the Free Software Foundation; either
748086b7	10	version 3 of the License, or (at your option) any later version.
567c915b TK	11
	12	Libgfortran is distributed in the hope that it will be useful,
	13	but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	GNU General Public License for more details.
	16
748086b7 JJ	17	Under Section 7 of GPL version 3, you are granted additional
	18	permissions described in the GCC Runtime Library Exception, version
	19	3.1, as published by the Free Software Foundation.
	20
	21	You should have received a copy of the GNU General Public License and
	22	a copy of the GCC Runtime Library Exception along with this program;
	23	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	24	<http://www.gnu.org/licenses/>. */
567c915b	25
36ae8a61	26	#include "libgfortran.h"
567c915b TK	27	#include <stdlib.h>
	28	#include <string.h>
	29	#include <assert.h>
36ae8a61	30
567c915b TK	31
	32	#if defined (HAVE_GFC_INTEGER_1)
	33
	34	/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07	35	passed to us by the front-end, in which case we call it for large
567c915b TK	36	matrices. */
	37
	38	typedef void (blas_call)(const char , const char , const int , const int *,
	39	const int , const GFC_INTEGER_1 , const GFC_INTEGER_1 *,
	40	const int , const GFC_INTEGER_1 , const int *,
	41	const GFC_INTEGER_1 , GFC_INTEGER_1 , const int *,
	42	int, int);
	43
	44	/* The order of loops is different in the case of plain matrix
	45	multiplication C=MATMUL(A,B), and in the frequent special case where
	46	the argument A is the temporary result of a TRANSPOSE intrinsic:
	47	C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
	48	looking at their strides.
	49
	50	The equivalent Fortran pseudo-code is:
	51
	52	DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
	53	IF (.NOT.IS_TRANSPOSED(A)) THEN
	54	C = 0
	55	DO J=1,N
	56	DO K=1,COUNT
	57	DO I=1,M
	58	C(I,J) = C(I,J)+A(I,K)*B(K,J)
	59	ELSE
	60	DO J=1,N
	61	DO I=1,M
	62	S = 0
	63	DO K=1,COUNT
	64	S = S+A(I,K)*B(K,J)
	65	C(I,J) = S
	66	ENDIF
	67	*/
	68
	69	/* If try_blas is set to a nonzero value, then the matmul function will
	70	see if there is a way to perform the matrix multiplication by a call
	71	to the BLAS gemm function. */
	72
	73	extern void matmul_i1 (gfc_array_i1 * const restrict retarray,
	74	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
	75	int blas_limit, blas_call gemm);
	76	export_proto(matmul_i1);
	77
	78	void
	79	matmul_i1 (gfc_array_i1 * const restrict retarray,
	80	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
	81	int blas_limit, blas_call gemm)
	82	{
	83	const GFC_INTEGER_1 * restrict abase;
	84	const GFC_INTEGER_1 * restrict bbase;
	85	GFC_INTEGER_1 * restrict dest;
	86
	87	index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
	88	index_type x, y, n, count, xcount, ycount;
	89
	90	assert (GFC_DESCRIPTOR_RANK (a) == 2
	91	\|\| GFC_DESCRIPTOR_RANK (b) == 2);
	92
	93	/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
	94
	95	Either A or B (but not both) can be rank 1:
	96
	97	o One-dimensional argument A is implicitly treated as a row matrix
	98	dimensioned [1,count], so xcount=1.
	99
100	o One-dimensional argument B is implicitly treated as a column matrix
101	dimensioned [count, 1], so ycount=1.
5d70ab07	102	*/
567c915b	103
21d1335b	104	if (retarray->base_addr == NULL)
567c915b TK	105	{
	106	if (GFC_DESCRIPTOR_RANK (a) == 1)
	107	{
dfb55fdc TK	108	GFC_DIMENSION_SET(retarray->dim[0], 0,
dfb55fdc TK	109	GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
567c915b TK	110	}
	111	else if (GFC_DESCRIPTOR_RANK (b) == 1)
	112	{
dfb55fdc TK	113	GFC_DIMENSION_SET(retarray->dim[0], 0,
dfb55fdc TK	114	GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b TK	115	}
	116	else
	117	{
dfb55fdc TK	118	GFC_DIMENSION_SET(retarray->dim[0], 0,
dfb55fdc TK	119	GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b	120
dfb55fdc TK	121	GFC_DIMENSION_SET(retarray->dim[1], 0,
	122	GFC_DESCRIPTOR_EXTENT(b,1) - 1,
	123	GFC_DESCRIPTOR_EXTENT(retarray,0));
567c915b TK	124	}
567c915b TK	125
21d1335b	126	retarray->base_addr
92e6f3a4	127	= xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
567c915b TK	128	retarray->offset = 0;
567c915b TK	129	}
5d70ab07 JD	130	else if (unlikely (compile_options.bounds_check))
	131	{
	132	index_type ret_extent, arg_extent;
	133
	134	if (GFC_DESCRIPTOR_RANK (a) == 1)
	135	{
	136	arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	137	ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	138	if (arg_extent != ret_extent)
	139	runtime_error ("Incorrect extent in return array in"
	140	" MATMUL intrinsic: is %ld, should be %ld",
	141	(long int) ret_extent, (long int) arg_extent);
	142	}
	143	else if (GFC_DESCRIPTOR_RANK (b) == 1)
	144	{
	145	arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	146	ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	147	if (arg_extent != ret_extent)
	148	runtime_error ("Incorrect extent in return array in"
	149	" MATMUL intrinsic: is %ld, should be %ld",
	150	(long int) ret_extent, (long int) arg_extent);
	151	}
	152	else
	153	{
	154	arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
	155	ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
	156	if (arg_extent != ret_extent)
	157	runtime_error ("Incorrect extent in return array in"
	158	" MATMUL intrinsic for dimension 1:"
	159	" is %ld, should be %ld",
	160	(long int) ret_extent, (long int) arg_extent);
	161
	162	arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
	163	ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
	164	if (arg_extent != ret_extent)
	165	runtime_error ("Incorrect extent in return array in"
	166	" MATMUL intrinsic for dimension 2:"
	167	" is %ld, should be %ld",
	168	(long int) ret_extent, (long int) arg_extent);
	169	}
	170	}
567c915b TK	171
	172
	173	if (GFC_DESCRIPTOR_RANK (retarray) == 1)
	174	{
	175	/* One-dimensional result may be addressed in the code below
	176	either as a row or a column matrix. We want both cases to
	177	work. */
dfb55fdc	178	rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
567c915b TK	179	}
	180	else
	181	{
dfb55fdc TK	182	rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
dfb55fdc TK	183	rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
567c915b TK	184	}
	185
	186
	187	if (GFC_DESCRIPTOR_RANK (a) == 1)
	188	{
	189	/* Treat it as a a row matrix A[1,count]. */
dfb55fdc	190	axstride = GFC_DESCRIPTOR_STRIDE(a,0);
567c915b TK	191	aystride = 1;
	192
	193	xcount = 1;
dfb55fdc	194	count = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b TK	195	}
	196	else
	197	{
dfb55fdc TK	198	axstride = GFC_DESCRIPTOR_STRIDE(a,0);
dfb55fdc TK	199	aystride = GFC_DESCRIPTOR_STRIDE(a,1);
567c915b	200
dfb55fdc TK	201	count = GFC_DESCRIPTOR_EXTENT(a,1);
dfb55fdc TK	202	xcount = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b TK	203	}
567c915b TK	204
dfb55fdc	205	if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4	206	{
dfb55fdc	207	if (count > 0 \|\| GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4 TK	208	runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
7edc89d4 TK	209	}
567c915b TK	210
	211	if (GFC_DESCRIPTOR_RANK (b) == 1)
	212	{
	213	/* Treat it as a column matrix B[count,1] */
dfb55fdc	214	bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
567c915b TK	215
	216	/* bystride should never be used for 1-dimensional b.
	217	in case it is we want it to cause a segfault, rather than
	218	an incorrect result. */
	219	bystride = 0xDEADBEEF;
	220	ycount = 1;
	221	}
	222	else
	223	{
dfb55fdc TK	224	bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
	225	bystride = GFC_DESCRIPTOR_STRIDE(b,1);
	226	ycount = GFC_DESCRIPTOR_EXTENT(b,1);
567c915b TK	227	}
567c915b TK	228
21d1335b TB	229	abase = a->base_addr;
	230	bbase = b->base_addr;
	231	dest = retarray->base_addr;
567c915b	232
5d70ab07	233	/* Now that everything is set up, we perform the multiplication
567c915b TK	234	itself. */
	235
	236	#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07 JD	237	#define min(a,b) ((a) <= (b) ? (a) : (b))
5d70ab07 JD	238	#define max(a,b) ((a) >= (b) ? (a) : (b))
567c915b TK	239
	240	if (try_blas && rxstride == 1 && (axstride == 1 \|\| aystride == 1)
	241	&& (bxstride == 1 \|\| bystride == 1)
	242	&& (((float) xcount) * ((float) ycount) * ((float) count)
	243	> POW3(blas_limit)))
567c915b	244	{
5d70ab07 JD	245	const int m = xcount, n = ycount, k = count, ldc = rystride;
	246	const GFC_INTEGER_1 one = 1, zero = 0;
	247	const int lda = (axstride == 1) ? aystride : axstride,
	248	ldb = (bxstride == 1) ? bystride : bxstride;
567c915b	249
5d70ab07	250	if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
567c915b	251	{
5d70ab07 JD	252	assert (gemm != NULL);
	253	gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
	254	&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
	255	&ldc, 1, 1);
	256	return;
567c915b	257	}
5d70ab07	258	}
567c915b	259
5d70ab07 JD	260	if (rxstride == 1 && axstride == 1 && bxstride == 1)
	261	{
	262	/* This block of code implements a tuned matmul, derived from
	263	Superscalar GEMM-based level 3 BLAS, Beta version 0.1
	264
	265	Bo Kagstrom and Per Ling
	266	Department of Computing Science
	267	Umea University
	268	S-901 87 Umea, Sweden
	269
	270	from netlib.org, translated to C, and modified for matmul.m4. */
	271
	272	const GFC_INTEGER_1 a, b;
	273	GFC_INTEGER_1 *c;
	274	const index_type m = xcount, n = ycount, k = count;
	275
	276	/* System generated locals */
	277	index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
	278	i1, i2, i3, i4, i5, i6;
	279
	280	/* Local variables */
	281	GFC_INTEGER_1 t1[65536], /* was [256][256] */
	282	f11, f12, f21, f22, f31, f32, f41, f42,
	283	f13, f14, f23, f24, f33, f34, f43, f44;
	284	index_type i, j, l, ii, jj, ll;
	285	index_type isec, jsec, lsec, uisec, ujsec, ulsec;
	286
	287	a = abase;
	288	b = bbase;
	289	c = retarray->base_addr;
	290
	291	/* Parameter adjustments */
	292	c_dim1 = rystride;
	293	c_offset = 1 + c_dim1;
	294	c -= c_offset;
	295	a_dim1 = aystride;
	296	a_offset = 1 + a_dim1;
	297	a -= a_offset;
	298	b_dim1 = bystride;
	299	b_offset = 1 + b_dim1;
	300	b -= b_offset;
	301
	302	/* Early exit if possible */
	303	if (m == 0 \|\| n == 0 \|\| k == 0)
	304	return;
	305
	306	/* Empty c first. */
	307	for (j=1; j<=n; j++)
	308	for (i=1; i<=m; i++)
	309	c[i + j * c_dim1] = (GFC_INTEGER_1)0;
	310
	311	/* Start turning the crank. */
	312	i1 = n;
	313	for (jj = 1; jj <= i1; jj += 512)
567c915b	314	{
5d70ab07 JD	315	/* Computing MIN */
	316	i2 = 512;
	317	i3 = n - jj + 1;
	318	jsec = min(i2,i3);
	319	ujsec = jsec - jsec % 4;
	320	i2 = k;
	321	for (ll = 1; ll <= i2; ll += 256)
567c915b	322	{
5d70ab07 JD	323	/* Computing MIN */
	324	i3 = 256;
	325	i4 = k - ll + 1;
	326	lsec = min(i3,i4);
	327	ulsec = lsec - lsec % 2;
	328
	329	i3 = m;
	330	for (ii = 1; ii <= i3; ii += 256)
567c915b	331	{
5d70ab07 JD	332	/* Computing MIN */
	333	i4 = 256;
	334	i5 = m - ii + 1;
	335	isec = min(i4,i5);
	336	uisec = isec - isec % 2;
	337	i4 = ll + ulsec - 1;
	338	for (l = ll; l <= i4; l += 2)
	339	{
	340	i5 = ii + uisec - 1;
	341	for (i = ii; i <= i5; i += 2)
	342	{
	343	t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
	344	a[i + l * a_dim1];
	345	t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
	346	a[i + (l + 1) * a_dim1];
	347	t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
	348	a[i + 1 + l * a_dim1];
	349	t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
	350	a[i + 1 + (l + 1) * a_dim1];
	351	}
	352	if (uisec < isec)
	353	{
	354	t1[l - ll + 1 + (isec << 8) - 257] =
	355	a[ii + isec - 1 + l * a_dim1];
	356	t1[l - ll + 2 + (isec << 8) - 257] =
	357	a[ii + isec - 1 + (l + 1) * a_dim1];
	358	}
	359	}
	360	if (ulsec < lsec)
	361	{
	362	i4 = ii + isec - 1;
	363	for (i = ii; i<= i4; ++i)
	364	{
	365	t1[lsec + ((i - ii + 1) << 8) - 257] =
	366	a[i + (ll + lsec - 1) * a_dim1];
	367	}
	368	}
	369
	370	uisec = isec - isec % 4;
	371	i4 = jj + ujsec - 1;
	372	for (j = jj; j <= i4; j += 4)
	373	{
	374	i5 = ii + uisec - 1;
	375	for (i = ii; i <= i5; i += 4)
	376	{
	377	f11 = c[i + j * c_dim1];
	378	f21 = c[i + 1 + j * c_dim1];
	379	f12 = c[i + (j + 1) * c_dim1];
	380	f22 = c[i + 1 + (j + 1) * c_dim1];
	381	f13 = c[i + (j + 2) * c_dim1];
	382	f23 = c[i + 1 + (j + 2) * c_dim1];
	383	f14 = c[i + (j + 3) * c_dim1];
	384	f24 = c[i + 1 + (j + 3) * c_dim1];
	385	f31 = c[i + 2 + j * c_dim1];
	386	f41 = c[i + 3 + j * c_dim1];
	387	f32 = c[i + 2 + (j + 1) * c_dim1];
	388	f42 = c[i + 3 + (j + 1) * c_dim1];
	389	f33 = c[i + 2 + (j + 2) * c_dim1];
	390	f43 = c[i + 3 + (j + 2) * c_dim1];
	391	f34 = c[i + 2 + (j + 3) * c_dim1];
	392	f44 = c[i + 3 + (j + 3) * c_dim1];
	393	i6 = ll + lsec - 1;
	394	for (l = ll; l <= i6; ++l)
	395	{
396	f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
397	* b[l + j * b_dim1];
398	f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
399	* b[l + j * b_dim1];
400	f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
401	* b[l + (j + 1) * b_dim1];
402	f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
403	* b[l + (j + 1) * b_dim1];
404	f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405	* b[l + (j + 2) * b_dim1];
406	f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407	* b[l + (j + 2) * b_dim1];
408	f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409	* b[l + (j + 3) * b_dim1];
410	f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411	* b[l + (j + 3) * b_dim1];
412	f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
413	* b[l + j * b_dim1];
414	f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
415	* b[l + j * b_dim1];
416	f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
417	* b[l + (j + 1) * b_dim1];
418	f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
419	* b[l + (j + 1) * b_dim1];
420	f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421	* b[l + (j + 2) * b_dim1];
422	f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423	* b[l + (j + 2) * b_dim1];
424	f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425	* b[l + (j + 3) * b_dim1];
426	f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427	* b[l + (j + 3) * b_dim1];
428	}
429	c[i + j * c_dim1] = f11;
430	c[i + 1 + j * c_dim1] = f21;
431	c[i + (j + 1) * c_dim1] = f12;
432	c[i + 1 + (j + 1) * c_dim1] = f22;
433	c[i + (j + 2) * c_dim1] = f13;
434	c[i + 1 + (j + 2) * c_dim1] = f23;
435	c[i + (j + 3) * c_dim1] = f14;
436	c[i + 1 + (j + 3) * c_dim1] = f24;
437	c[i + 2 + j * c_dim1] = f31;
438	c[i + 3 + j * c_dim1] = f41;
439	c[i + 2 + (j + 1) * c_dim1] = f32;
440	c[i + 3 + (j + 1) * c_dim1] = f42;
441	c[i + 2 + (j + 2) * c_dim1] = f33;
442	c[i + 3 + (j + 2) * c_dim1] = f43;
443	c[i + 2 + (j + 3) * c_dim1] = f34;
444	c[i + 3 + (j + 3) * c_dim1] = f44;
445	}
446	if (uisec < isec)
447	{
448	i5 = ii + isec - 1;
449	for (i = ii + uisec; i <= i5; ++i)
450	{
451	f11 = c[i + j * c_dim1];
452	f12 = c[i + (j + 1) * c_dim1];
453	f13 = c[i + (j + 2) * c_dim1];
454	f14 = c[i + (j + 3) * c_dim1];
455	i6 = ll + lsec - 1;
456	for (l = ll; l <= i6; ++l)
457	{
458	f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459	257] * b[l + j * b_dim1];
460	f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461	257] * b[l + (j + 1) * b_dim1];
462	f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463	257] * b[l + (j + 2) * b_dim1];
464	f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
465	257] * b[l + (j + 3) * b_dim1];
466	}
467	c[i + j * c_dim1] = f11;
468	c[i + (j + 1) * c_dim1] = f12;
469	c[i + (j + 2) * c_dim1] = f13;
470	c[i + (j + 3) * c_dim1] = f14;
471	}
472	}
473	}
474	if (ujsec < jsec)
475	{
476	i4 = jj + jsec - 1;
477	for (j = jj + ujsec; j <= i4; ++j)
478	{
479	i5 = ii + uisec - 1;
480	for (i = ii; i <= i5; i += 4)
481	{
482	f11 = c[i + j * c_dim1];
483	f21 = c[i + 1 + j * c_dim1];
484	f31 = c[i + 2 + j * c_dim1];
485	f41 = c[i + 3 + j * c_dim1];
486	i6 = ll + lsec - 1;
487	for (l = ll; l <= i6; ++l)
488	{
489	f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
490	257] * b[l + j * b_dim1];
491	f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
492	257] * b[l + j * b_dim1];
493	f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
494	257] * b[l + j * b_dim1];
495	f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
496	257] * b[l + j * b_dim1];
497	}
498	c[i + j * c_dim1] = f11;
499	c[i + 1 + j * c_dim1] = f21;
500	c[i + 2 + j * c_dim1] = f31;
501	c[i + 3 + j * c_dim1] = f41;
502	}
503	i5 = ii + isec - 1;
504	for (i = ii + uisec; i <= i5; ++i)
505	{
506	f11 = c[i + j * c_dim1];
507	i6 = ll + lsec - 1;
508	for (l = ll; l <= i6; ++l)
509	{
510	f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511	257] * b[l + j * b_dim1];
512	}
513	c[i + j * c_dim1] = f11;
514	}
515	}
516	}
567c915b TK	517	}
	518	}
	519	}
5d70ab07	520	return;
567c915b TK	521	}
	522	else if (rxstride == 1 && aystride == 1 && bxstride == 1)
	523	{
	524	if (GFC_DESCRIPTOR_RANK (a) != 1)
	525	{
	526	const GFC_INTEGER_1 *restrict abase_x;
	527	const GFC_INTEGER_1 *restrict bbase_y;
	528	GFC_INTEGER_1 *restrict dest_y;
	529	GFC_INTEGER_1 s;
	530
	531	for (y = 0; y < ycount; y++)
	532	{
	533	bbase_y = &bbase[y*bystride];
	534	dest_y = &dest[y*rystride];
	535	for (x = 0; x < xcount; x++)
	536	{
	537	abase_x = &abase[x*axstride];
	538	s = (GFC_INTEGER_1) 0;
	539	for (n = 0; n < count; n++)
	540	s += abase_x[n] * bbase_y[n];
	541	dest_y[x] = s;
	542	}
	543	}
	544	}
	545	else
	546	{
	547	const GFC_INTEGER_1 *restrict bbase_y;
	548	GFC_INTEGER_1 s;
	549
	550	for (y = 0; y < ycount; y++)
	551	{
	552	bbase_y = &bbase[y*bystride];
	553	s = (GFC_INTEGER_1) 0;
	554	for (n = 0; n < count; n++)
	555	s += abase[naxstride] bbase_y[n];
	556	dest[y*rystride] = s;
	557	}
	558	}
	559	}
	560	else if (axstride < aystride)
	561	{
	562	for (y = 0; y < ycount; y++)
	563	for (x = 0; x < xcount; x++)
	564	dest[xrxstride + yrystride] = (GFC_INTEGER_1)0;
	565
	566	for (y = 0; y < ycount; y++)
	567	for (n = 0; n < count; n++)
	568	for (x = 0; x < xcount; x++)
	569	/* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07 JD	570	dest[xrxstride + yrystride] +=
	571	abase[xaxstride + naystride] *
	572	bbase[nbxstride + ybystride];
567c915b TK	573	}
	574	else if (GFC_DESCRIPTOR_RANK (a) == 1)
	575	{
	576	const GFC_INTEGER_1 *restrict bbase_y;
	577	GFC_INTEGER_1 s;
	578
	579	for (y = 0; y < ycount; y++)
	580	{
	581	bbase_y = &bbase[y*bystride];
	582	s = (GFC_INTEGER_1) 0;
	583	for (n = 0; n < count; n++)
	584	s += abase[naxstride] bbase_y[n*bxstride];
	585	dest[y*rxstride] = s;
	586	}
	587	}
	588	else
	589	{
	590	const GFC_INTEGER_1 *restrict abase_x;
	591	const GFC_INTEGER_1 *restrict bbase_y;
	592	GFC_INTEGER_1 *restrict dest_y;
	593	GFC_INTEGER_1 s;
	594
	595	for (y = 0; y < ycount; y++)
	596	{
	597	bbase_y = &bbase[y*bystride];
	598	dest_y = &dest[y*rystride];
	599	for (x = 0; x < xcount; x++)
	600	{
	601	abase_x = &abase[x*axstride];
	602	s = (GFC_INTEGER_1) 0;
	603	for (n = 0; n < count; n++)
	604	s += abase_x[naystride] bbase_y[n*bxstride];
	605	dest_y[x*rxstride] = s;
	606	}
	607	}
	608	}
	609	}
567c915b	610	#endif