libgfortran/generated/matmul_r4.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran 95 runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 2 of the License, or (at your option) any later version.
  11
  12 In addition to the permissions in the GNU General Public License, the
  13 Free Software Foundation gives you unlimited permission to link the
  14 compiled version of this file into combinations with other programs,
  15 and to distribute those combinations without any restriction coming
  16 from the use of this file.  (The General Public License restrictions
  17 do apply in other respects; for example, they cover modification of
  18 the file, and distribution when not linked into a combine
  19 executable.)
  20
  21 Libgfortran is distributed in the hope that it will be useful,
  22 but WITHOUT ANY WARRANTY; without even the implied warranty of
  23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  24 GNU General Public License for more details.
  25
  26 You should have received a copy of the GNU General Public
  27 License along with libgfortran; see the file COPYING.  If not,
  28 write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  29 Boston, MA 02110-1301, USA.  */
  30
  31 #include "config.h"
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <assert.h>
  35 #include "libgfortran.h"
  36
  37 #if defined (HAVE_GFC_REAL_4)
  38
  39 /* The order of loops is different in the case of plain matrix
  40    multiplication C=MATMUL(A,B), and in the frequent special case where
  41    the argument A is the temporary result of a TRANSPOSE intrinsic:
  42    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  43    looking at their strides.
  44
  45    The equivalent Fortran pseudo-code is:
  46
  47    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  48    IF (.NOT.IS_TRANSPOSED(A)) THEN
  49      C = 0
  50      DO J=1,N
  51        DO K=1,COUNT
  52          DO I=1,M
  53            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  54    ELSE
  55      DO J=1,N
  56        DO I=1,M
  57          S = 0
  58          DO K=1,COUNT
  59            S = S+A(I,K)+B(K,J)
  60          C(I,J) = S
  61    ENDIF
  62 */
  63
  64 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
  65         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b);
  66 export_proto(matmul_r4);
  67
  68 void
  69 matmul_r4 (gfc_array_r4 * const restrict retarray,
  70         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b)
  71 {
  72   const GFC_REAL_4 * restrict abase;
  73   const GFC_REAL_4 * restrict bbase;
  74   GFC_REAL_4 * restrict dest;
  75
  76   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  77   index_type x, y, n, count, xcount, ycount;
  78
  79   assert (GFC_DESCRIPTOR_RANK (a) == 2
  80           || GFC_DESCRIPTOR_RANK (b) == 2);
  81
  82 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
  83
  84    Either A or B (but not both) can be rank 1:
  85
  86    o One-dimensional argument A is implicitly treated as a row matrix
  87      dimensioned [1,count], so xcount=1.
  88
  89    o One-dimensional argument B is implicitly treated as a column matrix
  90      dimensioned [count, 1], so ycount=1.
  91   */
  92
  93   if (retarray->data == NULL)
  94     {
  95       if (GFC_DESCRIPTOR_RANK (a) == 1)
  96         {
  97           retarray->dim[0].lbound = 0;
  98           retarray->dim[0].ubound = b->dim[1].ubound - b->dim[1].lbound;
  99           retarray->dim[0].stride = 1;
 100         }
 101       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 102         {
 103           retarray->dim[0].lbound = 0;
 104           retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
 105           retarray->dim[0].stride = 1;
 106         }
 107       else
 108         {
 109           retarray->dim[0].lbound = 0;
 110           retarray->dim[0].ubound = a->dim[0].ubound - a->dim[0].lbound;
 111           retarray->dim[0].stride = 1;
 112
 113           retarray->dim[1].lbound = 0;
 114           retarray->dim[1].ubound = b->dim[1].ubound - b->dim[1].lbound;
 115           retarray->dim[1].stride = retarray->dim[0].ubound+1;
 116         }
 117
 118       retarray->data
 119         = internal_malloc_size (sizeof (GFC_REAL_4) * size0 ((array_t *) retarray));
 120       retarray->offset = 0;
 121     }
 122
 123
 124   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 125     {
 126       /* One-dimensional result may be addressed in the code below
 127          either as a row or a column matrix. We want both cases to
 128          work. */
 129       rxstride = rystride = retarray->dim[0].stride;
 130     }
 131   else
 132     {
 133       rxstride = retarray->dim[0].stride;
 134       rystride = retarray->dim[1].stride;
 135     }
 136
 137
 138   if (GFC_DESCRIPTOR_RANK (a) == 1)
 139     {
 140       /* Treat it as a a row matrix A[1,count]. */
 141       axstride = a->dim[0].stride;
 142       aystride = 1;
 143
 144       xcount = 1;
 145       count = a->dim[0].ubound + 1 - a->dim[0].lbound;
 146     }
 147   else
 148     {
 149       axstride = a->dim[0].stride;
 150       aystride = a->dim[1].stride;
 151
 152       count = a->dim[1].ubound + 1 - a->dim[1].lbound;
 153       xcount = a->dim[0].ubound + 1 - a->dim[0].lbound;
 154     }
 155
 156   assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound);
 157
 158   if (GFC_DESCRIPTOR_RANK (b) == 1)
 159     {
 160       /* Treat it as a column matrix B[count,1] */
 161       bxstride = b->dim[0].stride;
 162
 163       /* bystride should never be used for 1-dimensional b.
 164          in case it is we want it to cause a segfault, rather than
 165          an incorrect result. */
 166       bystride = 0xDEADBEEF;
 167       ycount = 1;
 168     }
 169   else
 170     {
 171       bxstride = b->dim[0].stride;
 172       bystride = b->dim[1].stride;
 173       ycount = b->dim[1].ubound + 1 - b->dim[1].lbound;
 174     }
 175
 176   abase = a->data;
 177   bbase = b->data;
 178   dest = retarray->data;
 179
 180   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 181     {
 182       const GFC_REAL_4 * restrict bbase_y;
 183       GFC_REAL_4 * restrict dest_y;
 184       const GFC_REAL_4 * restrict abase_n;
 185       GFC_REAL_4 bbase_yn;
 186
 187       if (rystride == xcount)
 188         memset (dest, 0, (sizeof (GFC_REAL_4) * xcount * ycount));
 189       else
 190         {
 191           for (y = 0; y < ycount; y++)
 192             for (x = 0; x < xcount; x++)
 193               dest[x + y*rystride] = (GFC_REAL_4)0;
 194         }
 195
 196       for (y = 0; y < ycount; y++)
 197         {
 198           bbase_y = bbase + y*bystride;
 199           dest_y = dest + y*rystride;
 200           for (n = 0; n < count; n++)
 201             {
 202               abase_n = abase + n*aystride;
 203               bbase_yn = bbase_y[n];
 204               for (x = 0; x < xcount; x++)
 205                 {
 206                   dest_y[x] += abase_n[x] * bbase_yn;
 207                 }
 208             }
 209         }
 210     }
 211   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 212     {
 213       if (GFC_DESCRIPTOR_RANK (a) != 1)
 214         {
 215           const GFC_REAL_4 *restrict abase_x;
 216           const GFC_REAL_4 *restrict bbase_y;
 217           GFC_REAL_4 *restrict dest_y;
 218           GFC_REAL_4 s;
 219
 220           for (y = 0; y < ycount; y++)
 221             {
 222               bbase_y = &bbase[y*bystride];
 223               dest_y = &dest[y*rystride];
 224               for (x = 0; x < xcount; x++)
 225                 {
 226                   abase_x = &abase[x*axstride];
 227                   s = (GFC_REAL_4) 0;
 228                   for (n = 0; n < count; n++)
 229                     s += abase_x[n] * bbase_y[n];
 230                   dest_y[x] = s;
 231                 }
 232             }
 233         }
 234       else
 235         {
 236           const GFC_REAL_4 *restrict bbase_y;
 237           GFC_REAL_4 s;
 238
 239           for (y = 0; y < ycount; y++)
 240             {
 241               bbase_y = &bbase[y*bystride];
 242               s = (GFC_REAL_4) 0;
 243               for (n = 0; n < count; n++)
 244                 s += abase[n*axstride] * bbase_y[n];
 245               dest[y*rystride] = s;
 246             }
 247         }
 248     }
 249   else if (axstride < aystride)
 250     {
 251       for (y = 0; y < ycount; y++)
 252         for (x = 0; x < xcount; x++)
 253           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
 254
 255       for (y = 0; y < ycount; y++)
 256         for (n = 0; n < count; n++)
 257           for (x = 0; x < xcount; x++)
 258             /* dest[x,y] += a[x,n] * b[n,y] */
 259             dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
 260     }
 261   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 262     {
 263       const GFC_REAL_4 *restrict bbase_y;
 264       GFC_REAL_4 s;
 265
 266       for (y = 0; y < ycount; y++)
 267         {
 268           bbase_y = &bbase[y*bystride];
 269           s = (GFC_REAL_4) 0;
 270           for (n = 0; n < count; n++)
 271             s += abase[n*axstride] * bbase_y[n*bxstride];
 272           dest[y*rxstride] = s;
 273         }
 274     }
 275   else
 276     {
 277       const GFC_REAL_4 *restrict abase_x;
 278       const GFC_REAL_4 *restrict bbase_y;
 279       GFC_REAL_4 *restrict dest_y;
 280       GFC_REAL_4 s;
 281
 282       for (y = 0; y < ycount; y++)
 283         {
 284           bbase_y = &bbase[y*bystride];
 285           dest_y = &dest[y*rystride];
 286           for (x = 0; x < xcount; x++)
 287             {
 288               abase_x = &abase[x*axstride];
 289               s = (GFC_REAL_4) 0;
 290               for (n = 0; n < count; n++)
 291                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 292               dest_y[x*rxstride] = s;
 293             }
 294         }
 295     }
 296 }
 297
 298 #endif