libgfortran/generated/matmul_r8.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_REAL_8)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
  39                           const int *, const GFC_REAL_8 *, const int *,
  40                           const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
  73         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_r8);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
  84         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
  88         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_REAL_8 * restrict abase;
  92   const GFC_REAL_8 * restrict bbase;
  93   GFC_REAL_8 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Array bound mismatch for dimension 1 of "
 148                            "array (%ld/%ld) ",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Array bound mismatch for dimension 1 of "
 157                            "array (%ld/%ld) ",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Array bound mismatch for dimension 1 of "
 166                            "array (%ld/%ld) ",
 167                            (long int) ret_extent, (long int) arg_extent);
 168
 169           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 170           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 171           if (arg_extent != ret_extent)
 172             runtime_error ("Array bound mismatch for dimension 2 of "
 173                            "array (%ld/%ld) ",
 174                            (long int) ret_extent, (long int) arg_extent);
 175         }
 176     }
 177
 178
 179   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 180     {
 181       /* One-dimensional result may be addressed in the code below
 182          either as a row or a column matrix. We want both cases to
 183          work. */
 184       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 185     }
 186   else
 187     {
 188       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 189       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 190     }
 191
 192
 193   if (GFC_DESCRIPTOR_RANK (a) == 1)
 194     {
 195       /* Treat it as a a row matrix A[1,count]. */
 196       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 197       aystride = 1;
 198
 199       xcount = 1;
 200       count = GFC_DESCRIPTOR_EXTENT(a,0);
 201     }
 202   else
 203     {
 204       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 205       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 206
 207       count = GFC_DESCRIPTOR_EXTENT(a,1);
 208       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 209     }
 210
 211   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 212     {
 213       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 214         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
 215                        "in dimension 1: is %ld, should be %ld",
 216                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          The value is only used for calculation of the
 226          memory by the buffer.  */
 227       bystride = 256;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_REAL_8 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           const char *transa, *transb;
 262           if (try_blas & 2)
 263             transa = "C";
 264           else
 265             transa = axstride == 1 ? "N" : "T";
 266
 267           if (try_blas & 4)
 268             transb = "C";
 269           else
 270             transb = bxstride == 1 ? "N" : "T";
 271
 272           gemm (transa, transb , &m,
 273                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 274                 &ldc, 1, 1);
 275           return;
 276         }
 277     }
 278
 279   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 280     {
 281       /* This block of code implements a tuned matmul, derived from
 282          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 283
 284                Bo Kagstrom and Per Ling
 285                Department of Computing Science
 286                Umea University
 287                S-901 87 Umea, Sweden
 288
 289          from netlib.org, translated to C, and modified for matmul.m4.  */
 290
 291       const GFC_REAL_8 *a, *b;
 292       GFC_REAL_8 *c;
 293       const index_type m = xcount, n = ycount, k = count;
 294
 295       /* System generated locals */
 296       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 297                  i1, i2, i3, i4, i5, i6;
 298
 299       /* Local variables */
 300       GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 301                  f13, f14, f23, f24, f33, f34, f43, f44;
 302       index_type i, j, l, ii, jj, ll;
 303       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 304       GFC_REAL_8 *t1;
 305
 306       a = abase;
 307       b = bbase;
 308       c = retarray->base_addr;
 309
 310       /* Parameter adjustments */
 311       c_dim1 = rystride;
 312       c_offset = 1 + c_dim1;
 313       c -= c_offset;
 314       a_dim1 = aystride;
 315       a_offset = 1 + a_dim1;
 316       a -= a_offset;
 317       b_dim1 = bystride;
 318       b_offset = 1 + b_dim1;
 319       b -= b_offset;
 320
 321       /* Empty c first.  */
 322       for (j=1; j<=n; j++)
 323         for (i=1; i<=m; i++)
 324           c[i + j * c_dim1] = (GFC_REAL_8)0;
 325
 326       /* Early exit if possible */
 327       if (m == 0 || n == 0 || k == 0)
 328         return;
 329
 330       /* Adjust size of t1 to what is needed.  */
 331       index_type t1_dim, a_sz;
 332       if (aystride == 1)
 333         a_sz = rystride;
 334       else
 335         a_sz = a_dim1;
 336
 337       t1_dim = a_sz * 256 + b_dim1;
 338       if (t1_dim > 65536)
 339         t1_dim = 65536;
 340
 341       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 342
 343       /* Start turning the crank. */
 344       i1 = n;
 345       for (jj = 1; jj <= i1; jj += 512)
 346         {
 347           /* Computing MIN */
 348           i2 = 512;
 349           i3 = n - jj + 1;
 350           jsec = min(i2,i3);
 351           ujsec = jsec - jsec % 4;
 352           i2 = k;
 353           for (ll = 1; ll <= i2; ll += 256)
 354             {
 355               /* Computing MIN */
 356               i3 = 256;
 357               i4 = k - ll + 1;
 358               lsec = min(i3,i4);
 359               ulsec = lsec - lsec % 2;
 360
 361               i3 = m;
 362               for (ii = 1; ii <= i3; ii += 256)
 363                 {
 364                   /* Computing MIN */
 365                   i4 = 256;
 366                   i5 = m - ii + 1;
 367                   isec = min(i4,i5);
 368                   uisec = isec - isec % 2;
 369                   i4 = ll + ulsec - 1;
 370                   for (l = ll; l <= i4; l += 2)
 371                     {
 372                       i5 = ii + uisec - 1;
 373                       for (i = ii; i <= i5; i += 2)
 374                         {
 375                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 376                                         a[i + l * a_dim1];
 377                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 378                                         a[i + (l + 1) * a_dim1];
 379                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 380                                         a[i + 1 + l * a_dim1];
 381                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 382                                         a[i + 1 + (l + 1) * a_dim1];
 383                         }
 384                       if (uisec < isec)
 385                         {
 386                           t1[l - ll + 1 + (isec << 8) - 257] =
 387                                     a[ii + isec - 1 + l * a_dim1];
 388                           t1[l - ll + 2 + (isec << 8) - 257] =
 389                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 390                         }
 391                     }
 392                   if (ulsec < lsec)
 393                     {
 394                       i4 = ii + isec - 1;
 395                       for (i = ii; i<= i4; ++i)
 396                         {
 397                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 398                                     a[i + (ll + lsec - 1) * a_dim1];
 399                         }
 400                     }
 401
 402                   uisec = isec - isec % 4;
 403                   i4 = jj + ujsec - 1;
 404                   for (j = jj; j <= i4; j += 4)
 405                     {
 406                       i5 = ii + uisec - 1;
 407                       for (i = ii; i <= i5; i += 4)
 408                         {
 409                           f11 = c[i + j * c_dim1];
 410                           f21 = c[i + 1 + j * c_dim1];
 411                           f12 = c[i + (j + 1) * c_dim1];
 412                           f22 = c[i + 1 + (j + 1) * c_dim1];
 413                           f13 = c[i + (j + 2) * c_dim1];
 414                           f23 = c[i + 1 + (j + 2) * c_dim1];
 415                           f14 = c[i + (j + 3) * c_dim1];
 416                           f24 = c[i + 1 + (j + 3) * c_dim1];
 417                           f31 = c[i + 2 + j * c_dim1];
 418                           f41 = c[i + 3 + j * c_dim1];
 419                           f32 = c[i + 2 + (j + 1) * c_dim1];
 420                           f42 = c[i + 3 + (j + 1) * c_dim1];
 421                           f33 = c[i + 2 + (j + 2) * c_dim1];
 422                           f43 = c[i + 3 + (j + 2) * c_dim1];
 423                           f34 = c[i + 2 + (j + 3) * c_dim1];
 424                           f44 = c[i + 3 + (j + 3) * c_dim1];
 425                           i6 = ll + lsec - 1;
 426                           for (l = ll; l <= i6; ++l)
 427                             {
 428                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 429                                       * b[l + j * b_dim1];
 430                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 431                                       * b[l + j * b_dim1];
 432                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 433                                       * b[l + (j + 1) * b_dim1];
 434                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 435                                       * b[l + (j + 1) * b_dim1];
 436                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 437                                       * b[l + (j + 2) * b_dim1];
 438                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 439                                       * b[l + (j + 2) * b_dim1];
 440                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 441                                       * b[l + (j + 3) * b_dim1];
 442                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 443                                       * b[l + (j + 3) * b_dim1];
 444                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 445                                       * b[l + j * b_dim1];
 446                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 447                                       * b[l + j * b_dim1];
 448                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 449                                       * b[l + (j + 1) * b_dim1];
 450                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 451                                       * b[l + (j + 1) * b_dim1];
 452                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 453                                       * b[l + (j + 2) * b_dim1];
 454                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 455                                       * b[l + (j + 2) * b_dim1];
 456                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 457                                       * b[l + (j + 3) * b_dim1];
 458                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 459                                       * b[l + (j + 3) * b_dim1];
 460                             }
 461                           c[i + j * c_dim1] = f11;
 462                           c[i + 1 + j * c_dim1] = f21;
 463                           c[i + (j + 1) * c_dim1] = f12;
 464                           c[i + 1 + (j + 1) * c_dim1] = f22;
 465                           c[i + (j + 2) * c_dim1] = f13;
 466                           c[i + 1 + (j + 2) * c_dim1] = f23;
 467                           c[i + (j + 3) * c_dim1] = f14;
 468                           c[i + 1 + (j + 3) * c_dim1] = f24;
 469                           c[i + 2 + j * c_dim1] = f31;
 470                           c[i + 3 + j * c_dim1] = f41;
 471                           c[i + 2 + (j + 1) * c_dim1] = f32;
 472                           c[i + 3 + (j + 1) * c_dim1] = f42;
 473                           c[i + 2 + (j + 2) * c_dim1] = f33;
 474                           c[i + 3 + (j + 2) * c_dim1] = f43;
 475                           c[i + 2 + (j + 3) * c_dim1] = f34;
 476                           c[i + 3 + (j + 3) * c_dim1] = f44;
 477                         }
 478                       if (uisec < isec)
 479                         {
 480                           i5 = ii + isec - 1;
 481                           for (i = ii + uisec; i <= i5; ++i)
 482                             {
 483                               f11 = c[i + j * c_dim1];
 484                               f12 = c[i + (j + 1) * c_dim1];
 485                               f13 = c[i + (j + 2) * c_dim1];
 486                               f14 = c[i + (j + 3) * c_dim1];
 487                               i6 = ll + lsec - 1;
 488                               for (l = ll; l <= i6; ++l)
 489                                 {
 490                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 491                                           257] * b[l + j * b_dim1];
 492                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 493                                           257] * b[l + (j + 1) * b_dim1];
 494                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 495                                           257] * b[l + (j + 2) * b_dim1];
 496                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 497                                           257] * b[l + (j + 3) * b_dim1];
 498                                 }
 499                               c[i + j * c_dim1] = f11;
 500                               c[i + (j + 1) * c_dim1] = f12;
 501                               c[i + (j + 2) * c_dim1] = f13;
 502                               c[i + (j + 3) * c_dim1] = f14;
 503                             }
 504                         }
 505                     }
 506                   if (ujsec < jsec)
 507                     {
 508                       i4 = jj + jsec - 1;
 509                       for (j = jj + ujsec; j <= i4; ++j)
 510                         {
 511                           i5 = ii + uisec - 1;
 512                           for (i = ii; i <= i5; i += 4)
 513                             {
 514                               f11 = c[i + j * c_dim1];
 515                               f21 = c[i + 1 + j * c_dim1];
 516                               f31 = c[i + 2 + j * c_dim1];
 517                               f41 = c[i + 3 + j * c_dim1];
 518                               i6 = ll + lsec - 1;
 519                               for (l = ll; l <= i6; ++l)
 520                                 {
 521                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 522                                           257] * b[l + j * b_dim1];
 523                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 524                                           257] * b[l + j * b_dim1];
 525                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 526                                           257] * b[l + j * b_dim1];
 527                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 528                                           257] * b[l + j * b_dim1];
 529                                 }
 530                               c[i + j * c_dim1] = f11;
 531                               c[i + 1 + j * c_dim1] = f21;
 532                               c[i + 2 + j * c_dim1] = f31;
 533                               c[i + 3 + j * c_dim1] = f41;
 534                             }
 535                           i5 = ii + isec - 1;
 536                           for (i = ii + uisec; i <= i5; ++i)
 537                             {
 538                               f11 = c[i + j * c_dim1];
 539                               i6 = ll + lsec - 1;
 540                               for (l = ll; l <= i6; ++l)
 541                                 {
 542                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 543                                           257] * b[l + j * b_dim1];
 544                                 }
 545                               c[i + j * c_dim1] = f11;
 546                             }
 547                         }
 548                     }
 549                 }
 550             }
 551         }
 552       free(t1);
 553       return;
 554     }
 555   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 556     {
 557       if (GFC_DESCRIPTOR_RANK (a) != 1)
 558         {
 559           const GFC_REAL_8 *restrict abase_x;
 560           const GFC_REAL_8 *restrict bbase_y;
 561           GFC_REAL_8 *restrict dest_y;
 562           GFC_REAL_8 s;
 563
 564           for (y = 0; y < ycount; y++)
 565             {
 566               bbase_y = &bbase[y*bystride];
 567               dest_y = &dest[y*rystride];
 568               for (x = 0; x < xcount; x++)
 569                 {
 570                   abase_x = &abase[x*axstride];
 571                   s = (GFC_REAL_8) 0;
 572                   for (n = 0; n < count; n++)
 573                     s += abase_x[n] * bbase_y[n];
 574                   dest_y[x] = s;
 575                 }
 576             }
 577         }
 578       else
 579         {
 580           const GFC_REAL_8 *restrict bbase_y;
 581           GFC_REAL_8 s;
 582
 583           for (y = 0; y < ycount; y++)
 584             {
 585               bbase_y = &bbase[y*bystride];
 586               s = (GFC_REAL_8) 0;
 587               for (n = 0; n < count; n++)
 588                 s += abase[n*axstride] * bbase_y[n];
 589               dest[y*rystride] = s;
 590             }
 591         }
 592     }
 593   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 594     {
 595       const GFC_REAL_8 *restrict bbase_y;
 596       GFC_REAL_8 s;
 597
 598       for (y = 0; y < ycount; y++)
 599         {
 600           bbase_y = &bbase[y*bystride];
 601           s = (GFC_REAL_8) 0;
 602           for (n = 0; n < count; n++)
 603             s += abase[n*axstride] * bbase_y[n*bxstride];
 604           dest[y*rxstride] = s;
 605         }
 606     }
 607   else if (axstride < aystride)
 608     {
 609       for (y = 0; y < ycount; y++)
 610         for (x = 0; x < xcount; x++)
 611           dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
 612
 613       for (y = 0; y < ycount; y++)
 614         for (n = 0; n < count; n++)
 615           for (x = 0; x < xcount; x++)
 616             /* dest[x,y] += a[x,n] * b[n,y] */
 617             dest[x*rxstride + y*rystride] +=
 618                                         abase[x*axstride + n*aystride] *
 619                                         bbase[n*bxstride + y*bystride];
 620     }
 621   else
 622     {
 623       const GFC_REAL_8 *restrict abase_x;
 624       const GFC_REAL_8 *restrict bbase_y;
 625       GFC_REAL_8 *restrict dest_y;
 626       GFC_REAL_8 s;
 627
 628       for (y = 0; y < ycount; y++)
 629         {
 630           bbase_y = &bbase[y*bystride];
 631           dest_y = &dest[y*rystride];
 632           for (x = 0; x < xcount; x++)
 633             {
 634               abase_x = &abase[x*axstride];
 635               s = (GFC_REAL_8) 0;
 636               for (n = 0; n < count; n++)
 637                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 638               dest_y[x*rxstride] = s;
 639             }
 640         }
 641     }
 642 }
 643 #undef POW3
 644 #undef min
 645 #undef max
 646
 647 #endif /* HAVE_AVX */
 648
 649 #ifdef HAVE_AVX2
 650 static void
 651 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
 652         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 653         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 654 static void
 655 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
 656         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 657         int blas_limit, blas_call gemm)
 658 {
 659   const GFC_REAL_8 * restrict abase;
 660   const GFC_REAL_8 * restrict bbase;
 661   GFC_REAL_8 * restrict dest;
 662
 663   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 664   index_type x, y, n, count, xcount, ycount;
 665
 666   assert (GFC_DESCRIPTOR_RANK (a) == 2
 667           || GFC_DESCRIPTOR_RANK (b) == 2);
 668
 669 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 670
 671    Either A or B (but not both) can be rank 1:
 672
 673    o One-dimensional argument A is implicitly treated as a row matrix
 674      dimensioned [1,count], so xcount=1.
 675
 676    o One-dimensional argument B is implicitly treated as a column matrix
 677      dimensioned [count, 1], so ycount=1.
 678 */
 679
 680   if (retarray->base_addr == NULL)
 681     {
 682       if (GFC_DESCRIPTOR_RANK (a) == 1)
 683         {
 684           GFC_DIMENSION_SET(retarray->dim[0], 0,
 685                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 686         }
 687       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 688         {
 689           GFC_DIMENSION_SET(retarray->dim[0], 0,
 690                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 691         }
 692       else
 693         {
 694           GFC_DIMENSION_SET(retarray->dim[0], 0,
 695                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 696
 697           GFC_DIMENSION_SET(retarray->dim[1], 0,
 698                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 699                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 700         }
 701
 702       retarray->base_addr
 703         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
 704       retarray->offset = 0;
 705     }
 706   else if (unlikely (compile_options.bounds_check))
 707     {
 708       index_type ret_extent, arg_extent;
 709
 710       if (GFC_DESCRIPTOR_RANK (a) == 1)
 711         {
 712           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 713           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 714           if (arg_extent != ret_extent)
 715             runtime_error ("Array bound mismatch for dimension 1 of "
 716                            "array (%ld/%ld) ",
 717                            (long int) ret_extent, (long int) arg_extent);
 718         }
 719       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 720         {
 721           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 722           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 723           if (arg_extent != ret_extent)
 724             runtime_error ("Array bound mismatch for dimension 1 of "
 725                            "array (%ld/%ld) ",
 726                            (long int) ret_extent, (long int) arg_extent);
 727         }
 728       else
 729         {
 730           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 731           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 732           if (arg_extent != ret_extent)
 733             runtime_error ("Array bound mismatch for dimension 1 of "
 734                            "array (%ld/%ld) ",
 735                            (long int) ret_extent, (long int) arg_extent);
 736
 737           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 738           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 739           if (arg_extent != ret_extent)
 740             runtime_error ("Array bound mismatch for dimension 2 of "
 741                            "array (%ld/%ld) ",
 742                            (long int) ret_extent, (long int) arg_extent);
 743         }
 744     }
 745
 746
 747   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 748     {
 749       /* One-dimensional result may be addressed in the code below
 750          either as a row or a column matrix. We want both cases to
 751          work. */
 752       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 753     }
 754   else
 755     {
 756       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 757       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 758     }
 759
 760
 761   if (GFC_DESCRIPTOR_RANK (a) == 1)
 762     {
 763       /* Treat it as a a row matrix A[1,count]. */
 764       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 765       aystride = 1;
 766
 767       xcount = 1;
 768       count = GFC_DESCRIPTOR_EXTENT(a,0);
 769     }
 770   else
 771     {
 772       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 773       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 774
 775       count = GFC_DESCRIPTOR_EXTENT(a,1);
 776       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 777     }
 778
 779   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 780     {
 781       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 782         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
 783                        "in dimension 1: is %ld, should be %ld",
 784                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
 785     }
 786
 787   if (GFC_DESCRIPTOR_RANK (b) == 1)
 788     {
 789       /* Treat it as a column matrix B[count,1] */
 790       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 791
 792       /* bystride should never be used for 1-dimensional b.
 793          The value is only used for calculation of the
 794          memory by the buffer.  */
 795       bystride = 256;
 796       ycount = 1;
 797     }
 798   else
 799     {
 800       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 801       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 802       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 803     }
 804
 805   abase = a->base_addr;
 806   bbase = b->base_addr;
 807   dest = retarray->base_addr;
 808
 809   /* Now that everything is set up, we perform the multiplication
 810      itself.  */
 811
 812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 813 #define min(a,b) ((a) <= (b) ? (a) : (b))
 814 #define max(a,b) ((a) >= (b) ? (a) : (b))
 815
 816   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 817       && (bxstride == 1 || bystride == 1)
 818       && (((float) xcount) * ((float) ycount) * ((float) count)
 819           > POW3(blas_limit)))
 820     {
 821       const int m = xcount, n = ycount, k = count, ldc = rystride;
 822       const GFC_REAL_8 one = 1, zero = 0;
 823       const int lda = (axstride == 1) ? aystride : axstride,
 824                 ldb = (bxstride == 1) ? bystride : bxstride;
 825
 826       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 827         {
 828           assert (gemm != NULL);
 829           const char *transa, *transb;
 830           if (try_blas & 2)
 831             transa = "C";
 832           else
 833             transa = axstride == 1 ? "N" : "T";
 834
 835           if (try_blas & 4)
 836             transb = "C";
 837           else
 838             transb = bxstride == 1 ? "N" : "T";
 839
 840           gemm (transa, transb , &m,
 841                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 842                 &ldc, 1, 1);
 843           return;
 844         }
 845     }
 846
 847   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 848     {
 849       /* This block of code implements a tuned matmul, derived from
 850          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 851
 852                Bo Kagstrom and Per Ling
 853                Department of Computing Science
 854                Umea University
 855                S-901 87 Umea, Sweden
 856
 857          from netlib.org, translated to C, and modified for matmul.m4.  */
 858
 859       const GFC_REAL_8 *a, *b;
 860       GFC_REAL_8 *c;
 861       const index_type m = xcount, n = ycount, k = count;
 862
 863       /* System generated locals */
 864       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 865                  i1, i2, i3, i4, i5, i6;
 866
 867       /* Local variables */
 868       GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 869                  f13, f14, f23, f24, f33, f34, f43, f44;
 870       index_type i, j, l, ii, jj, ll;
 871       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 872       GFC_REAL_8 *t1;
 873
 874       a = abase;
 875       b = bbase;
 876       c = retarray->base_addr;
 877
 878       /* Parameter adjustments */
 879       c_dim1 = rystride;
 880       c_offset = 1 + c_dim1;
 881       c -= c_offset;
 882       a_dim1 = aystride;
 883       a_offset = 1 + a_dim1;
 884       a -= a_offset;
 885       b_dim1 = bystride;
 886       b_offset = 1 + b_dim1;
 887       b -= b_offset;
 888
 889       /* Empty c first.  */
 890       for (j=1; j<=n; j++)
 891         for (i=1; i<=m; i++)
 892           c[i + j * c_dim1] = (GFC_REAL_8)0;
 893
 894       /* Early exit if possible */
 895       if (m == 0 || n == 0 || k == 0)
 896         return;
 897
 898       /* Adjust size of t1 to what is needed.  */
 899       index_type t1_dim, a_sz;
 900       if (aystride == 1)
 901         a_sz = rystride;
 902       else
 903         a_sz = a_dim1;
 904
 905       t1_dim = a_sz * 256 + b_dim1;
 906       if (t1_dim > 65536)
 907         t1_dim = 65536;
 908
 909       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
 910
 911       /* Start turning the crank. */
 912       i1 = n;
 913       for (jj = 1; jj <= i1; jj += 512)
 914         {
 915           /* Computing MIN */
 916           i2 = 512;
 917           i3 = n - jj + 1;
 918           jsec = min(i2,i3);
 919           ujsec = jsec - jsec % 4;
 920           i2 = k;
 921           for (ll = 1; ll <= i2; ll += 256)
 922             {
 923               /* Computing MIN */
 924               i3 = 256;
 925               i4 = k - ll + 1;
 926               lsec = min(i3,i4);
 927               ulsec = lsec - lsec % 2;
 928
 929               i3 = m;
 930               for (ii = 1; ii <= i3; ii += 256)
 931                 {
 932                   /* Computing MIN */
 933                   i4 = 256;
 934                   i5 = m - ii + 1;
 935                   isec = min(i4,i5);
 936                   uisec = isec - isec % 2;
 937                   i4 = ll + ulsec - 1;
 938                   for (l = ll; l <= i4; l += 2)
 939                     {
 940                       i5 = ii + uisec - 1;
 941                       for (i = ii; i <= i5; i += 2)
 942                         {
 943                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 944                                         a[i + l * a_dim1];
 945                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 946                                         a[i + (l + 1) * a_dim1];
 947                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 948                                         a[i + 1 + l * a_dim1];
 949                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 950                                         a[i + 1 + (l + 1) * a_dim1];
 951                         }
 952                       if (uisec < isec)
 953                         {
 954                           t1[l - ll + 1 + (isec << 8) - 257] =
 955                                     a[ii + isec - 1 + l * a_dim1];
 956                           t1[l - ll + 2 + (isec << 8) - 257] =
 957                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 958                         }
 959                     }
 960                   if (ulsec < lsec)
 961                     {
 962                       i4 = ii + isec - 1;
 963                       for (i = ii; i<= i4; ++i)
 964                         {
 965                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 966                                     a[i + (ll + lsec - 1) * a_dim1];
 967                         }
 968                     }
 969
 970                   uisec = isec - isec % 4;
 971                   i4 = jj + ujsec - 1;
 972                   for (j = jj; j <= i4; j += 4)
 973                     {
 974                       i5 = ii + uisec - 1;
 975                       for (i = ii; i <= i5; i += 4)
 976                         {
 977                           f11 = c[i + j * c_dim1];
 978                           f21 = c[i + 1 + j * c_dim1];
 979                           f12 = c[i + (j + 1) * c_dim1];
 980                           f22 = c[i + 1 + (j + 1) * c_dim1];
 981                           f13 = c[i + (j + 2) * c_dim1];
 982                           f23 = c[i + 1 + (j + 2) * c_dim1];
 983                           f14 = c[i + (j + 3) * c_dim1];
 984                           f24 = c[i + 1 + (j + 3) * c_dim1];
 985                           f31 = c[i + 2 + j * c_dim1];
 986                           f41 = c[i + 3 + j * c_dim1];
 987                           f32 = c[i + 2 + (j + 1) * c_dim1];
 988                           f42 = c[i + 3 + (j + 1) * c_dim1];
 989                           f33 = c[i + 2 + (j + 2) * c_dim1];
 990                           f43 = c[i + 3 + (j + 2) * c_dim1];
 991                           f34 = c[i + 2 + (j + 3) * c_dim1];
 992                           f44 = c[i + 3 + (j + 3) * c_dim1];
 993                           i6 = ll + lsec - 1;
 994                           for (l = ll; l <= i6; ++l)
 995                             {
 996                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 997                                       * b[l + j * b_dim1];
 998                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 999                                       * b[l + j * b_dim1];
1000                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1001                                       * b[l + (j + 1) * b_dim1];
1002                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1003                                       * b[l + (j + 1) * b_dim1];
1004                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1005                                       * b[l + (j + 2) * b_dim1];
1006                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1007                                       * b[l + (j + 2) * b_dim1];
1008                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1009                                       * b[l + (j + 3) * b_dim1];
1010                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1011                                       * b[l + (j + 3) * b_dim1];
1012                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1013                                       * b[l + j * b_dim1];
1014                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1015                                       * b[l + j * b_dim1];
1016                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1017                                       * b[l + (j + 1) * b_dim1];
1018                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1019                                       * b[l + (j + 1) * b_dim1];
1020                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1021                                       * b[l + (j + 2) * b_dim1];
1022                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1023                                       * b[l + (j + 2) * b_dim1];
1024                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1025                                       * b[l + (j + 3) * b_dim1];
1026                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1027                                       * b[l + (j + 3) * b_dim1];
1028                             }
1029                           c[i + j * c_dim1] = f11;
1030                           c[i + 1 + j * c_dim1] = f21;
1031                           c[i + (j + 1) * c_dim1] = f12;
1032                           c[i + 1 + (j + 1) * c_dim1] = f22;
1033                           c[i + (j + 2) * c_dim1] = f13;
1034                           c[i + 1 + (j + 2) * c_dim1] = f23;
1035                           c[i + (j + 3) * c_dim1] = f14;
1036                           c[i + 1 + (j + 3) * c_dim1] = f24;
1037                           c[i + 2 + j * c_dim1] = f31;
1038                           c[i + 3 + j * c_dim1] = f41;
1039                           c[i + 2 + (j + 1) * c_dim1] = f32;
1040                           c[i + 3 + (j + 1) * c_dim1] = f42;
1041                           c[i + 2 + (j + 2) * c_dim1] = f33;
1042                           c[i + 3 + (j + 2) * c_dim1] = f43;
1043                           c[i + 2 + (j + 3) * c_dim1] = f34;
1044                           c[i + 3 + (j + 3) * c_dim1] = f44;
1045                         }
1046                       if (uisec < isec)
1047                         {
1048                           i5 = ii + isec - 1;
1049                           for (i = ii + uisec; i <= i5; ++i)
1050                             {
1051                               f11 = c[i + j * c_dim1];
1052                               f12 = c[i + (j + 1) * c_dim1];
1053                               f13 = c[i + (j + 2) * c_dim1];
1054                               f14 = c[i + (j + 3) * c_dim1];
1055                               i6 = ll + lsec - 1;
1056                               for (l = ll; l <= i6; ++l)
1057                                 {
1058                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1059                                           257] * b[l + j * b_dim1];
1060                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061                                           257] * b[l + (j + 1) * b_dim1];
1062                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063                                           257] * b[l + (j + 2) * b_dim1];
1064                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065                                           257] * b[l + (j + 3) * b_dim1];
1066                                 }
1067                               c[i + j * c_dim1] = f11;
1068                               c[i + (j + 1) * c_dim1] = f12;
1069                               c[i + (j + 2) * c_dim1] = f13;
1070                               c[i + (j + 3) * c_dim1] = f14;
1071                             }
1072                         }
1073                     }
1074                   if (ujsec < jsec)
1075                     {
1076                       i4 = jj + jsec - 1;
1077                       for (j = jj + ujsec; j <= i4; ++j)
1078                         {
1079                           i5 = ii + uisec - 1;
1080                           for (i = ii; i <= i5; i += 4)
1081                             {
1082                               f11 = c[i + j * c_dim1];
1083                               f21 = c[i + 1 + j * c_dim1];
1084                               f31 = c[i + 2 + j * c_dim1];
1085                               f41 = c[i + 3 + j * c_dim1];
1086                               i6 = ll + lsec - 1;
1087                               for (l = ll; l <= i6; ++l)
1088                                 {
1089                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1090                                           257] * b[l + j * b_dim1];
1091                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1092                                           257] * b[l + j * b_dim1];
1093                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1094                                           257] * b[l + j * b_dim1];
1095                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1096                                           257] * b[l + j * b_dim1];
1097                                 }
1098                               c[i + j * c_dim1] = f11;
1099                               c[i + 1 + j * c_dim1] = f21;
1100                               c[i + 2 + j * c_dim1] = f31;
1101                               c[i + 3 + j * c_dim1] = f41;
1102                             }
1103                           i5 = ii + isec - 1;
1104                           for (i = ii + uisec; i <= i5; ++i)
1105                             {
1106                               f11 = c[i + j * c_dim1];
1107                               i6 = ll + lsec - 1;
1108                               for (l = ll; l <= i6; ++l)
1109                                 {
1110                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1111                                           257] * b[l + j * b_dim1];
1112                                 }
1113                               c[i + j * c_dim1] = f11;
1114                             }
1115                         }
1116                     }
1117                 }
1118             }
1119         }
1120       free(t1);
1121       return;
1122     }
1123   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1124     {
1125       if (GFC_DESCRIPTOR_RANK (a) != 1)
1126         {
1127           const GFC_REAL_8 *restrict abase_x;
1128           const GFC_REAL_8 *restrict bbase_y;
1129           GFC_REAL_8 *restrict dest_y;
1130           GFC_REAL_8 s;
1131
1132           for (y = 0; y < ycount; y++)
1133             {
1134               bbase_y = &bbase[y*bystride];
1135               dest_y = &dest[y*rystride];
1136               for (x = 0; x < xcount; x++)
1137                 {
1138                   abase_x = &abase[x*axstride];
1139                   s = (GFC_REAL_8) 0;
1140                   for (n = 0; n < count; n++)
1141                     s += abase_x[n] * bbase_y[n];
1142                   dest_y[x] = s;
1143                 }
1144             }
1145         }
1146       else
1147         {
1148           const GFC_REAL_8 *restrict bbase_y;
1149           GFC_REAL_8 s;
1150
1151           for (y = 0; y < ycount; y++)
1152             {
1153               bbase_y = &bbase[y*bystride];
1154               s = (GFC_REAL_8) 0;
1155               for (n = 0; n < count; n++)
1156                 s += abase[n*axstride] * bbase_y[n];
1157               dest[y*rystride] = s;
1158             }
1159         }
1160     }
1161   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1162     {
1163       const GFC_REAL_8 *restrict bbase_y;
1164       GFC_REAL_8 s;
1165
1166       for (y = 0; y < ycount; y++)
1167         {
1168           bbase_y = &bbase[y*bystride];
1169           s = (GFC_REAL_8) 0;
1170           for (n = 0; n < count; n++)
1171             s += abase[n*axstride] * bbase_y[n*bxstride];
1172           dest[y*rxstride] = s;
1173         }
1174     }
1175   else if (axstride < aystride)
1176     {
1177       for (y = 0; y < ycount; y++)
1178         for (x = 0; x < xcount; x++)
1179           dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1180
1181       for (y = 0; y < ycount; y++)
1182         for (n = 0; n < count; n++)
1183           for (x = 0; x < xcount; x++)
1184             /* dest[x,y] += a[x,n] * b[n,y] */
1185             dest[x*rxstride + y*rystride] +=
1186                                         abase[x*axstride + n*aystride] *
1187                                         bbase[n*bxstride + y*bystride];
1188     }
1189   else
1190     {
1191       const GFC_REAL_8 *restrict abase_x;
1192       const GFC_REAL_8 *restrict bbase_y;
1193       GFC_REAL_8 *restrict dest_y;
1194       GFC_REAL_8 s;
1195
1196       for (y = 0; y < ycount; y++)
1197         {
1198           bbase_y = &bbase[y*bystride];
1199           dest_y = &dest[y*rystride];
1200           for (x = 0; x < xcount; x++)
1201             {
1202               abase_x = &abase[x*axstride];
1203               s = (GFC_REAL_8) 0;
1204               for (n = 0; n < count; n++)
1205                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1206               dest_y[x*rxstride] = s;
1207             }
1208         }
1209     }
1210 }
1211 #undef POW3
1212 #undef min
1213 #undef max
1214
1215 #endif /* HAVE_AVX2 */
1216
1217 #ifdef HAVE_AVX512F
1218 static void
1219 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1220         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1221         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1222 static void
1223 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1224         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1225         int blas_limit, blas_call gemm)
1226 {
1227   const GFC_REAL_8 * restrict abase;
1228   const GFC_REAL_8 * restrict bbase;
1229   GFC_REAL_8 * restrict dest;
1230
1231   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1232   index_type x, y, n, count, xcount, ycount;
1233
1234   assert (GFC_DESCRIPTOR_RANK (a) == 2
1235           || GFC_DESCRIPTOR_RANK (b) == 2);
1236
1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1238
1239    Either A or B (but not both) can be rank 1:
1240
1241    o One-dimensional argument A is implicitly treated as a row matrix
1242      dimensioned [1,count], so xcount=1.
1243
1244    o One-dimensional argument B is implicitly treated as a column matrix
1245      dimensioned [count, 1], so ycount=1.
1246 */
1247
1248   if (retarray->base_addr == NULL)
1249     {
1250       if (GFC_DESCRIPTOR_RANK (a) == 1)
1251         {
1252           GFC_DIMENSION_SET(retarray->dim[0], 0,
1253                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1254         }
1255       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256         {
1257           GFC_DIMENSION_SET(retarray->dim[0], 0,
1258                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1259         }
1260       else
1261         {
1262           GFC_DIMENSION_SET(retarray->dim[0], 0,
1263                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1264
1265           GFC_DIMENSION_SET(retarray->dim[1], 0,
1266                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1267                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1268         }
1269
1270       retarray->base_addr
1271         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1272       retarray->offset = 0;
1273     }
1274   else if (unlikely (compile_options.bounds_check))
1275     {
1276       index_type ret_extent, arg_extent;
1277
1278       if (GFC_DESCRIPTOR_RANK (a) == 1)
1279         {
1280           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1281           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1282           if (arg_extent != ret_extent)
1283             runtime_error ("Array bound mismatch for dimension 1 of "
1284                            "array (%ld/%ld) ",
1285                            (long int) ret_extent, (long int) arg_extent);
1286         }
1287       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1288         {
1289           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1290           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1291           if (arg_extent != ret_extent)
1292             runtime_error ("Array bound mismatch for dimension 1 of "
1293                            "array (%ld/%ld) ",
1294                            (long int) ret_extent, (long int) arg_extent);
1295         }
1296       else
1297         {
1298           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1299           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1300           if (arg_extent != ret_extent)
1301             runtime_error ("Array bound mismatch for dimension 1 of "
1302                            "array (%ld/%ld) ",
1303                            (long int) ret_extent, (long int) arg_extent);
1304
1305           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1306           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1307           if (arg_extent != ret_extent)
1308             runtime_error ("Array bound mismatch for dimension 2 of "
1309                            "array (%ld/%ld) ",
1310                            (long int) ret_extent, (long int) arg_extent);
1311         }
1312     }
1313
1314
1315   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1316     {
1317       /* One-dimensional result may be addressed in the code below
1318          either as a row or a column matrix. We want both cases to
1319          work. */
1320       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1321     }
1322   else
1323     {
1324       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1325       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1326     }
1327
1328
1329   if (GFC_DESCRIPTOR_RANK (a) == 1)
1330     {
1331       /* Treat it as a a row matrix A[1,count]. */
1332       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1333       aystride = 1;
1334
1335       xcount = 1;
1336       count = GFC_DESCRIPTOR_EXTENT(a,0);
1337     }
1338   else
1339     {
1340       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1341       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1342
1343       count = GFC_DESCRIPTOR_EXTENT(a,1);
1344       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1345     }
1346
1347   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1348     {
1349       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1350         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1351                        "in dimension 1: is %ld, should be %ld",
1352                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1353     }
1354
1355   if (GFC_DESCRIPTOR_RANK (b) == 1)
1356     {
1357       /* Treat it as a column matrix B[count,1] */
1358       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1359
1360       /* bystride should never be used for 1-dimensional b.
1361          The value is only used for calculation of the
1362          memory by the buffer.  */
1363       bystride = 256;
1364       ycount = 1;
1365     }
1366   else
1367     {
1368       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1369       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1370       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1371     }
1372
1373   abase = a->base_addr;
1374   bbase = b->base_addr;
1375   dest = retarray->base_addr;
1376
1377   /* Now that everything is set up, we perform the multiplication
1378      itself.  */
1379
1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1381 #define min(a,b) ((a) <= (b) ? (a) : (b))
1382 #define max(a,b) ((a) >= (b) ? (a) : (b))
1383
1384   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1385       && (bxstride == 1 || bystride == 1)
1386       && (((float) xcount) * ((float) ycount) * ((float) count)
1387           > POW3(blas_limit)))
1388     {
1389       const int m = xcount, n = ycount, k = count, ldc = rystride;
1390       const GFC_REAL_8 one = 1, zero = 0;
1391       const int lda = (axstride == 1) ? aystride : axstride,
1392                 ldb = (bxstride == 1) ? bystride : bxstride;
1393
1394       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1395         {
1396           assert (gemm != NULL);
1397           const char *transa, *transb;
1398           if (try_blas & 2)
1399             transa = "C";
1400           else
1401             transa = axstride == 1 ? "N" : "T";
1402
1403           if (try_blas & 4)
1404             transb = "C";
1405           else
1406             transb = bxstride == 1 ? "N" : "T";
1407
1408           gemm (transa, transb , &m,
1409                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1410                 &ldc, 1, 1);
1411           return;
1412         }
1413     }
1414
1415   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1416     {
1417       /* This block of code implements a tuned matmul, derived from
1418          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1419
1420                Bo Kagstrom and Per Ling
1421                Department of Computing Science
1422                Umea University
1423                S-901 87 Umea, Sweden
1424
1425          from netlib.org, translated to C, and modified for matmul.m4.  */
1426
1427       const GFC_REAL_8 *a, *b;
1428       GFC_REAL_8 *c;
1429       const index_type m = xcount, n = ycount, k = count;
1430
1431       /* System generated locals */
1432       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1433                  i1, i2, i3, i4, i5, i6;
1434
1435       /* Local variables */
1436       GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
1437                  f13, f14, f23, f24, f33, f34, f43, f44;
1438       index_type i, j, l, ii, jj, ll;
1439       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1440       GFC_REAL_8 *t1;
1441
1442       a = abase;
1443       b = bbase;
1444       c = retarray->base_addr;
1445
1446       /* Parameter adjustments */
1447       c_dim1 = rystride;
1448       c_offset = 1 + c_dim1;
1449       c -= c_offset;
1450       a_dim1 = aystride;
1451       a_offset = 1 + a_dim1;
1452       a -= a_offset;
1453       b_dim1 = bystride;
1454       b_offset = 1 + b_dim1;
1455       b -= b_offset;
1456
1457       /* Empty c first.  */
1458       for (j=1; j<=n; j++)
1459         for (i=1; i<=m; i++)
1460           c[i + j * c_dim1] = (GFC_REAL_8)0;
1461
1462       /* Early exit if possible */
1463       if (m == 0 || n == 0 || k == 0)
1464         return;
1465
1466       /* Adjust size of t1 to what is needed.  */
1467       index_type t1_dim, a_sz;
1468       if (aystride == 1)
1469         a_sz = rystride;
1470       else
1471         a_sz = a_dim1;
1472
1473       t1_dim = a_sz * 256 + b_dim1;
1474       if (t1_dim > 65536)
1475         t1_dim = 65536;
1476
1477       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
1478
1479       /* Start turning the crank. */
1480       i1 = n;
1481       for (jj = 1; jj <= i1; jj += 512)
1482         {
1483           /* Computing MIN */
1484           i2 = 512;
1485           i3 = n - jj + 1;
1486           jsec = min(i2,i3);
1487           ujsec = jsec - jsec % 4;
1488           i2 = k;
1489           for (ll = 1; ll <= i2; ll += 256)
1490             {
1491               /* Computing MIN */
1492               i3 = 256;
1493               i4 = k - ll + 1;
1494               lsec = min(i3,i4);
1495               ulsec = lsec - lsec % 2;
1496
1497               i3 = m;
1498               for (ii = 1; ii <= i3; ii += 256)
1499                 {
1500                   /* Computing MIN */
1501                   i4 = 256;
1502                   i5 = m - ii + 1;
1503                   isec = min(i4,i5);
1504                   uisec = isec - isec % 2;
1505                   i4 = ll + ulsec - 1;
1506                   for (l = ll; l <= i4; l += 2)
1507                     {
1508                       i5 = ii + uisec - 1;
1509                       for (i = ii; i <= i5; i += 2)
1510                         {
1511                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1512                                         a[i + l * a_dim1];
1513                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1514                                         a[i + (l + 1) * a_dim1];
1515                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1516                                         a[i + 1 + l * a_dim1];
1517                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1518                                         a[i + 1 + (l + 1) * a_dim1];
1519                         }
1520                       if (uisec < isec)
1521                         {
1522                           t1[l - ll + 1 + (isec << 8) - 257] =
1523                                     a[ii + isec - 1 + l * a_dim1];
1524                           t1[l - ll + 2 + (isec << 8) - 257] =
1525                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1526                         }
1527                     }
1528                   if (ulsec < lsec)
1529                     {
1530                       i4 = ii + isec - 1;
1531                       for (i = ii; i<= i4; ++i)
1532                         {
1533                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1534                                     a[i + (ll + lsec - 1) * a_dim1];
1535                         }
1536                     }
1537
1538                   uisec = isec - isec % 4;
1539                   i4 = jj + ujsec - 1;
1540                   for (j = jj; j <= i4; j += 4)
1541                     {
1542                       i5 = ii + uisec - 1;
1543                       for (i = ii; i <= i5; i += 4)
1544                         {
1545                           f11 = c[i + j * c_dim1];
1546                           f21 = c[i + 1 + j * c_dim1];
1547                           f12 = c[i + (j + 1) * c_dim1];
1548                           f22 = c[i + 1 + (j + 1) * c_dim1];
1549                           f13 = c[i + (j + 2) * c_dim1];
1550                           f23 = c[i + 1 + (j + 2) * c_dim1];
1551                           f14 = c[i + (j + 3) * c_dim1];
1552                           f24 = c[i + 1 + (j + 3) * c_dim1];
1553                           f31 = c[i + 2 + j * c_dim1];
1554                           f41 = c[i + 3 + j * c_dim1];
1555                           f32 = c[i + 2 + (j + 1) * c_dim1];
1556                           f42 = c[i + 3 + (j + 1) * c_dim1];
1557                           f33 = c[i + 2 + (j + 2) * c_dim1];
1558                           f43 = c[i + 3 + (j + 2) * c_dim1];
1559                           f34 = c[i + 2 + (j + 3) * c_dim1];
1560                           f44 = c[i + 3 + (j + 3) * c_dim1];
1561                           i6 = ll + lsec - 1;
1562                           for (l = ll; l <= i6; ++l)
1563                             {
1564                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1565                                       * b[l + j * b_dim1];
1566                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1567                                       * b[l + j * b_dim1];
1568                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1569                                       * b[l + (j + 1) * b_dim1];
1570                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1571                                       * b[l + (j + 1) * b_dim1];
1572                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1573                                       * b[l + (j + 2) * b_dim1];
1574                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1575                                       * b[l + (j + 2) * b_dim1];
1576                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1577                                       * b[l + (j + 3) * b_dim1];
1578                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1579                                       * b[l + (j + 3) * b_dim1];
1580                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1581                                       * b[l + j * b_dim1];
1582                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1583                                       * b[l + j * b_dim1];
1584                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1585                                       * b[l + (j + 1) * b_dim1];
1586                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1587                                       * b[l + (j + 1) * b_dim1];
1588                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1589                                       * b[l + (j + 2) * b_dim1];
1590                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1591                                       * b[l + (j + 2) * b_dim1];
1592                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1593                                       * b[l + (j + 3) * b_dim1];
1594                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1595                                       * b[l + (j + 3) * b_dim1];
1596                             }
1597                           c[i + j * c_dim1] = f11;
1598                           c[i + 1 + j * c_dim1] = f21;
1599                           c[i + (j + 1) * c_dim1] = f12;
1600                           c[i + 1 + (j + 1) * c_dim1] = f22;
1601                           c[i + (j + 2) * c_dim1] = f13;
1602                           c[i + 1 + (j + 2) * c_dim1] = f23;
1603                           c[i + (j + 3) * c_dim1] = f14;
1604                           c[i + 1 + (j + 3) * c_dim1] = f24;
1605                           c[i + 2 + j * c_dim1] = f31;
1606                           c[i + 3 + j * c_dim1] = f41;
1607                           c[i + 2 + (j + 1) * c_dim1] = f32;
1608                           c[i + 3 + (j + 1) * c_dim1] = f42;
1609                           c[i + 2 + (j + 2) * c_dim1] = f33;
1610                           c[i + 3 + (j + 2) * c_dim1] = f43;
1611                           c[i + 2 + (j + 3) * c_dim1] = f34;
1612                           c[i + 3 + (j + 3) * c_dim1] = f44;
1613                         }
1614                       if (uisec < isec)
1615                         {
1616                           i5 = ii + isec - 1;
1617                           for (i = ii + uisec; i <= i5; ++i)
1618                             {
1619                               f11 = c[i + j * c_dim1];
1620                               f12 = c[i + (j + 1) * c_dim1];
1621                               f13 = c[i + (j + 2) * c_dim1];
1622                               f14 = c[i + (j + 3) * c_dim1];
1623                               i6 = ll + lsec - 1;
1624                               for (l = ll; l <= i6; ++l)
1625                                 {
1626                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1627                                           257] * b[l + j * b_dim1];
1628                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1629                                           257] * b[l + (j + 1) * b_dim1];
1630                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631                                           257] * b[l + (j + 2) * b_dim1];
1632                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1633                                           257] * b[l + (j + 3) * b_dim1];
1634                                 }
1635                               c[i + j * c_dim1] = f11;
1636                               c[i + (j + 1) * c_dim1] = f12;
1637                               c[i + (j + 2) * c_dim1] = f13;
1638                               c[i + (j + 3) * c_dim1] = f14;
1639                             }
1640                         }
1641                     }
1642                   if (ujsec < jsec)
1643                     {
1644                       i4 = jj + jsec - 1;
1645                       for (j = jj + ujsec; j <= i4; ++j)
1646                         {
1647                           i5 = ii + uisec - 1;
1648                           for (i = ii; i <= i5; i += 4)
1649                             {
1650                               f11 = c[i + j * c_dim1];
1651                               f21 = c[i + 1 + j * c_dim1];
1652                               f31 = c[i + 2 + j * c_dim1];
1653                               f41 = c[i + 3 + j * c_dim1];
1654                               i6 = ll + lsec - 1;
1655                               for (l = ll; l <= i6; ++l)
1656                                 {
1657                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1658                                           257] * b[l + j * b_dim1];
1659                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1660                                           257] * b[l + j * b_dim1];
1661                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1662                                           257] * b[l + j * b_dim1];
1663                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1664                                           257] * b[l + j * b_dim1];
1665                                 }
1666                               c[i + j * c_dim1] = f11;
1667                               c[i + 1 + j * c_dim1] = f21;
1668                               c[i + 2 + j * c_dim1] = f31;
1669                               c[i + 3 + j * c_dim1] = f41;
1670                             }
1671                           i5 = ii + isec - 1;
1672                           for (i = ii + uisec; i <= i5; ++i)
1673                             {
1674                               f11 = c[i + j * c_dim1];
1675                               i6 = ll + lsec - 1;
1676                               for (l = ll; l <= i6; ++l)
1677                                 {
1678                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1679                                           257] * b[l + j * b_dim1];
1680                                 }
1681                               c[i + j * c_dim1] = f11;
1682                             }
1683                         }
1684                     }
1685                 }
1686             }
1687         }
1688       free(t1);
1689       return;
1690     }
1691   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1692     {
1693       if (GFC_DESCRIPTOR_RANK (a) != 1)
1694         {
1695           const GFC_REAL_8 *restrict abase_x;
1696           const GFC_REAL_8 *restrict bbase_y;
1697           GFC_REAL_8 *restrict dest_y;
1698           GFC_REAL_8 s;
1699
1700           for (y = 0; y < ycount; y++)
1701             {
1702               bbase_y = &bbase[y*bystride];
1703               dest_y = &dest[y*rystride];
1704               for (x = 0; x < xcount; x++)
1705                 {
1706                   abase_x = &abase[x*axstride];
1707                   s = (GFC_REAL_8) 0;
1708                   for (n = 0; n < count; n++)
1709                     s += abase_x[n] * bbase_y[n];
1710                   dest_y[x] = s;
1711                 }
1712             }
1713         }
1714       else
1715         {
1716           const GFC_REAL_8 *restrict bbase_y;
1717           GFC_REAL_8 s;
1718
1719           for (y = 0; y < ycount; y++)
1720             {
1721               bbase_y = &bbase[y*bystride];
1722               s = (GFC_REAL_8) 0;
1723               for (n = 0; n < count; n++)
1724                 s += abase[n*axstride] * bbase_y[n];
1725               dest[y*rystride] = s;
1726             }
1727         }
1728     }
1729   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1730     {
1731       const GFC_REAL_8 *restrict bbase_y;
1732       GFC_REAL_8 s;
1733
1734       for (y = 0; y < ycount; y++)
1735         {
1736           bbase_y = &bbase[y*bystride];
1737           s = (GFC_REAL_8) 0;
1738           for (n = 0; n < count; n++)
1739             s += abase[n*axstride] * bbase_y[n*bxstride];
1740           dest[y*rxstride] = s;
1741         }
1742     }
1743   else if (axstride < aystride)
1744     {
1745       for (y = 0; y < ycount; y++)
1746         for (x = 0; x < xcount; x++)
1747           dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1748
1749       for (y = 0; y < ycount; y++)
1750         for (n = 0; n < count; n++)
1751           for (x = 0; x < xcount; x++)
1752             /* dest[x,y] += a[x,n] * b[n,y] */
1753             dest[x*rxstride + y*rystride] +=
1754                                         abase[x*axstride + n*aystride] *
1755                                         bbase[n*bxstride + y*bystride];
1756     }
1757   else
1758     {
1759       const GFC_REAL_8 *restrict abase_x;
1760       const GFC_REAL_8 *restrict bbase_y;
1761       GFC_REAL_8 *restrict dest_y;
1762       GFC_REAL_8 s;
1763
1764       for (y = 0; y < ycount; y++)
1765         {
1766           bbase_y = &bbase[y*bystride];
1767           dest_y = &dest[y*rystride];
1768           for (x = 0; x < xcount; x++)
1769             {
1770               abase_x = &abase[x*axstride];
1771               s = (GFC_REAL_8) 0;
1772               for (n = 0; n < count; n++)
1773                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1774               dest_y[x*rxstride] = s;
1775             }
1776         }
1777     }
1778 }
1779 #undef POW3
1780 #undef min
1781 #undef max
1782
1783 #endif  /* HAVE_AVX512F */
1784
1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1786
1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1788 void
1789 matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
1790         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1791         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1792 internal_proto(matmul_r8_avx128_fma3);
1793 #endif
1794
1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1796 void
1797 matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
1798         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1799         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1800 internal_proto(matmul_r8_avx128_fma4);
1801 #endif
1802
1803 /* Function to fall back to if there is no special processor-specific version.  */
1804 static void
1805 matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
1806         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1807         int blas_limit, blas_call gemm)
1808 {
1809   const GFC_REAL_8 * restrict abase;
1810   const GFC_REAL_8 * restrict bbase;
1811   GFC_REAL_8 * restrict dest;
1812
1813   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1814   index_type x, y, n, count, xcount, ycount;
1815
1816   assert (GFC_DESCRIPTOR_RANK (a) == 2
1817           || GFC_DESCRIPTOR_RANK (b) == 2);
1818
1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1820
1821    Either A or B (but not both) can be rank 1:
1822
1823    o One-dimensional argument A is implicitly treated as a row matrix
1824      dimensioned [1,count], so xcount=1.
1825
1826    o One-dimensional argument B is implicitly treated as a column matrix
1827      dimensioned [count, 1], so ycount=1.
1828 */
1829
1830   if (retarray->base_addr == NULL)
1831     {
1832       if (GFC_DESCRIPTOR_RANK (a) == 1)
1833         {
1834           GFC_DIMENSION_SET(retarray->dim[0], 0,
1835                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1836         }
1837       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1838         {
1839           GFC_DIMENSION_SET(retarray->dim[0], 0,
1840                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1841         }
1842       else
1843         {
1844           GFC_DIMENSION_SET(retarray->dim[0], 0,
1845                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1846
1847           GFC_DIMENSION_SET(retarray->dim[1], 0,
1848                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1849                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1850         }
1851
1852       retarray->base_addr
1853         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1854       retarray->offset = 0;
1855     }
1856   else if (unlikely (compile_options.bounds_check))
1857     {
1858       index_type ret_extent, arg_extent;
1859
1860       if (GFC_DESCRIPTOR_RANK (a) == 1)
1861         {
1862           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1863           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1864           if (arg_extent != ret_extent)
1865             runtime_error ("Array bound mismatch for dimension 1 of "
1866                            "array (%ld/%ld) ",
1867                            (long int) ret_extent, (long int) arg_extent);
1868         }
1869       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1870         {
1871           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1872           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1873           if (arg_extent != ret_extent)
1874             runtime_error ("Array bound mismatch for dimension 1 of "
1875                            "array (%ld/%ld) ",
1876                            (long int) ret_extent, (long int) arg_extent);
1877         }
1878       else
1879         {
1880           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1881           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1882           if (arg_extent != ret_extent)
1883             runtime_error ("Array bound mismatch for dimension 1 of "
1884                            "array (%ld/%ld) ",
1885                            (long int) ret_extent, (long int) arg_extent);
1886
1887           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1888           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1889           if (arg_extent != ret_extent)
1890             runtime_error ("Array bound mismatch for dimension 2 of "
1891                            "array (%ld/%ld) ",
1892                            (long int) ret_extent, (long int) arg_extent);
1893         }
1894     }
1895
1896
1897   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1898     {
1899       /* One-dimensional result may be addressed in the code below
1900          either as a row or a column matrix. We want both cases to
1901          work. */
1902       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1903     }
1904   else
1905     {
1906       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1907       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1908     }
1909
1910
1911   if (GFC_DESCRIPTOR_RANK (a) == 1)
1912     {
1913       /* Treat it as a a row matrix A[1,count]. */
1914       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1915       aystride = 1;
1916
1917       xcount = 1;
1918       count = GFC_DESCRIPTOR_EXTENT(a,0);
1919     }
1920   else
1921     {
1922       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1923       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1924
1925       count = GFC_DESCRIPTOR_EXTENT(a,1);
1926       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1927     }
1928
1929   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1930     {
1931       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1932         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1933                        "in dimension 1: is %ld, should be %ld",
1934                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1935     }
1936
1937   if (GFC_DESCRIPTOR_RANK (b) == 1)
1938     {
1939       /* Treat it as a column matrix B[count,1] */
1940       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1941
1942       /* bystride should never be used for 1-dimensional b.
1943          The value is only used for calculation of the
1944          memory by the buffer.  */
1945       bystride = 256;
1946       ycount = 1;
1947     }
1948   else
1949     {
1950       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1951       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1952       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1953     }
1954
1955   abase = a->base_addr;
1956   bbase = b->base_addr;
1957   dest = retarray->base_addr;
1958
1959   /* Now that everything is set up, we perform the multiplication
1960      itself.  */
1961
1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1963 #define min(a,b) ((a) <= (b) ? (a) : (b))
1964 #define max(a,b) ((a) >= (b) ? (a) : (b))
1965
1966   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1967       && (bxstride == 1 || bystride == 1)
1968       && (((float) xcount) * ((float) ycount) * ((float) count)
1969           > POW3(blas_limit)))
1970     {
1971       const int m = xcount, n = ycount, k = count, ldc = rystride;
1972       const GFC_REAL_8 one = 1, zero = 0;
1973       const int lda = (axstride == 1) ? aystride : axstride,
1974                 ldb = (bxstride == 1) ? bystride : bxstride;
1975
1976       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1977         {
1978           assert (gemm != NULL);
1979           const char *transa, *transb;
1980           if (try_blas & 2)
1981             transa = "C";
1982           else
1983             transa = axstride == 1 ? "N" : "T";
1984
1985           if (try_blas & 4)
1986             transb = "C";
1987           else
1988             transb = bxstride == 1 ? "N" : "T";
1989
1990           gemm (transa, transb , &m,
1991                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1992                 &ldc, 1, 1);
1993           return;
1994         }
1995     }
1996
1997   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1998     {
1999       /* This block of code implements a tuned matmul, derived from
2000          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2001
2002                Bo Kagstrom and Per Ling
2003                Department of Computing Science
2004                Umea University
2005                S-901 87 Umea, Sweden
2006
2007          from netlib.org, translated to C, and modified for matmul.m4.  */
2008
2009       const GFC_REAL_8 *a, *b;
2010       GFC_REAL_8 *c;
2011       const index_type m = xcount, n = ycount, k = count;
2012
2013       /* System generated locals */
2014       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2015                  i1, i2, i3, i4, i5, i6;
2016
2017       /* Local variables */
2018       GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
2019                  f13, f14, f23, f24, f33, f34, f43, f44;
2020       index_type i, j, l, ii, jj, ll;
2021       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2022       GFC_REAL_8 *t1;
2023
2024       a = abase;
2025       b = bbase;
2026       c = retarray->base_addr;
2027
2028       /* Parameter adjustments */
2029       c_dim1 = rystride;
2030       c_offset = 1 + c_dim1;
2031       c -= c_offset;
2032       a_dim1 = aystride;
2033       a_offset = 1 + a_dim1;
2034       a -= a_offset;
2035       b_dim1 = bystride;
2036       b_offset = 1 + b_dim1;
2037       b -= b_offset;
2038
2039       /* Empty c first.  */
2040       for (j=1; j<=n; j++)
2041         for (i=1; i<=m; i++)
2042           c[i + j * c_dim1] = (GFC_REAL_8)0;
2043
2044       /* Early exit if possible */
2045       if (m == 0 || n == 0 || k == 0)
2046         return;
2047
2048       /* Adjust size of t1 to what is needed.  */
2049       index_type t1_dim, a_sz;
2050       if (aystride == 1)
2051         a_sz = rystride;
2052       else
2053         a_sz = a_dim1;
2054
2055       t1_dim = a_sz * 256 + b_dim1;
2056       if (t1_dim > 65536)
2057         t1_dim = 65536;
2058
2059       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
2060
2061       /* Start turning the crank. */
2062       i1 = n;
2063       for (jj = 1; jj <= i1; jj += 512)
2064         {
2065           /* Computing MIN */
2066           i2 = 512;
2067           i3 = n - jj + 1;
2068           jsec = min(i2,i3);
2069           ujsec = jsec - jsec % 4;
2070           i2 = k;
2071           for (ll = 1; ll <= i2; ll += 256)
2072             {
2073               /* Computing MIN */
2074               i3 = 256;
2075               i4 = k - ll + 1;
2076               lsec = min(i3,i4);
2077               ulsec = lsec - lsec % 2;
2078
2079               i3 = m;
2080               for (ii = 1; ii <= i3; ii += 256)
2081                 {
2082                   /* Computing MIN */
2083                   i4 = 256;
2084                   i5 = m - ii + 1;
2085                   isec = min(i4,i5);
2086                   uisec = isec - isec % 2;
2087                   i4 = ll + ulsec - 1;
2088                   for (l = ll; l <= i4; l += 2)
2089                     {
2090                       i5 = ii + uisec - 1;
2091                       for (i = ii; i <= i5; i += 2)
2092                         {
2093                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2094                                         a[i + l * a_dim1];
2095                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2096                                         a[i + (l + 1) * a_dim1];
2097                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2098                                         a[i + 1 + l * a_dim1];
2099                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2100                                         a[i + 1 + (l + 1) * a_dim1];
2101                         }
2102                       if (uisec < isec)
2103                         {
2104                           t1[l - ll + 1 + (isec << 8) - 257] =
2105                                     a[ii + isec - 1 + l * a_dim1];
2106                           t1[l - ll + 2 + (isec << 8) - 257] =
2107                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2108                         }
2109                     }
2110                   if (ulsec < lsec)
2111                     {
2112                       i4 = ii + isec - 1;
2113                       for (i = ii; i<= i4; ++i)
2114                         {
2115                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2116                                     a[i + (ll + lsec - 1) * a_dim1];
2117                         }
2118                     }
2119
2120                   uisec = isec - isec % 4;
2121                   i4 = jj + ujsec - 1;
2122                   for (j = jj; j <= i4; j += 4)
2123                     {
2124                       i5 = ii + uisec - 1;
2125                       for (i = ii; i <= i5; i += 4)
2126                         {
2127                           f11 = c[i + j * c_dim1];
2128                           f21 = c[i + 1 + j * c_dim1];
2129                           f12 = c[i + (j + 1) * c_dim1];
2130                           f22 = c[i + 1 + (j + 1) * c_dim1];
2131                           f13 = c[i + (j + 2) * c_dim1];
2132                           f23 = c[i + 1 + (j + 2) * c_dim1];
2133                           f14 = c[i + (j + 3) * c_dim1];
2134                           f24 = c[i + 1 + (j + 3) * c_dim1];
2135                           f31 = c[i + 2 + j * c_dim1];
2136                           f41 = c[i + 3 + j * c_dim1];
2137                           f32 = c[i + 2 + (j + 1) * c_dim1];
2138                           f42 = c[i + 3 + (j + 1) * c_dim1];
2139                           f33 = c[i + 2 + (j + 2) * c_dim1];
2140                           f43 = c[i + 3 + (j + 2) * c_dim1];
2141                           f34 = c[i + 2 + (j + 3) * c_dim1];
2142                           f44 = c[i + 3 + (j + 3) * c_dim1];
2143                           i6 = ll + lsec - 1;
2144                           for (l = ll; l <= i6; ++l)
2145                             {
2146                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2147                                       * b[l + j * b_dim1];
2148                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2149                                       * b[l + j * b_dim1];
2150                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151                                       * b[l + (j + 1) * b_dim1];
2152                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153                                       * b[l + (j + 1) * b_dim1];
2154                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155                                       * b[l + (j + 2) * b_dim1];
2156                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157                                       * b[l + (j + 2) * b_dim1];
2158                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159                                       * b[l + (j + 3) * b_dim1];
2160                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161                                       * b[l + (j + 3) * b_dim1];
2162                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2163                                       * b[l + j * b_dim1];
2164                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2165                                       * b[l + j * b_dim1];
2166                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167                                       * b[l + (j + 1) * b_dim1];
2168                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169                                       * b[l + (j + 1) * b_dim1];
2170                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171                                       * b[l + (j + 2) * b_dim1];
2172                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173                                       * b[l + (j + 2) * b_dim1];
2174                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175                                       * b[l + (j + 3) * b_dim1];
2176                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177                                       * b[l + (j + 3) * b_dim1];
2178                             }
2179                           c[i + j * c_dim1] = f11;
2180                           c[i + 1 + j * c_dim1] = f21;
2181                           c[i + (j + 1) * c_dim1] = f12;
2182                           c[i + 1 + (j + 1) * c_dim1] = f22;
2183                           c[i + (j + 2) * c_dim1] = f13;
2184                           c[i + 1 + (j + 2) * c_dim1] = f23;
2185                           c[i + (j + 3) * c_dim1] = f14;
2186                           c[i + 1 + (j + 3) * c_dim1] = f24;
2187                           c[i + 2 + j * c_dim1] = f31;
2188                           c[i + 3 + j * c_dim1] = f41;
2189                           c[i + 2 + (j + 1) * c_dim1] = f32;
2190                           c[i + 3 + (j + 1) * c_dim1] = f42;
2191                           c[i + 2 + (j + 2) * c_dim1] = f33;
2192                           c[i + 3 + (j + 2) * c_dim1] = f43;
2193                           c[i + 2 + (j + 3) * c_dim1] = f34;
2194                           c[i + 3 + (j + 3) * c_dim1] = f44;
2195                         }
2196                       if (uisec < isec)
2197                         {
2198                           i5 = ii + isec - 1;
2199                           for (i = ii + uisec; i <= i5; ++i)
2200                             {
2201                               f11 = c[i + j * c_dim1];
2202                               f12 = c[i + (j + 1) * c_dim1];
2203                               f13 = c[i + (j + 2) * c_dim1];
2204                               f14 = c[i + (j + 3) * c_dim1];
2205                               i6 = ll + lsec - 1;
2206                               for (l = ll; l <= i6; ++l)
2207                                 {
2208                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2209                                           257] * b[l + j * b_dim1];
2210                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2211                                           257] * b[l + (j + 1) * b_dim1];
2212                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213                                           257] * b[l + (j + 2) * b_dim1];
2214                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215                                           257] * b[l + (j + 3) * b_dim1];
2216                                 }
2217                               c[i + j * c_dim1] = f11;
2218                               c[i + (j + 1) * c_dim1] = f12;
2219                               c[i + (j + 2) * c_dim1] = f13;
2220                               c[i + (j + 3) * c_dim1] = f14;
2221                             }
2222                         }
2223                     }
2224                   if (ujsec < jsec)
2225                     {
2226                       i4 = jj + jsec - 1;
2227                       for (j = jj + ujsec; j <= i4; ++j)
2228                         {
2229                           i5 = ii + uisec - 1;
2230                           for (i = ii; i <= i5; i += 4)
2231                             {
2232                               f11 = c[i + j * c_dim1];
2233                               f21 = c[i + 1 + j * c_dim1];
2234                               f31 = c[i + 2 + j * c_dim1];
2235                               f41 = c[i + 3 + j * c_dim1];
2236                               i6 = ll + lsec - 1;
2237                               for (l = ll; l <= i6; ++l)
2238                                 {
2239                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2240                                           257] * b[l + j * b_dim1];
2241                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2242                                           257] * b[l + j * b_dim1];
2243                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2244                                           257] * b[l + j * b_dim1];
2245                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2246                                           257] * b[l + j * b_dim1];
2247                                 }
2248                               c[i + j * c_dim1] = f11;
2249                               c[i + 1 + j * c_dim1] = f21;
2250                               c[i + 2 + j * c_dim1] = f31;
2251                               c[i + 3 + j * c_dim1] = f41;
2252                             }
2253                           i5 = ii + isec - 1;
2254                           for (i = ii + uisec; i <= i5; ++i)
2255                             {
2256                               f11 = c[i + j * c_dim1];
2257                               i6 = ll + lsec - 1;
2258                               for (l = ll; l <= i6; ++l)
2259                                 {
2260                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2261                                           257] * b[l + j * b_dim1];
2262                                 }
2263                               c[i + j * c_dim1] = f11;
2264                             }
2265                         }
2266                     }
2267                 }
2268             }
2269         }
2270       free(t1);
2271       return;
2272     }
2273   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2274     {
2275       if (GFC_DESCRIPTOR_RANK (a) != 1)
2276         {
2277           const GFC_REAL_8 *restrict abase_x;
2278           const GFC_REAL_8 *restrict bbase_y;
2279           GFC_REAL_8 *restrict dest_y;
2280           GFC_REAL_8 s;
2281
2282           for (y = 0; y < ycount; y++)
2283             {
2284               bbase_y = &bbase[y*bystride];
2285               dest_y = &dest[y*rystride];
2286               for (x = 0; x < xcount; x++)
2287                 {
2288                   abase_x = &abase[x*axstride];
2289                   s = (GFC_REAL_8) 0;
2290                   for (n = 0; n < count; n++)
2291                     s += abase_x[n] * bbase_y[n];
2292                   dest_y[x] = s;
2293                 }
2294             }
2295         }
2296       else
2297         {
2298           const GFC_REAL_8 *restrict bbase_y;
2299           GFC_REAL_8 s;
2300
2301           for (y = 0; y < ycount; y++)
2302             {
2303               bbase_y = &bbase[y*bystride];
2304               s = (GFC_REAL_8) 0;
2305               for (n = 0; n < count; n++)
2306                 s += abase[n*axstride] * bbase_y[n];
2307               dest[y*rystride] = s;
2308             }
2309         }
2310     }
2311   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2312     {
2313       const GFC_REAL_8 *restrict bbase_y;
2314       GFC_REAL_8 s;
2315
2316       for (y = 0; y < ycount; y++)
2317         {
2318           bbase_y = &bbase[y*bystride];
2319           s = (GFC_REAL_8) 0;
2320           for (n = 0; n < count; n++)
2321             s += abase[n*axstride] * bbase_y[n*bxstride];
2322           dest[y*rxstride] = s;
2323         }
2324     }
2325   else if (axstride < aystride)
2326     {
2327       for (y = 0; y < ycount; y++)
2328         for (x = 0; x < xcount; x++)
2329           dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2330
2331       for (y = 0; y < ycount; y++)
2332         for (n = 0; n < count; n++)
2333           for (x = 0; x < xcount; x++)
2334             /* dest[x,y] += a[x,n] * b[n,y] */
2335             dest[x*rxstride + y*rystride] +=
2336                                         abase[x*axstride + n*aystride] *
2337                                         bbase[n*bxstride + y*bystride];
2338     }
2339   else
2340     {
2341       const GFC_REAL_8 *restrict abase_x;
2342       const GFC_REAL_8 *restrict bbase_y;
2343       GFC_REAL_8 *restrict dest_y;
2344       GFC_REAL_8 s;
2345
2346       for (y = 0; y < ycount; y++)
2347         {
2348           bbase_y = &bbase[y*bystride];
2349           dest_y = &dest[y*rystride];
2350           for (x = 0; x < xcount; x++)
2351             {
2352               abase_x = &abase[x*axstride];
2353               s = (GFC_REAL_8) 0;
2354               for (n = 0; n < count; n++)
2355                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2356               dest_y[x*rxstride] = s;
2357             }
2358         }
2359     }
2360 }
2361 #undef POW3
2362 #undef min
2363 #undef max
2364
2365
2366 /* Compiling main function, with selection code for the processor.  */
2367
2368 /* Currently, this is i386 only.  Adjust for other architectures.  */
2369
2370 void matmul_r8 (gfc_array_r8 * const restrict retarray,
2371         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2372         int blas_limit, blas_call gemm)
2373 {
2374   static void (*matmul_p) (gfc_array_r8 * const restrict retarray,
2375         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2376         int blas_limit, blas_call gemm);
2377
2378   void (*matmul_fn) (gfc_array_r8 * const restrict retarray,
2379         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2380         int blas_limit, blas_call gemm);
2381
2382   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2383   if (matmul_fn == NULL)
2384     {
2385       matmul_fn = matmul_r8_vanilla;
2386       if (__builtin_cpu_is ("intel"))
2387         {
2388           /* Run down the available processors in order of preference.  */
2389 #ifdef HAVE_AVX512F
2390           if (__builtin_cpu_supports ("avx512f"))
2391             {
2392               matmul_fn = matmul_r8_avx512f;
2393               goto store;
2394             }
2395
2396 #endif  /* HAVE_AVX512F */
2397
2398 #ifdef HAVE_AVX2
2399           if (__builtin_cpu_supports ("avx2")
2400               && __builtin_cpu_supports ("fma"))
2401             {
2402               matmul_fn = matmul_r8_avx2;
2403               goto store;
2404             }
2405
2406 #endif
2407
2408 #ifdef HAVE_AVX
2409           if (__builtin_cpu_supports ("avx"))
2410             {
2411               matmul_fn = matmul_r8_avx;
2412               goto store;
2413             }
2414 #endif  /* HAVE_AVX */
2415         }
2416     else if (__builtin_cpu_is ("amd"))
2417       {
2418 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2419         if (__builtin_cpu_supports ("avx")
2420             && __builtin_cpu_supports ("fma"))
2421           {
2422             matmul_fn = matmul_r8_avx128_fma3;
2423             goto store;
2424           }
2425 #endif
2426 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2427         if (__builtin_cpu_supports ("avx")
2428             && __builtin_cpu_supports ("fma4"))
2429           {
2430             matmul_fn = matmul_r8_avx128_fma4;
2431             goto store;
2432           }
2433 #endif
2434
2435       }
2436    store:
2437       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2438    }
2439
2440    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2441 }
2442
2443 #else  /* Just the vanilla function.  */
2444
2445 void
2446 matmul_r8 (gfc_array_r8 * const restrict retarray,
2447         gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2448         int blas_limit, blas_call gemm)
2449 {
2450   const GFC_REAL_8 * restrict abase;
2451   const GFC_REAL_8 * restrict bbase;
2452   GFC_REAL_8 * restrict dest;
2453
2454   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2455   index_type x, y, n, count, xcount, ycount;
2456
2457   assert (GFC_DESCRIPTOR_RANK (a) == 2
2458           || GFC_DESCRIPTOR_RANK (b) == 2);
2459
2460 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2461
2462    Either A or B (but not both) can be rank 1:
2463
2464    o One-dimensional argument A is implicitly treated as a row matrix
2465      dimensioned [1,count], so xcount=1.
2466
2467    o One-dimensional argument B is implicitly treated as a column matrix
2468      dimensioned [count, 1], so ycount=1.
2469 */
2470
2471   if (retarray->base_addr == NULL)
2472     {
2473       if (GFC_DESCRIPTOR_RANK (a) == 1)
2474         {
2475           GFC_DIMENSION_SET(retarray->dim[0], 0,
2476                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2477         }
2478       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2479         {
2480           GFC_DIMENSION_SET(retarray->dim[0], 0,
2481                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2482         }
2483       else
2484         {
2485           GFC_DIMENSION_SET(retarray->dim[0], 0,
2486                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487
2488           GFC_DIMENSION_SET(retarray->dim[1], 0,
2489                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2490                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2491         }
2492
2493       retarray->base_addr
2494         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
2495       retarray->offset = 0;
2496     }
2497   else if (unlikely (compile_options.bounds_check))
2498     {
2499       index_type ret_extent, arg_extent;
2500
2501       if (GFC_DESCRIPTOR_RANK (a) == 1)
2502         {
2503           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2504           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2505           if (arg_extent != ret_extent)
2506             runtime_error ("Array bound mismatch for dimension 1 of "
2507                            "array (%ld/%ld) ",
2508                            (long int) ret_extent, (long int) arg_extent);
2509         }
2510       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2511         {
2512           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2513           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2514           if (arg_extent != ret_extent)
2515             runtime_error ("Array bound mismatch for dimension 1 of "
2516                            "array (%ld/%ld) ",
2517                            (long int) ret_extent, (long int) arg_extent);
2518         }
2519       else
2520         {
2521           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2522           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2523           if (arg_extent != ret_extent)
2524             runtime_error ("Array bound mismatch for dimension 1 of "
2525                            "array (%ld/%ld) ",
2526                            (long int) ret_extent, (long int) arg_extent);
2527
2528           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2529           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2530           if (arg_extent != ret_extent)
2531             runtime_error ("Array bound mismatch for dimension 2 of "
2532                            "array (%ld/%ld) ",
2533                            (long int) ret_extent, (long int) arg_extent);
2534         }
2535     }
2536
2537
2538   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2539     {
2540       /* One-dimensional result may be addressed in the code below
2541          either as a row or a column matrix. We want both cases to
2542          work. */
2543       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2544     }
2545   else
2546     {
2547       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2549     }
2550
2551
2552   if (GFC_DESCRIPTOR_RANK (a) == 1)
2553     {
2554       /* Treat it as a a row matrix A[1,count]. */
2555       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2556       aystride = 1;
2557
2558       xcount = 1;
2559       count = GFC_DESCRIPTOR_EXTENT(a,0);
2560     }
2561   else
2562     {
2563       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2564       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2565
2566       count = GFC_DESCRIPTOR_EXTENT(a,1);
2567       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2568     }
2569
2570   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2571     {
2572       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2573         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2574                        "in dimension 1: is %ld, should be %ld",
2575                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2576     }
2577
2578   if (GFC_DESCRIPTOR_RANK (b) == 1)
2579     {
2580       /* Treat it as a column matrix B[count,1] */
2581       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2582
2583       /* bystride should never be used for 1-dimensional b.
2584          The value is only used for calculation of the
2585          memory by the buffer.  */
2586       bystride = 256;
2587       ycount = 1;
2588     }
2589   else
2590     {
2591       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2592       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2593       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2594     }
2595
2596   abase = a->base_addr;
2597   bbase = b->base_addr;
2598   dest = retarray->base_addr;
2599
2600   /* Now that everything is set up, we perform the multiplication
2601      itself.  */
2602
2603 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2604 #define min(a,b) ((a) <= (b) ? (a) : (b))
2605 #define max(a,b) ((a) >= (b) ? (a) : (b))
2606
2607   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2608       && (bxstride == 1 || bystride == 1)
2609       && (((float) xcount) * ((float) ycount) * ((float) count)
2610           > POW3(blas_limit)))
2611     {
2612       const int m = xcount, n = ycount, k = count, ldc = rystride;
2613       const GFC_REAL_8 one = 1, zero = 0;
2614       const int lda = (axstride == 1) ? aystride : axstride,
2615                 ldb = (bxstride == 1) ? bystride : bxstride;
2616
2617       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2618         {
2619           assert (gemm != NULL);
2620           const char *transa, *transb;
2621           if (try_blas & 2)
2622             transa = "C";
2623           else
2624             transa = axstride == 1 ? "N" : "T";
2625
2626           if (try_blas & 4)
2627             transb = "C";
2628           else
2629             transb = bxstride == 1 ? "N" : "T";
2630
2631           gemm (transa, transb , &m,
2632                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2633                 &ldc, 1, 1);
2634           return;
2635         }
2636     }
2637
2638   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2639     {
2640       /* This block of code implements a tuned matmul, derived from
2641          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2642
2643                Bo Kagstrom and Per Ling
2644                Department of Computing Science
2645                Umea University
2646                S-901 87 Umea, Sweden
2647
2648          from netlib.org, translated to C, and modified for matmul.m4.  */
2649
2650       const GFC_REAL_8 *a, *b;
2651       GFC_REAL_8 *c;
2652       const index_type m = xcount, n = ycount, k = count;
2653
2654       /* System generated locals */
2655       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2656                  i1, i2, i3, i4, i5, i6;
2657
2658       /* Local variables */
2659       GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
2660                  f13, f14, f23, f24, f33, f34, f43, f44;
2661       index_type i, j, l, ii, jj, ll;
2662       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2663       GFC_REAL_8 *t1;
2664
2665       a = abase;
2666       b = bbase;
2667       c = retarray->base_addr;
2668
2669       /* Parameter adjustments */
2670       c_dim1 = rystride;
2671       c_offset = 1 + c_dim1;
2672       c -= c_offset;
2673       a_dim1 = aystride;
2674       a_offset = 1 + a_dim1;
2675       a -= a_offset;
2676       b_dim1 = bystride;
2677       b_offset = 1 + b_dim1;
2678       b -= b_offset;
2679
2680       /* Empty c first.  */
2681       for (j=1; j<=n; j++)
2682         for (i=1; i<=m; i++)
2683           c[i + j * c_dim1] = (GFC_REAL_8)0;
2684
2685       /* Early exit if possible */
2686       if (m == 0 || n == 0 || k == 0)
2687         return;
2688
2689       /* Adjust size of t1 to what is needed.  */
2690       index_type t1_dim, a_sz;
2691       if (aystride == 1)
2692         a_sz = rystride;
2693       else
2694         a_sz = a_dim1;
2695
2696       t1_dim = a_sz * 256 + b_dim1;
2697       if (t1_dim > 65536)
2698         t1_dim = 65536;
2699
2700       t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
2701
2702       /* Start turning the crank. */
2703       i1 = n;
2704       for (jj = 1; jj <= i1; jj += 512)
2705         {
2706           /* Computing MIN */
2707           i2 = 512;
2708           i3 = n - jj + 1;
2709           jsec = min(i2,i3);
2710           ujsec = jsec - jsec % 4;
2711           i2 = k;
2712           for (ll = 1; ll <= i2; ll += 256)
2713             {
2714               /* Computing MIN */
2715               i3 = 256;
2716               i4 = k - ll + 1;
2717               lsec = min(i3,i4);
2718               ulsec = lsec - lsec % 2;
2719
2720               i3 = m;
2721               for (ii = 1; ii <= i3; ii += 256)
2722                 {
2723                   /* Computing MIN */
2724                   i4 = 256;
2725                   i5 = m - ii + 1;
2726                   isec = min(i4,i5);
2727                   uisec = isec - isec % 2;
2728                   i4 = ll + ulsec - 1;
2729                   for (l = ll; l <= i4; l += 2)
2730                     {
2731                       i5 = ii + uisec - 1;
2732                       for (i = ii; i <= i5; i += 2)
2733                         {
2734                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2735                                         a[i + l * a_dim1];
2736                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2737                                         a[i + (l + 1) * a_dim1];
2738                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2739                                         a[i + 1 + l * a_dim1];
2740                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2741                                         a[i + 1 + (l + 1) * a_dim1];
2742                         }
2743                       if (uisec < isec)
2744                         {
2745                           t1[l - ll + 1 + (isec << 8) - 257] =
2746                                     a[ii + isec - 1 + l * a_dim1];
2747                           t1[l - ll + 2 + (isec << 8) - 257] =
2748                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2749                         }
2750                     }
2751                   if (ulsec < lsec)
2752                     {
2753                       i4 = ii + isec - 1;
2754                       for (i = ii; i<= i4; ++i)
2755                         {
2756                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2757                                     a[i + (ll + lsec - 1) * a_dim1];
2758                         }
2759                     }
2760
2761                   uisec = isec - isec % 4;
2762                   i4 = jj + ujsec - 1;
2763                   for (j = jj; j <= i4; j += 4)
2764                     {
2765                       i5 = ii + uisec - 1;
2766                       for (i = ii; i <= i5; i += 4)
2767                         {
2768                           f11 = c[i + j * c_dim1];
2769                           f21 = c[i + 1 + j * c_dim1];
2770                           f12 = c[i + (j + 1) * c_dim1];
2771                           f22 = c[i + 1 + (j + 1) * c_dim1];
2772                           f13 = c[i + (j + 2) * c_dim1];
2773                           f23 = c[i + 1 + (j + 2) * c_dim1];
2774                           f14 = c[i + (j + 3) * c_dim1];
2775                           f24 = c[i + 1 + (j + 3) * c_dim1];
2776                           f31 = c[i + 2 + j * c_dim1];
2777                           f41 = c[i + 3 + j * c_dim1];
2778                           f32 = c[i + 2 + (j + 1) * c_dim1];
2779                           f42 = c[i + 3 + (j + 1) * c_dim1];
2780                           f33 = c[i + 2 + (j + 2) * c_dim1];
2781                           f43 = c[i + 3 + (j + 2) * c_dim1];
2782                           f34 = c[i + 2 + (j + 3) * c_dim1];
2783                           f44 = c[i + 3 + (j + 3) * c_dim1];
2784                           i6 = ll + lsec - 1;
2785                           for (l = ll; l <= i6; ++l)
2786                             {
2787                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2788                                       * b[l + j * b_dim1];
2789                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2790                                       * b[l + j * b_dim1];
2791                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2792                                       * b[l + (j + 1) * b_dim1];
2793                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2794                                       * b[l + (j + 1) * b_dim1];
2795                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2796                                       * b[l + (j + 2) * b_dim1];
2797                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2798                                       * b[l + (j + 2) * b_dim1];
2799                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2800                                       * b[l + (j + 3) * b_dim1];
2801                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2802                                       * b[l + (j + 3) * b_dim1];
2803                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2804                                       * b[l + j * b_dim1];
2805                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2806                                       * b[l + j * b_dim1];
2807                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2808                                       * b[l + (j + 1) * b_dim1];
2809                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2810                                       * b[l + (j + 1) * b_dim1];
2811                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2812                                       * b[l + (j + 2) * b_dim1];
2813                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2814                                       * b[l + (j + 2) * b_dim1];
2815                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2816                                       * b[l + (j + 3) * b_dim1];
2817                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2818                                       * b[l + (j + 3) * b_dim1];
2819                             }
2820                           c[i + j * c_dim1] = f11;
2821                           c[i + 1 + j * c_dim1] = f21;
2822                           c[i + (j + 1) * c_dim1] = f12;
2823                           c[i + 1 + (j + 1) * c_dim1] = f22;
2824                           c[i + (j + 2) * c_dim1] = f13;
2825                           c[i + 1 + (j + 2) * c_dim1] = f23;
2826                           c[i + (j + 3) * c_dim1] = f14;
2827                           c[i + 1 + (j + 3) * c_dim1] = f24;
2828                           c[i + 2 + j * c_dim1] = f31;
2829                           c[i + 3 + j * c_dim1] = f41;
2830                           c[i + 2 + (j + 1) * c_dim1] = f32;
2831                           c[i + 3 + (j + 1) * c_dim1] = f42;
2832                           c[i + 2 + (j + 2) * c_dim1] = f33;
2833                           c[i + 3 + (j + 2) * c_dim1] = f43;
2834                           c[i + 2 + (j + 3) * c_dim1] = f34;
2835                           c[i + 3 + (j + 3) * c_dim1] = f44;
2836                         }
2837                       if (uisec < isec)
2838                         {
2839                           i5 = ii + isec - 1;
2840                           for (i = ii + uisec; i <= i5; ++i)
2841                             {
2842                               f11 = c[i + j * c_dim1];
2843                               f12 = c[i + (j + 1) * c_dim1];
2844                               f13 = c[i + (j + 2) * c_dim1];
2845                               f14 = c[i + (j + 3) * c_dim1];
2846                               i6 = ll + lsec - 1;
2847                               for (l = ll; l <= i6; ++l)
2848                                 {
2849                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2850                                           257] * b[l + j * b_dim1];
2851                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2852                                           257] * b[l + (j + 1) * b_dim1];
2853                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2854                                           257] * b[l + (j + 2) * b_dim1];
2855                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856                                           257] * b[l + (j + 3) * b_dim1];
2857                                 }
2858                               c[i + j * c_dim1] = f11;
2859                               c[i + (j + 1) * c_dim1] = f12;
2860                               c[i + (j + 2) * c_dim1] = f13;
2861                               c[i + (j + 3) * c_dim1] = f14;
2862                             }
2863                         }
2864                     }
2865                   if (ujsec < jsec)
2866                     {
2867                       i4 = jj + jsec - 1;
2868                       for (j = jj + ujsec; j <= i4; ++j)
2869                         {
2870                           i5 = ii + uisec - 1;
2871                           for (i = ii; i <= i5; i += 4)
2872                             {
2873                               f11 = c[i + j * c_dim1];
2874                               f21 = c[i + 1 + j * c_dim1];
2875                               f31 = c[i + 2 + j * c_dim1];
2876                               f41 = c[i + 3 + j * c_dim1];
2877                               i6 = ll + lsec - 1;
2878                               for (l = ll; l <= i6; ++l)
2879                                 {
2880                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2881                                           257] * b[l + j * b_dim1];
2882                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2883                                           257] * b[l + j * b_dim1];
2884                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2885                                           257] * b[l + j * b_dim1];
2886                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2887                                           257] * b[l + j * b_dim1];
2888                                 }
2889                               c[i + j * c_dim1] = f11;
2890                               c[i + 1 + j * c_dim1] = f21;
2891                               c[i + 2 + j * c_dim1] = f31;
2892                               c[i + 3 + j * c_dim1] = f41;
2893                             }
2894                           i5 = ii + isec - 1;
2895                           for (i = ii + uisec; i <= i5; ++i)
2896                             {
2897                               f11 = c[i + j * c_dim1];
2898                               i6 = ll + lsec - 1;
2899                               for (l = ll; l <= i6; ++l)
2900                                 {
2901                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2902                                           257] * b[l + j * b_dim1];
2903                                 }
2904                               c[i + j * c_dim1] = f11;
2905                             }
2906                         }
2907                     }
2908                 }
2909             }
2910         }
2911       free(t1);
2912       return;
2913     }
2914   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2915     {
2916       if (GFC_DESCRIPTOR_RANK (a) != 1)
2917         {
2918           const GFC_REAL_8 *restrict abase_x;
2919           const GFC_REAL_8 *restrict bbase_y;
2920           GFC_REAL_8 *restrict dest_y;
2921           GFC_REAL_8 s;
2922
2923           for (y = 0; y < ycount; y++)
2924             {
2925               bbase_y = &bbase[y*bystride];
2926               dest_y = &dest[y*rystride];
2927               for (x = 0; x < xcount; x++)
2928                 {
2929                   abase_x = &abase[x*axstride];
2930                   s = (GFC_REAL_8) 0;
2931                   for (n = 0; n < count; n++)
2932                     s += abase_x[n] * bbase_y[n];
2933                   dest_y[x] = s;
2934                 }
2935             }
2936         }
2937       else
2938         {
2939           const GFC_REAL_8 *restrict bbase_y;
2940           GFC_REAL_8 s;
2941
2942           for (y = 0; y < ycount; y++)
2943             {
2944               bbase_y = &bbase[y*bystride];
2945               s = (GFC_REAL_8) 0;
2946               for (n = 0; n < count; n++)
2947                 s += abase[n*axstride] * bbase_y[n];
2948               dest[y*rystride] = s;
2949             }
2950         }
2951     }
2952   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2953     {
2954       const GFC_REAL_8 *restrict bbase_y;
2955       GFC_REAL_8 s;
2956
2957       for (y = 0; y < ycount; y++)
2958         {
2959           bbase_y = &bbase[y*bystride];
2960           s = (GFC_REAL_8) 0;
2961           for (n = 0; n < count; n++)
2962             s += abase[n*axstride] * bbase_y[n*bxstride];
2963           dest[y*rxstride] = s;
2964         }
2965     }
2966   else if (axstride < aystride)
2967     {
2968       for (y = 0; y < ycount; y++)
2969         for (x = 0; x < xcount; x++)
2970           dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2971
2972       for (y = 0; y < ycount; y++)
2973         for (n = 0; n < count; n++)
2974           for (x = 0; x < xcount; x++)
2975             /* dest[x,y] += a[x,n] * b[n,y] */
2976             dest[x*rxstride + y*rystride] +=
2977                                         abase[x*axstride + n*aystride] *
2978                                         bbase[n*bxstride + y*bystride];
2979     }
2980   else
2981     {
2982       const GFC_REAL_8 *restrict abase_x;
2983       const GFC_REAL_8 *restrict bbase_y;
2984       GFC_REAL_8 *restrict dest_y;
2985       GFC_REAL_8 s;
2986
2987       for (y = 0; y < ycount; y++)
2988         {
2989           bbase_y = &bbase[y*bystride];
2990           dest_y = &dest[y*rystride];
2991           for (x = 0; x < xcount; x++)
2992             {
2993               abase_x = &abase[x*axstride];
2994               s = (GFC_REAL_8) 0;
2995               for (n = 0; n < count; n++)
2996                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2997               dest_y[x*rxstride] = s;
2998             }
2999         }
3000     }
3001 }
3002 #undef POW3
3003 #undef min
3004 #undef max
3005
3006 #endif
3007 #endif
3008