libgfortran/generated/matmul_c8.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_COMPLEX_8)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
  39                           const int *, const GFC_COMPLEX_8 *, const int *,
  40                           const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
  73         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_c8);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
  84         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
  88         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_COMPLEX_8 * restrict abase;
  92   const GFC_COMPLEX_8 * restrict bbase;
  93   GFC_COMPLEX_8 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Incorrect extent in return array in"
 148                            " MATMUL intrinsic: is %ld, should be %ld",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Incorrect extent in return array in"
 157                            " MATMUL intrinsic: is %ld, should be %ld",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 1:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 2:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177         }
 178     }
 179
 180
 181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 182     {
 183       /* One-dimensional result may be addressed in the code below
 184          either as a row or a column matrix. We want both cases to
 185          work. */
 186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 187     }
 188   else
 189     {
 190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 192     }
 193
 194
 195   if (GFC_DESCRIPTOR_RANK (a) == 1)
 196     {
 197       /* Treat it as a a row matrix A[1,count]. */
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = 1;
 200
 201       xcount = 1;
 202       count = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204   else
 205     {
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 208
 209       count = GFC_DESCRIPTOR_EXTENT(a,1);
 210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212
 213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 214     {
 215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 216         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          in case it is we want it to cause a segfault, rather than
 226          an incorrect result. */
 227       bystride = 0xDEADBEEF;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_COMPLEX_8 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 262                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 263                 &ldc, 1, 1);
 264           return;
 265         }
 266     }
 267
 268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 269     {
 270       /* This block of code implements a tuned matmul, derived from
 271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 272
 273                Bo Kagstrom and Per Ling
 274                Department of Computing Science
 275                Umea University
 276                S-901 87 Umea, Sweden
 277
 278          from netlib.org, translated to C, and modified for matmul.m4.  */
 279
 280       const GFC_COMPLEX_8 *a, *b;
 281       GFC_COMPLEX_8 *c;
 282       const index_type m = xcount, n = ycount, k = count;
 283
 284       /* System generated locals */
 285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 286                  i1, i2, i3, i4, i5, i6;
 287
 288       /* Local variables */
 289       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
 290                  f11, f12, f21, f22, f31, f32, f41, f42,
 291                  f13, f14, f23, f24, f33, f34, f43, f44;
 292       index_type i, j, l, ii, jj, ll;
 293       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 294
 295       a = abase;
 296       b = bbase;
 297       c = retarray->base_addr;
 298
 299       /* Parameter adjustments */
 300       c_dim1 = rystride;
 301       c_offset = 1 + c_dim1;
 302       c -= c_offset;
 303       a_dim1 = aystride;
 304       a_offset = 1 + a_dim1;
 305       a -= a_offset;
 306       b_dim1 = bystride;
 307       b_offset = 1 + b_dim1;
 308       b -= b_offset;
 309
 310       /* Early exit if possible */
 311       if (m == 0 || n == 0 || k == 0)
 312         return;
 313
 314       /* Empty c first.  */
 315       for (j=1; j<=n; j++)
 316         for (i=1; i<=m; i++)
 317           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
 318
 319       /* Start turning the crank. */
 320       i1 = n;
 321       for (jj = 1; jj <= i1; jj += 512)
 322         {
 323           /* Computing MIN */
 324           i2 = 512;
 325           i3 = n - jj + 1;
 326           jsec = min(i2,i3);
 327           ujsec = jsec - jsec % 4;
 328           i2 = k;
 329           for (ll = 1; ll <= i2; ll += 256)
 330             {
 331               /* Computing MIN */
 332               i3 = 256;
 333               i4 = k - ll + 1;
 334               lsec = min(i3,i4);
 335               ulsec = lsec - lsec % 2;
 336
 337               i3 = m;
 338               for (ii = 1; ii <= i3; ii += 256)
 339                 {
 340                   /* Computing MIN */
 341                   i4 = 256;
 342                   i5 = m - ii + 1;
 343                   isec = min(i4,i5);
 344                   uisec = isec - isec % 2;
 345                   i4 = ll + ulsec - 1;
 346                   for (l = ll; l <= i4; l += 2)
 347                     {
 348                       i5 = ii + uisec - 1;
 349                       for (i = ii; i <= i5; i += 2)
 350                         {
 351                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 352                                         a[i + l * a_dim1];
 353                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 354                                         a[i + (l + 1) * a_dim1];
 355                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 356                                         a[i + 1 + l * a_dim1];
 357                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 358                                         a[i + 1 + (l + 1) * a_dim1];
 359                         }
 360                       if (uisec < isec)
 361                         {
 362                           t1[l - ll + 1 + (isec << 8) - 257] =
 363                                     a[ii + isec - 1 + l * a_dim1];
 364                           t1[l - ll + 2 + (isec << 8) - 257] =
 365                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 366                         }
 367                     }
 368                   if (ulsec < lsec)
 369                     {
 370                       i4 = ii + isec - 1;
 371                       for (i = ii; i<= i4; ++i)
 372                         {
 373                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 374                                     a[i + (ll + lsec - 1) * a_dim1];
 375                         }
 376                     }
 377
 378                   uisec = isec - isec % 4;
 379                   i4 = jj + ujsec - 1;
 380                   for (j = jj; j <= i4; j += 4)
 381                     {
 382                       i5 = ii + uisec - 1;
 383                       for (i = ii; i <= i5; i += 4)
 384                         {
 385                           f11 = c[i + j * c_dim1];
 386                           f21 = c[i + 1 + j * c_dim1];
 387                           f12 = c[i + (j + 1) * c_dim1];
 388                           f22 = c[i + 1 + (j + 1) * c_dim1];
 389                           f13 = c[i + (j + 2) * c_dim1];
 390                           f23 = c[i + 1 + (j + 2) * c_dim1];
 391                           f14 = c[i + (j + 3) * c_dim1];
 392                           f24 = c[i + 1 + (j + 3) * c_dim1];
 393                           f31 = c[i + 2 + j * c_dim1];
 394                           f41 = c[i + 3 + j * c_dim1];
 395                           f32 = c[i + 2 + (j + 1) * c_dim1];
 396                           f42 = c[i + 3 + (j + 1) * c_dim1];
 397                           f33 = c[i + 2 + (j + 2) * c_dim1];
 398                           f43 = c[i + 3 + (j + 2) * c_dim1];
 399                           f34 = c[i + 2 + (j + 3) * c_dim1];
 400                           f44 = c[i + 3 + (j + 3) * c_dim1];
 401                           i6 = ll + lsec - 1;
 402                           for (l = ll; l <= i6; ++l)
 403                             {
 404                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 405                                       * b[l + j * b_dim1];
 406                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 407                                       * b[l + j * b_dim1];
 408                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 409                                       * b[l + (j + 1) * b_dim1];
 410                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 411                                       * b[l + (j + 1) * b_dim1];
 412                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 413                                       * b[l + (j + 2) * b_dim1];
 414                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 415                                       * b[l + (j + 2) * b_dim1];
 416                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 417                                       * b[l + (j + 3) * b_dim1];
 418                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 419                                       * b[l + (j + 3) * b_dim1];
 420                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 421                                       * b[l + j * b_dim1];
 422                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 423                                       * b[l + j * b_dim1];
 424                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 425                                       * b[l + (j + 1) * b_dim1];
 426                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 427                                       * b[l + (j + 1) * b_dim1];
 428                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 429                                       * b[l + (j + 2) * b_dim1];
 430                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 431                                       * b[l + (j + 2) * b_dim1];
 432                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 433                                       * b[l + (j + 3) * b_dim1];
 434                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 435                                       * b[l + (j + 3) * b_dim1];
 436                             }
 437                           c[i + j * c_dim1] = f11;
 438                           c[i + 1 + j * c_dim1] = f21;
 439                           c[i + (j + 1) * c_dim1] = f12;
 440                           c[i + 1 + (j + 1) * c_dim1] = f22;
 441                           c[i + (j + 2) * c_dim1] = f13;
 442                           c[i + 1 + (j + 2) * c_dim1] = f23;
 443                           c[i + (j + 3) * c_dim1] = f14;
 444                           c[i + 1 + (j + 3) * c_dim1] = f24;
 445                           c[i + 2 + j * c_dim1] = f31;
 446                           c[i + 3 + j * c_dim1] = f41;
 447                           c[i + 2 + (j + 1) * c_dim1] = f32;
 448                           c[i + 3 + (j + 1) * c_dim1] = f42;
 449                           c[i + 2 + (j + 2) * c_dim1] = f33;
 450                           c[i + 3 + (j + 2) * c_dim1] = f43;
 451                           c[i + 2 + (j + 3) * c_dim1] = f34;
 452                           c[i + 3 + (j + 3) * c_dim1] = f44;
 453                         }
 454                       if (uisec < isec)
 455                         {
 456                           i5 = ii + isec - 1;
 457                           for (i = ii + uisec; i <= i5; ++i)
 458                             {
 459                               f11 = c[i + j * c_dim1];
 460                               f12 = c[i + (j + 1) * c_dim1];
 461                               f13 = c[i + (j + 2) * c_dim1];
 462                               f14 = c[i + (j + 3) * c_dim1];
 463                               i6 = ll + lsec - 1;
 464                               for (l = ll; l <= i6; ++l)
 465                                 {
 466                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 467                                           257] * b[l + j * b_dim1];
 468                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 469                                           257] * b[l + (j + 1) * b_dim1];
 470                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 471                                           257] * b[l + (j + 2) * b_dim1];
 472                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 473                                           257] * b[l + (j + 3) * b_dim1];
 474                                 }
 475                               c[i + j * c_dim1] = f11;
 476                               c[i + (j + 1) * c_dim1] = f12;
 477                               c[i + (j + 2) * c_dim1] = f13;
 478                               c[i + (j + 3) * c_dim1] = f14;
 479                             }
 480                         }
 481                     }
 482                   if (ujsec < jsec)
 483                     {
 484                       i4 = jj + jsec - 1;
 485                       for (j = jj + ujsec; j <= i4; ++j)
 486                         {
 487                           i5 = ii + uisec - 1;
 488                           for (i = ii; i <= i5; i += 4)
 489                             {
 490                               f11 = c[i + j * c_dim1];
 491                               f21 = c[i + 1 + j * c_dim1];
 492                               f31 = c[i + 2 + j * c_dim1];
 493                               f41 = c[i + 3 + j * c_dim1];
 494                               i6 = ll + lsec - 1;
 495                               for (l = ll; l <= i6; ++l)
 496                                 {
 497                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 498                                           257] * b[l + j * b_dim1];
 499                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 500                                           257] * b[l + j * b_dim1];
 501                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 502                                           257] * b[l + j * b_dim1];
 503                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 504                                           257] * b[l + j * b_dim1];
 505                                 }
 506                               c[i + j * c_dim1] = f11;
 507                               c[i + 1 + j * c_dim1] = f21;
 508                               c[i + 2 + j * c_dim1] = f31;
 509                               c[i + 3 + j * c_dim1] = f41;
 510                             }
 511                           i5 = ii + isec - 1;
 512                           for (i = ii + uisec; i <= i5; ++i)
 513                             {
 514                               f11 = c[i + j * c_dim1];
 515                               i6 = ll + lsec - 1;
 516                               for (l = ll; l <= i6; ++l)
 517                                 {
 518                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 519                                           257] * b[l + j * b_dim1];
 520                                 }
 521                               c[i + j * c_dim1] = f11;
 522                             }
 523                         }
 524                     }
 525                 }
 526             }
 527         }
 528       return;
 529     }
 530   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 531     {
 532       if (GFC_DESCRIPTOR_RANK (a) != 1)
 533         {
 534           const GFC_COMPLEX_8 *restrict abase_x;
 535           const GFC_COMPLEX_8 *restrict bbase_y;
 536           GFC_COMPLEX_8 *restrict dest_y;
 537           GFC_COMPLEX_8 s;
 538
 539           for (y = 0; y < ycount; y++)
 540             {
 541               bbase_y = &bbase[y*bystride];
 542               dest_y = &dest[y*rystride];
 543               for (x = 0; x < xcount; x++)
 544                 {
 545                   abase_x = &abase[x*axstride];
 546                   s = (GFC_COMPLEX_8) 0;
 547                   for (n = 0; n < count; n++)
 548                     s += abase_x[n] * bbase_y[n];
 549                   dest_y[x] = s;
 550                 }
 551             }
 552         }
 553       else
 554         {
 555           const GFC_COMPLEX_8 *restrict bbase_y;
 556           GFC_COMPLEX_8 s;
 557
 558           for (y = 0; y < ycount; y++)
 559             {
 560               bbase_y = &bbase[y*bystride];
 561               s = (GFC_COMPLEX_8) 0;
 562               for (n = 0; n < count; n++)
 563                 s += abase[n*axstride] * bbase_y[n];
 564               dest[y*rystride] = s;
 565             }
 566         }
 567     }
 568   else if (axstride < aystride)
 569     {
 570       for (y = 0; y < ycount; y++)
 571         for (x = 0; x < xcount; x++)
 572           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
 573
 574       for (y = 0; y < ycount; y++)
 575         for (n = 0; n < count; n++)
 576           for (x = 0; x < xcount; x++)
 577             /* dest[x,y] += a[x,n] * b[n,y] */
 578             dest[x*rxstride + y*rystride] +=
 579                                         abase[x*axstride + n*aystride] *
 580                                         bbase[n*bxstride + y*bystride];
 581     }
 582   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 583     {
 584       const GFC_COMPLEX_8 *restrict bbase_y;
 585       GFC_COMPLEX_8 s;
 586
 587       for (y = 0; y < ycount; y++)
 588         {
 589           bbase_y = &bbase[y*bystride];
 590           s = (GFC_COMPLEX_8) 0;
 591           for (n = 0; n < count; n++)
 592             s += abase[n*axstride] * bbase_y[n*bxstride];
 593           dest[y*rxstride] = s;
 594         }
 595     }
 596   else
 597     {
 598       const GFC_COMPLEX_8 *restrict abase_x;
 599       const GFC_COMPLEX_8 *restrict bbase_y;
 600       GFC_COMPLEX_8 *restrict dest_y;
 601       GFC_COMPLEX_8 s;
 602
 603       for (y = 0; y < ycount; y++)
 604         {
 605           bbase_y = &bbase[y*bystride];
 606           dest_y = &dest[y*rystride];
 607           for (x = 0; x < xcount; x++)
 608             {
 609               abase_x = &abase[x*axstride];
 610               s = (GFC_COMPLEX_8) 0;
 611               for (n = 0; n < count; n++)
 612                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 613               dest_y[x*rxstride] = s;
 614             }
 615         }
 616     }
 617 }
 618 #undef POW3
 619 #undef min
 620 #undef max
 621
 622 #endif /* HAVE_AVX */
 623
 624 #ifdef HAVE_AVX2
 625 static void
 626 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
 627         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 628         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 629 static void
 630 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
 631         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 632         int blas_limit, blas_call gemm)
 633 {
 634   const GFC_COMPLEX_8 * restrict abase;
 635   const GFC_COMPLEX_8 * restrict bbase;
 636   GFC_COMPLEX_8 * restrict dest;
 637
 638   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 639   index_type x, y, n, count, xcount, ycount;
 640
 641   assert (GFC_DESCRIPTOR_RANK (a) == 2
 642           || GFC_DESCRIPTOR_RANK (b) == 2);
 643
 644 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 645
 646    Either A or B (but not both) can be rank 1:
 647
 648    o One-dimensional argument A is implicitly treated as a row matrix
 649      dimensioned [1,count], so xcount=1.
 650
 651    o One-dimensional argument B is implicitly treated as a column matrix
 652      dimensioned [count, 1], so ycount=1.
 653 */
 654
 655   if (retarray->base_addr == NULL)
 656     {
 657       if (GFC_DESCRIPTOR_RANK (a) == 1)
 658         {
 659           GFC_DIMENSION_SET(retarray->dim[0], 0,
 660                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 661         }
 662       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 663         {
 664           GFC_DIMENSION_SET(retarray->dim[0], 0,
 665                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 666         }
 667       else
 668         {
 669           GFC_DIMENSION_SET(retarray->dim[0], 0,
 670                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 671
 672           GFC_DIMENSION_SET(retarray->dim[1], 0,
 673                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 674                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 675         }
 676
 677       retarray->base_addr
 678         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
 679       retarray->offset = 0;
 680     }
 681   else if (unlikely (compile_options.bounds_check))
 682     {
 683       index_type ret_extent, arg_extent;
 684
 685       if (GFC_DESCRIPTOR_RANK (a) == 1)
 686         {
 687           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 688           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 689           if (arg_extent != ret_extent)
 690             runtime_error ("Incorrect extent in return array in"
 691                            " MATMUL intrinsic: is %ld, should be %ld",
 692                            (long int) ret_extent, (long int) arg_extent);
 693         }
 694       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 695         {
 696           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 697           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 698           if (arg_extent != ret_extent)
 699             runtime_error ("Incorrect extent in return array in"
 700                            " MATMUL intrinsic: is %ld, should be %ld",
 701                            (long int) ret_extent, (long int) arg_extent);
 702         }
 703       else
 704         {
 705           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 706           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 707           if (arg_extent != ret_extent)
 708             runtime_error ("Incorrect extent in return array in"
 709                            " MATMUL intrinsic for dimension 1:"
 710                            " is %ld, should be %ld",
 711                            (long int) ret_extent, (long int) arg_extent);
 712
 713           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 714           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 715           if (arg_extent != ret_extent)
 716             runtime_error ("Incorrect extent in return array in"
 717                            " MATMUL intrinsic for dimension 2:"
 718                            " is %ld, should be %ld",
 719                            (long int) ret_extent, (long int) arg_extent);
 720         }
 721     }
 722
 723
 724   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 725     {
 726       /* One-dimensional result may be addressed in the code below
 727          either as a row or a column matrix. We want both cases to
 728          work. */
 729       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 730     }
 731   else
 732     {
 733       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 734       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 735     }
 736
 737
 738   if (GFC_DESCRIPTOR_RANK (a) == 1)
 739     {
 740       /* Treat it as a a row matrix A[1,count]. */
 741       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 742       aystride = 1;
 743
 744       xcount = 1;
 745       count = GFC_DESCRIPTOR_EXTENT(a,0);
 746     }
 747   else
 748     {
 749       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 750       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 751
 752       count = GFC_DESCRIPTOR_EXTENT(a,1);
 753       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 754     }
 755
 756   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 757     {
 758       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 759         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 760     }
 761
 762   if (GFC_DESCRIPTOR_RANK (b) == 1)
 763     {
 764       /* Treat it as a column matrix B[count,1] */
 765       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 766
 767       /* bystride should never be used for 1-dimensional b.
 768          in case it is we want it to cause a segfault, rather than
 769          an incorrect result. */
 770       bystride = 0xDEADBEEF;
 771       ycount = 1;
 772     }
 773   else
 774     {
 775       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 776       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 777       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 778     }
 779
 780   abase = a->base_addr;
 781   bbase = b->base_addr;
 782   dest = retarray->base_addr;
 783
 784   /* Now that everything is set up, we perform the multiplication
 785      itself.  */
 786
 787 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 788 #define min(a,b) ((a) <= (b) ? (a) : (b))
 789 #define max(a,b) ((a) >= (b) ? (a) : (b))
 790
 791   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 792       && (bxstride == 1 || bystride == 1)
 793       && (((float) xcount) * ((float) ycount) * ((float) count)
 794           > POW3(blas_limit)))
 795     {
 796       const int m = xcount, n = ycount, k = count, ldc = rystride;
 797       const GFC_COMPLEX_8 one = 1, zero = 0;
 798       const int lda = (axstride == 1) ? aystride : axstride,
 799                 ldb = (bxstride == 1) ? bystride : bxstride;
 800
 801       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 802         {
 803           assert (gemm != NULL);
 804           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 805                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 806                 &ldc, 1, 1);
 807           return;
 808         }
 809     }
 810
 811   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 812     {
 813       /* This block of code implements a tuned matmul, derived from
 814          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 815
 816                Bo Kagstrom and Per Ling
 817                Department of Computing Science
 818                Umea University
 819                S-901 87 Umea, Sweden
 820
 821          from netlib.org, translated to C, and modified for matmul.m4.  */
 822
 823       const GFC_COMPLEX_8 *a, *b;
 824       GFC_COMPLEX_8 *c;
 825       const index_type m = xcount, n = ycount, k = count;
 826
 827       /* System generated locals */
 828       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 829                  i1, i2, i3, i4, i5, i6;
 830
 831       /* Local variables */
 832       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
 833                  f11, f12, f21, f22, f31, f32, f41, f42,
 834                  f13, f14, f23, f24, f33, f34, f43, f44;
 835       index_type i, j, l, ii, jj, ll;
 836       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 837
 838       a = abase;
 839       b = bbase;
 840       c = retarray->base_addr;
 841
 842       /* Parameter adjustments */
 843       c_dim1 = rystride;
 844       c_offset = 1 + c_dim1;
 845       c -= c_offset;
 846       a_dim1 = aystride;
 847       a_offset = 1 + a_dim1;
 848       a -= a_offset;
 849       b_dim1 = bystride;
 850       b_offset = 1 + b_dim1;
 851       b -= b_offset;
 852
 853       /* Early exit if possible */
 854       if (m == 0 || n == 0 || k == 0)
 855         return;
 856
 857       /* Empty c first.  */
 858       for (j=1; j<=n; j++)
 859         for (i=1; i<=m; i++)
 860           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
 861
 862       /* Start turning the crank. */
 863       i1 = n;
 864       for (jj = 1; jj <= i1; jj += 512)
 865         {
 866           /* Computing MIN */
 867           i2 = 512;
 868           i3 = n - jj + 1;
 869           jsec = min(i2,i3);
 870           ujsec = jsec - jsec % 4;
 871           i2 = k;
 872           for (ll = 1; ll <= i2; ll += 256)
 873             {
 874               /* Computing MIN */
 875               i3 = 256;
 876               i4 = k - ll + 1;
 877               lsec = min(i3,i4);
 878               ulsec = lsec - lsec % 2;
 879
 880               i3 = m;
 881               for (ii = 1; ii <= i3; ii += 256)
 882                 {
 883                   /* Computing MIN */
 884                   i4 = 256;
 885                   i5 = m - ii + 1;
 886                   isec = min(i4,i5);
 887                   uisec = isec - isec % 2;
 888                   i4 = ll + ulsec - 1;
 889                   for (l = ll; l <= i4; l += 2)
 890                     {
 891                       i5 = ii + uisec - 1;
 892                       for (i = ii; i <= i5; i += 2)
 893                         {
 894                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 895                                         a[i + l * a_dim1];
 896                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 897                                         a[i + (l + 1) * a_dim1];
 898                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 899                                         a[i + 1 + l * a_dim1];
 900                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 901                                         a[i + 1 + (l + 1) * a_dim1];
 902                         }
 903                       if (uisec < isec)
 904                         {
 905                           t1[l - ll + 1 + (isec << 8) - 257] =
 906                                     a[ii + isec - 1 + l * a_dim1];
 907                           t1[l - ll + 2 + (isec << 8) - 257] =
 908                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 909                         }
 910                     }
 911                   if (ulsec < lsec)
 912                     {
 913                       i4 = ii + isec - 1;
 914                       for (i = ii; i<= i4; ++i)
 915                         {
 916                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 917                                     a[i + (ll + lsec - 1) * a_dim1];
 918                         }
 919                     }
 920
 921                   uisec = isec - isec % 4;
 922                   i4 = jj + ujsec - 1;
 923                   for (j = jj; j <= i4; j += 4)
 924                     {
 925                       i5 = ii + uisec - 1;
 926                       for (i = ii; i <= i5; i += 4)
 927                         {
 928                           f11 = c[i + j * c_dim1];
 929                           f21 = c[i + 1 + j * c_dim1];
 930                           f12 = c[i + (j + 1) * c_dim1];
 931                           f22 = c[i + 1 + (j + 1) * c_dim1];
 932                           f13 = c[i + (j + 2) * c_dim1];
 933                           f23 = c[i + 1 + (j + 2) * c_dim1];
 934                           f14 = c[i + (j + 3) * c_dim1];
 935                           f24 = c[i + 1 + (j + 3) * c_dim1];
 936                           f31 = c[i + 2 + j * c_dim1];
 937                           f41 = c[i + 3 + j * c_dim1];
 938                           f32 = c[i + 2 + (j + 1) * c_dim1];
 939                           f42 = c[i + 3 + (j + 1) * c_dim1];
 940                           f33 = c[i + 2 + (j + 2) * c_dim1];
 941                           f43 = c[i + 3 + (j + 2) * c_dim1];
 942                           f34 = c[i + 2 + (j + 3) * c_dim1];
 943                           f44 = c[i + 3 + (j + 3) * c_dim1];
 944                           i6 = ll + lsec - 1;
 945                           for (l = ll; l <= i6; ++l)
 946                             {
 947                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 948                                       * b[l + j * b_dim1];
 949                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 950                                       * b[l + j * b_dim1];
 951                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 952                                       * b[l + (j + 1) * b_dim1];
 953                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 954                                       * b[l + (j + 1) * b_dim1];
 955                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 956                                       * b[l + (j + 2) * b_dim1];
 957                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 958                                       * b[l + (j + 2) * b_dim1];
 959                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 960                                       * b[l + (j + 3) * b_dim1];
 961                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 962                                       * b[l + (j + 3) * b_dim1];
 963                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 964                                       * b[l + j * b_dim1];
 965                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 966                                       * b[l + j * b_dim1];
 967                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 968                                       * b[l + (j + 1) * b_dim1];
 969                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 970                                       * b[l + (j + 1) * b_dim1];
 971                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 972                                       * b[l + (j + 2) * b_dim1];
 973                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 974                                       * b[l + (j + 2) * b_dim1];
 975                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 976                                       * b[l + (j + 3) * b_dim1];
 977                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 978                                       * b[l + (j + 3) * b_dim1];
 979                             }
 980                           c[i + j * c_dim1] = f11;
 981                           c[i + 1 + j * c_dim1] = f21;
 982                           c[i + (j + 1) * c_dim1] = f12;
 983                           c[i + 1 + (j + 1) * c_dim1] = f22;
 984                           c[i + (j + 2) * c_dim1] = f13;
 985                           c[i + 1 + (j + 2) * c_dim1] = f23;
 986                           c[i + (j + 3) * c_dim1] = f14;
 987                           c[i + 1 + (j + 3) * c_dim1] = f24;
 988                           c[i + 2 + j * c_dim1] = f31;
 989                           c[i + 3 + j * c_dim1] = f41;
 990                           c[i + 2 + (j + 1) * c_dim1] = f32;
 991                           c[i + 3 + (j + 1) * c_dim1] = f42;
 992                           c[i + 2 + (j + 2) * c_dim1] = f33;
 993                           c[i + 3 + (j + 2) * c_dim1] = f43;
 994                           c[i + 2 + (j + 3) * c_dim1] = f34;
 995                           c[i + 3 + (j + 3) * c_dim1] = f44;
 996                         }
 997                       if (uisec < isec)
 998                         {
 999                           i5 = ii + isec - 1;
1000                           for (i = ii + uisec; i <= i5; ++i)
1001                             {
1002                               f11 = c[i + j * c_dim1];
1003                               f12 = c[i + (j + 1) * c_dim1];
1004                               f13 = c[i + (j + 2) * c_dim1];
1005                               f14 = c[i + (j + 3) * c_dim1];
1006                               i6 = ll + lsec - 1;
1007                               for (l = ll; l <= i6; ++l)
1008                                 {
1009                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010                                           257] * b[l + j * b_dim1];
1011                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012                                           257] * b[l + (j + 1) * b_dim1];
1013                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014                                           257] * b[l + (j + 2) * b_dim1];
1015                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016                                           257] * b[l + (j + 3) * b_dim1];
1017                                 }
1018                               c[i + j * c_dim1] = f11;
1019                               c[i + (j + 1) * c_dim1] = f12;
1020                               c[i + (j + 2) * c_dim1] = f13;
1021                               c[i + (j + 3) * c_dim1] = f14;
1022                             }
1023                         }
1024                     }
1025                   if (ujsec < jsec)
1026                     {
1027                       i4 = jj + jsec - 1;
1028                       for (j = jj + ujsec; j <= i4; ++j)
1029                         {
1030                           i5 = ii + uisec - 1;
1031                           for (i = ii; i <= i5; i += 4)
1032                             {
1033                               f11 = c[i + j * c_dim1];
1034                               f21 = c[i + 1 + j * c_dim1];
1035                               f31 = c[i + 2 + j * c_dim1];
1036                               f41 = c[i + 3 + j * c_dim1];
1037                               i6 = ll + lsec - 1;
1038                               for (l = ll; l <= i6; ++l)
1039                                 {
1040                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041                                           257] * b[l + j * b_dim1];
1042                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043                                           257] * b[l + j * b_dim1];
1044                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045                                           257] * b[l + j * b_dim1];
1046                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047                                           257] * b[l + j * b_dim1];
1048                                 }
1049                               c[i + j * c_dim1] = f11;
1050                               c[i + 1 + j * c_dim1] = f21;
1051                               c[i + 2 + j * c_dim1] = f31;
1052                               c[i + 3 + j * c_dim1] = f41;
1053                             }
1054                           i5 = ii + isec - 1;
1055                           for (i = ii + uisec; i <= i5; ++i)
1056                             {
1057                               f11 = c[i + j * c_dim1];
1058                               i6 = ll + lsec - 1;
1059                               for (l = ll; l <= i6; ++l)
1060                                 {
1061                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062                                           257] * b[l + j * b_dim1];
1063                                 }
1064                               c[i + j * c_dim1] = f11;
1065                             }
1066                         }
1067                     }
1068                 }
1069             }
1070         }
1071       return;
1072     }
1073   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1074     {
1075       if (GFC_DESCRIPTOR_RANK (a) != 1)
1076         {
1077           const GFC_COMPLEX_8 *restrict abase_x;
1078           const GFC_COMPLEX_8 *restrict bbase_y;
1079           GFC_COMPLEX_8 *restrict dest_y;
1080           GFC_COMPLEX_8 s;
1081
1082           for (y = 0; y < ycount; y++)
1083             {
1084               bbase_y = &bbase[y*bystride];
1085               dest_y = &dest[y*rystride];
1086               for (x = 0; x < xcount; x++)
1087                 {
1088                   abase_x = &abase[x*axstride];
1089                   s = (GFC_COMPLEX_8) 0;
1090                   for (n = 0; n < count; n++)
1091                     s += abase_x[n] * bbase_y[n];
1092                   dest_y[x] = s;
1093                 }
1094             }
1095         }
1096       else
1097         {
1098           const GFC_COMPLEX_8 *restrict bbase_y;
1099           GFC_COMPLEX_8 s;
1100
1101           for (y = 0; y < ycount; y++)
1102             {
1103               bbase_y = &bbase[y*bystride];
1104               s = (GFC_COMPLEX_8) 0;
1105               for (n = 0; n < count; n++)
1106                 s += abase[n*axstride] * bbase_y[n];
1107               dest[y*rystride] = s;
1108             }
1109         }
1110     }
1111   else if (axstride < aystride)
1112     {
1113       for (y = 0; y < ycount; y++)
1114         for (x = 0; x < xcount; x++)
1115           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1116
1117       for (y = 0; y < ycount; y++)
1118         for (n = 0; n < count; n++)
1119           for (x = 0; x < xcount; x++)
1120             /* dest[x,y] += a[x,n] * b[n,y] */
1121             dest[x*rxstride + y*rystride] +=
1122                                         abase[x*axstride + n*aystride] *
1123                                         bbase[n*bxstride + y*bystride];
1124     }
1125   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1126     {
1127       const GFC_COMPLEX_8 *restrict bbase_y;
1128       GFC_COMPLEX_8 s;
1129
1130       for (y = 0; y < ycount; y++)
1131         {
1132           bbase_y = &bbase[y*bystride];
1133           s = (GFC_COMPLEX_8) 0;
1134           for (n = 0; n < count; n++)
1135             s += abase[n*axstride] * bbase_y[n*bxstride];
1136           dest[y*rxstride] = s;
1137         }
1138     }
1139   else
1140     {
1141       const GFC_COMPLEX_8 *restrict abase_x;
1142       const GFC_COMPLEX_8 *restrict bbase_y;
1143       GFC_COMPLEX_8 *restrict dest_y;
1144       GFC_COMPLEX_8 s;
1145
1146       for (y = 0; y < ycount; y++)
1147         {
1148           bbase_y = &bbase[y*bystride];
1149           dest_y = &dest[y*rystride];
1150           for (x = 0; x < xcount; x++)
1151             {
1152               abase_x = &abase[x*axstride];
1153               s = (GFC_COMPLEX_8) 0;
1154               for (n = 0; n < count; n++)
1155                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156               dest_y[x*rxstride] = s;
1157             }
1158         }
1159     }
1160 }
1161 #undef POW3
1162 #undef min
1163 #undef max
1164
1165 #endif /* HAVE_AVX2 */
1166
1167 #ifdef HAVE_AVX512F
1168 static void
1169 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1170         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1171         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172 static void
1173 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1174         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1175         int blas_limit, blas_call gemm)
1176 {
1177   const GFC_COMPLEX_8 * restrict abase;
1178   const GFC_COMPLEX_8 * restrict bbase;
1179   GFC_COMPLEX_8 * restrict dest;
1180
1181   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182   index_type x, y, n, count, xcount, ycount;
1183
1184   assert (GFC_DESCRIPTOR_RANK (a) == 2
1185           || GFC_DESCRIPTOR_RANK (b) == 2);
1186
1187 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1188
1189    Either A or B (but not both) can be rank 1:
1190
1191    o One-dimensional argument A is implicitly treated as a row matrix
1192      dimensioned [1,count], so xcount=1.
1193
1194    o One-dimensional argument B is implicitly treated as a column matrix
1195      dimensioned [count, 1], so ycount=1.
1196 */
1197
1198   if (retarray->base_addr == NULL)
1199     {
1200       if (GFC_DESCRIPTOR_RANK (a) == 1)
1201         {
1202           GFC_DIMENSION_SET(retarray->dim[0], 0,
1203                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1204         }
1205       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1206         {
1207           GFC_DIMENSION_SET(retarray->dim[0], 0,
1208                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1209         }
1210       else
1211         {
1212           GFC_DIMENSION_SET(retarray->dim[0], 0,
1213                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1214
1215           GFC_DIMENSION_SET(retarray->dim[1], 0,
1216                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1218         }
1219
1220       retarray->base_addr
1221         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1222       retarray->offset = 0;
1223     }
1224   else if (unlikely (compile_options.bounds_check))
1225     {
1226       index_type ret_extent, arg_extent;
1227
1228       if (GFC_DESCRIPTOR_RANK (a) == 1)
1229         {
1230           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232           if (arg_extent != ret_extent)
1233             runtime_error ("Incorrect extent in return array in"
1234                            " MATMUL intrinsic: is %ld, should be %ld",
1235                            (long int) ret_extent, (long int) arg_extent);
1236         }
1237       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1238         {
1239           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241           if (arg_extent != ret_extent)
1242             runtime_error ("Incorrect extent in return array in"
1243                            " MATMUL intrinsic: is %ld, should be %ld",
1244                            (long int) ret_extent, (long int) arg_extent);
1245         }
1246       else
1247         {
1248           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250           if (arg_extent != ret_extent)
1251             runtime_error ("Incorrect extent in return array in"
1252                            " MATMUL intrinsic for dimension 1:"
1253                            " is %ld, should be %ld",
1254                            (long int) ret_extent, (long int) arg_extent);
1255
1256           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258           if (arg_extent != ret_extent)
1259             runtime_error ("Incorrect extent in return array in"
1260                            " MATMUL intrinsic for dimension 2:"
1261                            " is %ld, should be %ld",
1262                            (long int) ret_extent, (long int) arg_extent);
1263         }
1264     }
1265
1266
1267   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1268     {
1269       /* One-dimensional result may be addressed in the code below
1270          either as a row or a column matrix. We want both cases to
1271          work. */
1272       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1273     }
1274   else
1275     {
1276       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1278     }
1279
1280
1281   if (GFC_DESCRIPTOR_RANK (a) == 1)
1282     {
1283       /* Treat it as a a row matrix A[1,count]. */
1284       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285       aystride = 1;
1286
1287       xcount = 1;
1288       count = GFC_DESCRIPTOR_EXTENT(a,0);
1289     }
1290   else
1291     {
1292       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1294
1295       count = GFC_DESCRIPTOR_EXTENT(a,1);
1296       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1297     }
1298
1299   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1300     {
1301       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1303     }
1304
1305   if (GFC_DESCRIPTOR_RANK (b) == 1)
1306     {
1307       /* Treat it as a column matrix B[count,1] */
1308       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1309
1310       /* bystride should never be used for 1-dimensional b.
1311          in case it is we want it to cause a segfault, rather than
1312          an incorrect result. */
1313       bystride = 0xDEADBEEF;
1314       ycount = 1;
1315     }
1316   else
1317     {
1318       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1321     }
1322
1323   abase = a->base_addr;
1324   bbase = b->base_addr;
1325   dest = retarray->base_addr;
1326
1327   /* Now that everything is set up, we perform the multiplication
1328      itself.  */
1329
1330 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331 #define min(a,b) ((a) <= (b) ? (a) : (b))
1332 #define max(a,b) ((a) >= (b) ? (a) : (b))
1333
1334   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335       && (bxstride == 1 || bystride == 1)
1336       && (((float) xcount) * ((float) ycount) * ((float) count)
1337           > POW3(blas_limit)))
1338     {
1339       const int m = xcount, n = ycount, k = count, ldc = rystride;
1340       const GFC_COMPLEX_8 one = 1, zero = 0;
1341       const int lda = (axstride == 1) ? aystride : axstride,
1342                 ldb = (bxstride == 1) ? bystride : bxstride;
1343
1344       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1345         {
1346           assert (gemm != NULL);
1347           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349                 &ldc, 1, 1);
1350           return;
1351         }
1352     }
1353
1354   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1355     {
1356       /* This block of code implements a tuned matmul, derived from
1357          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1358
1359                Bo Kagstrom and Per Ling
1360                Department of Computing Science
1361                Umea University
1362                S-901 87 Umea, Sweden
1363
1364          from netlib.org, translated to C, and modified for matmul.m4.  */
1365
1366       const GFC_COMPLEX_8 *a, *b;
1367       GFC_COMPLEX_8 *c;
1368       const index_type m = xcount, n = ycount, k = count;
1369
1370       /* System generated locals */
1371       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372                  i1, i2, i3, i4, i5, i6;
1373
1374       /* Local variables */
1375       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1376                  f11, f12, f21, f22, f31, f32, f41, f42,
1377                  f13, f14, f23, f24, f33, f34, f43, f44;
1378       index_type i, j, l, ii, jj, ll;
1379       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1380
1381       a = abase;
1382       b = bbase;
1383       c = retarray->base_addr;
1384
1385       /* Parameter adjustments */
1386       c_dim1 = rystride;
1387       c_offset = 1 + c_dim1;
1388       c -= c_offset;
1389       a_dim1 = aystride;
1390       a_offset = 1 + a_dim1;
1391       a -= a_offset;
1392       b_dim1 = bystride;
1393       b_offset = 1 + b_dim1;
1394       b -= b_offset;
1395
1396       /* Early exit if possible */
1397       if (m == 0 || n == 0 || k == 0)
1398         return;
1399
1400       /* Empty c first.  */
1401       for (j=1; j<=n; j++)
1402         for (i=1; i<=m; i++)
1403           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1404
1405       /* Start turning the crank. */
1406       i1 = n;
1407       for (jj = 1; jj <= i1; jj += 512)
1408         {
1409           /* Computing MIN */
1410           i2 = 512;
1411           i3 = n - jj + 1;
1412           jsec = min(i2,i3);
1413           ujsec = jsec - jsec % 4;
1414           i2 = k;
1415           for (ll = 1; ll <= i2; ll += 256)
1416             {
1417               /* Computing MIN */
1418               i3 = 256;
1419               i4 = k - ll + 1;
1420               lsec = min(i3,i4);
1421               ulsec = lsec - lsec % 2;
1422
1423               i3 = m;
1424               for (ii = 1; ii <= i3; ii += 256)
1425                 {
1426                   /* Computing MIN */
1427                   i4 = 256;
1428                   i5 = m - ii + 1;
1429                   isec = min(i4,i5);
1430                   uisec = isec - isec % 2;
1431                   i4 = ll + ulsec - 1;
1432                   for (l = ll; l <= i4; l += 2)
1433                     {
1434                       i5 = ii + uisec - 1;
1435                       for (i = ii; i <= i5; i += 2)
1436                         {
1437                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438                                         a[i + l * a_dim1];
1439                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440                                         a[i + (l + 1) * a_dim1];
1441                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442                                         a[i + 1 + l * a_dim1];
1443                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444                                         a[i + 1 + (l + 1) * a_dim1];
1445                         }
1446                       if (uisec < isec)
1447                         {
1448                           t1[l - ll + 1 + (isec << 8) - 257] =
1449                                     a[ii + isec - 1 + l * a_dim1];
1450                           t1[l - ll + 2 + (isec << 8) - 257] =
1451                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1452                         }
1453                     }
1454                   if (ulsec < lsec)
1455                     {
1456                       i4 = ii + isec - 1;
1457                       for (i = ii; i<= i4; ++i)
1458                         {
1459                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1460                                     a[i + (ll + lsec - 1) * a_dim1];
1461                         }
1462                     }
1463
1464                   uisec = isec - isec % 4;
1465                   i4 = jj + ujsec - 1;
1466                   for (j = jj; j <= i4; j += 4)
1467                     {
1468                       i5 = ii + uisec - 1;
1469                       for (i = ii; i <= i5; i += 4)
1470                         {
1471                           f11 = c[i + j * c_dim1];
1472                           f21 = c[i + 1 + j * c_dim1];
1473                           f12 = c[i + (j + 1) * c_dim1];
1474                           f22 = c[i + 1 + (j + 1) * c_dim1];
1475                           f13 = c[i + (j + 2) * c_dim1];
1476                           f23 = c[i + 1 + (j + 2) * c_dim1];
1477                           f14 = c[i + (j + 3) * c_dim1];
1478                           f24 = c[i + 1 + (j + 3) * c_dim1];
1479                           f31 = c[i + 2 + j * c_dim1];
1480                           f41 = c[i + 3 + j * c_dim1];
1481                           f32 = c[i + 2 + (j + 1) * c_dim1];
1482                           f42 = c[i + 3 + (j + 1) * c_dim1];
1483                           f33 = c[i + 2 + (j + 2) * c_dim1];
1484                           f43 = c[i + 3 + (j + 2) * c_dim1];
1485                           f34 = c[i + 2 + (j + 3) * c_dim1];
1486                           f44 = c[i + 3 + (j + 3) * c_dim1];
1487                           i6 = ll + lsec - 1;
1488                           for (l = ll; l <= i6; ++l)
1489                             {
1490                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491                                       * b[l + j * b_dim1];
1492                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493                                       * b[l + j * b_dim1];
1494                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495                                       * b[l + (j + 1) * b_dim1];
1496                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497                                       * b[l + (j + 1) * b_dim1];
1498                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499                                       * b[l + (j + 2) * b_dim1];
1500                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501                                       * b[l + (j + 2) * b_dim1];
1502                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503                                       * b[l + (j + 3) * b_dim1];
1504                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505                                       * b[l + (j + 3) * b_dim1];
1506                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507                                       * b[l + j * b_dim1];
1508                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509                                       * b[l + j * b_dim1];
1510                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511                                       * b[l + (j + 1) * b_dim1];
1512                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513                                       * b[l + (j + 1) * b_dim1];
1514                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515                                       * b[l + (j + 2) * b_dim1];
1516                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517                                       * b[l + (j + 2) * b_dim1];
1518                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519                                       * b[l + (j + 3) * b_dim1];
1520                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521                                       * b[l + (j + 3) * b_dim1];
1522                             }
1523                           c[i + j * c_dim1] = f11;
1524                           c[i + 1 + j * c_dim1] = f21;
1525                           c[i + (j + 1) * c_dim1] = f12;
1526                           c[i + 1 + (j + 1) * c_dim1] = f22;
1527                           c[i + (j + 2) * c_dim1] = f13;
1528                           c[i + 1 + (j + 2) * c_dim1] = f23;
1529                           c[i + (j + 3) * c_dim1] = f14;
1530                           c[i + 1 + (j + 3) * c_dim1] = f24;
1531                           c[i + 2 + j * c_dim1] = f31;
1532                           c[i + 3 + j * c_dim1] = f41;
1533                           c[i + 2 + (j + 1) * c_dim1] = f32;
1534                           c[i + 3 + (j + 1) * c_dim1] = f42;
1535                           c[i + 2 + (j + 2) * c_dim1] = f33;
1536                           c[i + 3 + (j + 2) * c_dim1] = f43;
1537                           c[i + 2 + (j + 3) * c_dim1] = f34;
1538                           c[i + 3 + (j + 3) * c_dim1] = f44;
1539                         }
1540                       if (uisec < isec)
1541                         {
1542                           i5 = ii + isec - 1;
1543                           for (i = ii + uisec; i <= i5; ++i)
1544                             {
1545                               f11 = c[i + j * c_dim1];
1546                               f12 = c[i + (j + 1) * c_dim1];
1547                               f13 = c[i + (j + 2) * c_dim1];
1548                               f14 = c[i + (j + 3) * c_dim1];
1549                               i6 = ll + lsec - 1;
1550                               for (l = ll; l <= i6; ++l)
1551                                 {
1552                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553                                           257] * b[l + j * b_dim1];
1554                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555                                           257] * b[l + (j + 1) * b_dim1];
1556                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557                                           257] * b[l + (j + 2) * b_dim1];
1558                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559                                           257] * b[l + (j + 3) * b_dim1];
1560                                 }
1561                               c[i + j * c_dim1] = f11;
1562                               c[i + (j + 1) * c_dim1] = f12;
1563                               c[i + (j + 2) * c_dim1] = f13;
1564                               c[i + (j + 3) * c_dim1] = f14;
1565                             }
1566                         }
1567                     }
1568                   if (ujsec < jsec)
1569                     {
1570                       i4 = jj + jsec - 1;
1571                       for (j = jj + ujsec; j <= i4; ++j)
1572                         {
1573                           i5 = ii + uisec - 1;
1574                           for (i = ii; i <= i5; i += 4)
1575                             {
1576                               f11 = c[i + j * c_dim1];
1577                               f21 = c[i + 1 + j * c_dim1];
1578                               f31 = c[i + 2 + j * c_dim1];
1579                               f41 = c[i + 3 + j * c_dim1];
1580                               i6 = ll + lsec - 1;
1581                               for (l = ll; l <= i6; ++l)
1582                                 {
1583                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584                                           257] * b[l + j * b_dim1];
1585                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586                                           257] * b[l + j * b_dim1];
1587                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588                                           257] * b[l + j * b_dim1];
1589                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590                                           257] * b[l + j * b_dim1];
1591                                 }
1592                               c[i + j * c_dim1] = f11;
1593                               c[i + 1 + j * c_dim1] = f21;
1594                               c[i + 2 + j * c_dim1] = f31;
1595                               c[i + 3 + j * c_dim1] = f41;
1596                             }
1597                           i5 = ii + isec - 1;
1598                           for (i = ii + uisec; i <= i5; ++i)
1599                             {
1600                               f11 = c[i + j * c_dim1];
1601                               i6 = ll + lsec - 1;
1602                               for (l = ll; l <= i6; ++l)
1603                                 {
1604                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605                                           257] * b[l + j * b_dim1];
1606                                 }
1607                               c[i + j * c_dim1] = f11;
1608                             }
1609                         }
1610                     }
1611                 }
1612             }
1613         }
1614       return;
1615     }
1616   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1617     {
1618       if (GFC_DESCRIPTOR_RANK (a) != 1)
1619         {
1620           const GFC_COMPLEX_8 *restrict abase_x;
1621           const GFC_COMPLEX_8 *restrict bbase_y;
1622           GFC_COMPLEX_8 *restrict dest_y;
1623           GFC_COMPLEX_8 s;
1624
1625           for (y = 0; y < ycount; y++)
1626             {
1627               bbase_y = &bbase[y*bystride];
1628               dest_y = &dest[y*rystride];
1629               for (x = 0; x < xcount; x++)
1630                 {
1631                   abase_x = &abase[x*axstride];
1632                   s = (GFC_COMPLEX_8) 0;
1633                   for (n = 0; n < count; n++)
1634                     s += abase_x[n] * bbase_y[n];
1635                   dest_y[x] = s;
1636                 }
1637             }
1638         }
1639       else
1640         {
1641           const GFC_COMPLEX_8 *restrict bbase_y;
1642           GFC_COMPLEX_8 s;
1643
1644           for (y = 0; y < ycount; y++)
1645             {
1646               bbase_y = &bbase[y*bystride];
1647               s = (GFC_COMPLEX_8) 0;
1648               for (n = 0; n < count; n++)
1649                 s += abase[n*axstride] * bbase_y[n];
1650               dest[y*rystride] = s;
1651             }
1652         }
1653     }
1654   else if (axstride < aystride)
1655     {
1656       for (y = 0; y < ycount; y++)
1657         for (x = 0; x < xcount; x++)
1658           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1659
1660       for (y = 0; y < ycount; y++)
1661         for (n = 0; n < count; n++)
1662           for (x = 0; x < xcount; x++)
1663             /* dest[x,y] += a[x,n] * b[n,y] */
1664             dest[x*rxstride + y*rystride] +=
1665                                         abase[x*axstride + n*aystride] *
1666                                         bbase[n*bxstride + y*bystride];
1667     }
1668   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1669     {
1670       const GFC_COMPLEX_8 *restrict bbase_y;
1671       GFC_COMPLEX_8 s;
1672
1673       for (y = 0; y < ycount; y++)
1674         {
1675           bbase_y = &bbase[y*bystride];
1676           s = (GFC_COMPLEX_8) 0;
1677           for (n = 0; n < count; n++)
1678             s += abase[n*axstride] * bbase_y[n*bxstride];
1679           dest[y*rxstride] = s;
1680         }
1681     }
1682   else
1683     {
1684       const GFC_COMPLEX_8 *restrict abase_x;
1685       const GFC_COMPLEX_8 *restrict bbase_y;
1686       GFC_COMPLEX_8 *restrict dest_y;
1687       GFC_COMPLEX_8 s;
1688
1689       for (y = 0; y < ycount; y++)
1690         {
1691           bbase_y = &bbase[y*bystride];
1692           dest_y = &dest[y*rystride];
1693           for (x = 0; x < xcount; x++)
1694             {
1695               abase_x = &abase[x*axstride];
1696               s = (GFC_COMPLEX_8) 0;
1697               for (n = 0; n < count; n++)
1698                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699               dest_y[x*rxstride] = s;
1700             }
1701         }
1702     }
1703 }
1704 #undef POW3
1705 #undef min
1706 #undef max
1707
1708 #endif  /* HAVE_AVX512F */
1709
1710 /* Function to fall back to if there is no special processor-specific version.  */
1711 static void
1712 matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
1713         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1714         int blas_limit, blas_call gemm)
1715 {
1716   const GFC_COMPLEX_8 * restrict abase;
1717   const GFC_COMPLEX_8 * restrict bbase;
1718   GFC_COMPLEX_8 * restrict dest;
1719
1720   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721   index_type x, y, n, count, xcount, ycount;
1722
1723   assert (GFC_DESCRIPTOR_RANK (a) == 2
1724           || GFC_DESCRIPTOR_RANK (b) == 2);
1725
1726 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1727
1728    Either A or B (but not both) can be rank 1:
1729
1730    o One-dimensional argument A is implicitly treated as a row matrix
1731      dimensioned [1,count], so xcount=1.
1732
1733    o One-dimensional argument B is implicitly treated as a column matrix
1734      dimensioned [count, 1], so ycount=1.
1735 */
1736
1737   if (retarray->base_addr == NULL)
1738     {
1739       if (GFC_DESCRIPTOR_RANK (a) == 1)
1740         {
1741           GFC_DIMENSION_SET(retarray->dim[0], 0,
1742                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1743         }
1744       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1745         {
1746           GFC_DIMENSION_SET(retarray->dim[0], 0,
1747                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1748         }
1749       else
1750         {
1751           GFC_DIMENSION_SET(retarray->dim[0], 0,
1752                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1753
1754           GFC_DIMENSION_SET(retarray->dim[1], 0,
1755                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1757         }
1758
1759       retarray->base_addr
1760         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1761       retarray->offset = 0;
1762     }
1763   else if (unlikely (compile_options.bounds_check))
1764     {
1765       index_type ret_extent, arg_extent;
1766
1767       if (GFC_DESCRIPTOR_RANK (a) == 1)
1768         {
1769           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771           if (arg_extent != ret_extent)
1772             runtime_error ("Incorrect extent in return array in"
1773                            " MATMUL intrinsic: is %ld, should be %ld",
1774                            (long int) ret_extent, (long int) arg_extent);
1775         }
1776       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1777         {
1778           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780           if (arg_extent != ret_extent)
1781             runtime_error ("Incorrect extent in return array in"
1782                            " MATMUL intrinsic: is %ld, should be %ld",
1783                            (long int) ret_extent, (long int) arg_extent);
1784         }
1785       else
1786         {
1787           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789           if (arg_extent != ret_extent)
1790             runtime_error ("Incorrect extent in return array in"
1791                            " MATMUL intrinsic for dimension 1:"
1792                            " is %ld, should be %ld",
1793                            (long int) ret_extent, (long int) arg_extent);
1794
1795           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797           if (arg_extent != ret_extent)
1798             runtime_error ("Incorrect extent in return array in"
1799                            " MATMUL intrinsic for dimension 2:"
1800                            " is %ld, should be %ld",
1801                            (long int) ret_extent, (long int) arg_extent);
1802         }
1803     }
1804
1805
1806   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1807     {
1808       /* One-dimensional result may be addressed in the code below
1809          either as a row or a column matrix. We want both cases to
1810          work. */
1811       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1812     }
1813   else
1814     {
1815       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1817     }
1818
1819
1820   if (GFC_DESCRIPTOR_RANK (a) == 1)
1821     {
1822       /* Treat it as a a row matrix A[1,count]. */
1823       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824       aystride = 1;
1825
1826       xcount = 1;
1827       count = GFC_DESCRIPTOR_EXTENT(a,0);
1828     }
1829   else
1830     {
1831       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1833
1834       count = GFC_DESCRIPTOR_EXTENT(a,1);
1835       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1836     }
1837
1838   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1839     {
1840       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1842     }
1843
1844   if (GFC_DESCRIPTOR_RANK (b) == 1)
1845     {
1846       /* Treat it as a column matrix B[count,1] */
1847       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1848
1849       /* bystride should never be used for 1-dimensional b.
1850          in case it is we want it to cause a segfault, rather than
1851          an incorrect result. */
1852       bystride = 0xDEADBEEF;
1853       ycount = 1;
1854     }
1855   else
1856     {
1857       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1860     }
1861
1862   abase = a->base_addr;
1863   bbase = b->base_addr;
1864   dest = retarray->base_addr;
1865
1866   /* Now that everything is set up, we perform the multiplication
1867      itself.  */
1868
1869 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870 #define min(a,b) ((a) <= (b) ? (a) : (b))
1871 #define max(a,b) ((a) >= (b) ? (a) : (b))
1872
1873   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874       && (bxstride == 1 || bystride == 1)
1875       && (((float) xcount) * ((float) ycount) * ((float) count)
1876           > POW3(blas_limit)))
1877     {
1878       const int m = xcount, n = ycount, k = count, ldc = rystride;
1879       const GFC_COMPLEX_8 one = 1, zero = 0;
1880       const int lda = (axstride == 1) ? aystride : axstride,
1881                 ldb = (bxstride == 1) ? bystride : bxstride;
1882
1883       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1884         {
1885           assert (gemm != NULL);
1886           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888                 &ldc, 1, 1);
1889           return;
1890         }
1891     }
1892
1893   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1894     {
1895       /* This block of code implements a tuned matmul, derived from
1896          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1897
1898                Bo Kagstrom and Per Ling
1899                Department of Computing Science
1900                Umea University
1901                S-901 87 Umea, Sweden
1902
1903          from netlib.org, translated to C, and modified for matmul.m4.  */
1904
1905       const GFC_COMPLEX_8 *a, *b;
1906       GFC_COMPLEX_8 *c;
1907       const index_type m = xcount, n = ycount, k = count;
1908
1909       /* System generated locals */
1910       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911                  i1, i2, i3, i4, i5, i6;
1912
1913       /* Local variables */
1914       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1915                  f11, f12, f21, f22, f31, f32, f41, f42,
1916                  f13, f14, f23, f24, f33, f34, f43, f44;
1917       index_type i, j, l, ii, jj, ll;
1918       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1919
1920       a = abase;
1921       b = bbase;
1922       c = retarray->base_addr;
1923
1924       /* Parameter adjustments */
1925       c_dim1 = rystride;
1926       c_offset = 1 + c_dim1;
1927       c -= c_offset;
1928       a_dim1 = aystride;
1929       a_offset = 1 + a_dim1;
1930       a -= a_offset;
1931       b_dim1 = bystride;
1932       b_offset = 1 + b_dim1;
1933       b -= b_offset;
1934
1935       /* Early exit if possible */
1936       if (m == 0 || n == 0 || k == 0)
1937         return;
1938
1939       /* Empty c first.  */
1940       for (j=1; j<=n; j++)
1941         for (i=1; i<=m; i++)
1942           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1943
1944       /* Start turning the crank. */
1945       i1 = n;
1946       for (jj = 1; jj <= i1; jj += 512)
1947         {
1948           /* Computing MIN */
1949           i2 = 512;
1950           i3 = n - jj + 1;
1951           jsec = min(i2,i3);
1952           ujsec = jsec - jsec % 4;
1953           i2 = k;
1954           for (ll = 1; ll <= i2; ll += 256)
1955             {
1956               /* Computing MIN */
1957               i3 = 256;
1958               i4 = k - ll + 1;
1959               lsec = min(i3,i4);
1960               ulsec = lsec - lsec % 2;
1961
1962               i3 = m;
1963               for (ii = 1; ii <= i3; ii += 256)
1964                 {
1965                   /* Computing MIN */
1966                   i4 = 256;
1967                   i5 = m - ii + 1;
1968                   isec = min(i4,i5);
1969                   uisec = isec - isec % 2;
1970                   i4 = ll + ulsec - 1;
1971                   for (l = ll; l <= i4; l += 2)
1972                     {
1973                       i5 = ii + uisec - 1;
1974                       for (i = ii; i <= i5; i += 2)
1975                         {
1976                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977                                         a[i + l * a_dim1];
1978                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979                                         a[i + (l + 1) * a_dim1];
1980                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981                                         a[i + 1 + l * a_dim1];
1982                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983                                         a[i + 1 + (l + 1) * a_dim1];
1984                         }
1985                       if (uisec < isec)
1986                         {
1987                           t1[l - ll + 1 + (isec << 8) - 257] =
1988                                     a[ii + isec - 1 + l * a_dim1];
1989                           t1[l - ll + 2 + (isec << 8) - 257] =
1990                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1991                         }
1992                     }
1993                   if (ulsec < lsec)
1994                     {
1995                       i4 = ii + isec - 1;
1996                       for (i = ii; i<= i4; ++i)
1997                         {
1998                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1999                                     a[i + (ll + lsec - 1) * a_dim1];
2000                         }
2001                     }
2002
2003                   uisec = isec - isec % 4;
2004                   i4 = jj + ujsec - 1;
2005                   for (j = jj; j <= i4; j += 4)
2006                     {
2007                       i5 = ii + uisec - 1;
2008                       for (i = ii; i <= i5; i += 4)
2009                         {
2010                           f11 = c[i + j * c_dim1];
2011                           f21 = c[i + 1 + j * c_dim1];
2012                           f12 = c[i + (j + 1) * c_dim1];
2013                           f22 = c[i + 1 + (j + 1) * c_dim1];
2014                           f13 = c[i + (j + 2) * c_dim1];
2015                           f23 = c[i + 1 + (j + 2) * c_dim1];
2016                           f14 = c[i + (j + 3) * c_dim1];
2017                           f24 = c[i + 1 + (j + 3) * c_dim1];
2018                           f31 = c[i + 2 + j * c_dim1];
2019                           f41 = c[i + 3 + j * c_dim1];
2020                           f32 = c[i + 2 + (j + 1) * c_dim1];
2021                           f42 = c[i + 3 + (j + 1) * c_dim1];
2022                           f33 = c[i + 2 + (j + 2) * c_dim1];
2023                           f43 = c[i + 3 + (j + 2) * c_dim1];
2024                           f34 = c[i + 2 + (j + 3) * c_dim1];
2025                           f44 = c[i + 3 + (j + 3) * c_dim1];
2026                           i6 = ll + lsec - 1;
2027                           for (l = ll; l <= i6; ++l)
2028                             {
2029                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030                                       * b[l + j * b_dim1];
2031                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032                                       * b[l + j * b_dim1];
2033                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034                                       * b[l + (j + 1) * b_dim1];
2035                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036                                       * b[l + (j + 1) * b_dim1];
2037                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038                                       * b[l + (j + 2) * b_dim1];
2039                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040                                       * b[l + (j + 2) * b_dim1];
2041                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042                                       * b[l + (j + 3) * b_dim1];
2043                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044                                       * b[l + (j + 3) * b_dim1];
2045                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046                                       * b[l + j * b_dim1];
2047                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048                                       * b[l + j * b_dim1];
2049                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050                                       * b[l + (j + 1) * b_dim1];
2051                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052                                       * b[l + (j + 1) * b_dim1];
2053                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054                                       * b[l + (j + 2) * b_dim1];
2055                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056                                       * b[l + (j + 2) * b_dim1];
2057                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058                                       * b[l + (j + 3) * b_dim1];
2059                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060                                       * b[l + (j + 3) * b_dim1];
2061                             }
2062                           c[i + j * c_dim1] = f11;
2063                           c[i + 1 + j * c_dim1] = f21;
2064                           c[i + (j + 1) * c_dim1] = f12;
2065                           c[i + 1 + (j + 1) * c_dim1] = f22;
2066                           c[i + (j + 2) * c_dim1] = f13;
2067                           c[i + 1 + (j + 2) * c_dim1] = f23;
2068                           c[i + (j + 3) * c_dim1] = f14;
2069                           c[i + 1 + (j + 3) * c_dim1] = f24;
2070                           c[i + 2 + j * c_dim1] = f31;
2071                           c[i + 3 + j * c_dim1] = f41;
2072                           c[i + 2 + (j + 1) * c_dim1] = f32;
2073                           c[i + 3 + (j + 1) * c_dim1] = f42;
2074                           c[i + 2 + (j + 2) * c_dim1] = f33;
2075                           c[i + 3 + (j + 2) * c_dim1] = f43;
2076                           c[i + 2 + (j + 3) * c_dim1] = f34;
2077                           c[i + 3 + (j + 3) * c_dim1] = f44;
2078                         }
2079                       if (uisec < isec)
2080                         {
2081                           i5 = ii + isec - 1;
2082                           for (i = ii + uisec; i <= i5; ++i)
2083                             {
2084                               f11 = c[i + j * c_dim1];
2085                               f12 = c[i + (j + 1) * c_dim1];
2086                               f13 = c[i + (j + 2) * c_dim1];
2087                               f14 = c[i + (j + 3) * c_dim1];
2088                               i6 = ll + lsec - 1;
2089                               for (l = ll; l <= i6; ++l)
2090                                 {
2091                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092                                           257] * b[l + j * b_dim1];
2093                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094                                           257] * b[l + (j + 1) * b_dim1];
2095                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096                                           257] * b[l + (j + 2) * b_dim1];
2097                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098                                           257] * b[l + (j + 3) * b_dim1];
2099                                 }
2100                               c[i + j * c_dim1] = f11;
2101                               c[i + (j + 1) * c_dim1] = f12;
2102                               c[i + (j + 2) * c_dim1] = f13;
2103                               c[i + (j + 3) * c_dim1] = f14;
2104                             }
2105                         }
2106                     }
2107                   if (ujsec < jsec)
2108                     {
2109                       i4 = jj + jsec - 1;
2110                       for (j = jj + ujsec; j <= i4; ++j)
2111                         {
2112                           i5 = ii + uisec - 1;
2113                           for (i = ii; i <= i5; i += 4)
2114                             {
2115                               f11 = c[i + j * c_dim1];
2116                               f21 = c[i + 1 + j * c_dim1];
2117                               f31 = c[i + 2 + j * c_dim1];
2118                               f41 = c[i + 3 + j * c_dim1];
2119                               i6 = ll + lsec - 1;
2120                               for (l = ll; l <= i6; ++l)
2121                                 {
2122                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123                                           257] * b[l + j * b_dim1];
2124                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125                                           257] * b[l + j * b_dim1];
2126                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127                                           257] * b[l + j * b_dim1];
2128                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129                                           257] * b[l + j * b_dim1];
2130                                 }
2131                               c[i + j * c_dim1] = f11;
2132                               c[i + 1 + j * c_dim1] = f21;
2133                               c[i + 2 + j * c_dim1] = f31;
2134                               c[i + 3 + j * c_dim1] = f41;
2135                             }
2136                           i5 = ii + isec - 1;
2137                           for (i = ii + uisec; i <= i5; ++i)
2138                             {
2139                               f11 = c[i + j * c_dim1];
2140                               i6 = ll + lsec - 1;
2141                               for (l = ll; l <= i6; ++l)
2142                                 {
2143                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144                                           257] * b[l + j * b_dim1];
2145                                 }
2146                               c[i + j * c_dim1] = f11;
2147                             }
2148                         }
2149                     }
2150                 }
2151             }
2152         }
2153       return;
2154     }
2155   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2156     {
2157       if (GFC_DESCRIPTOR_RANK (a) != 1)
2158         {
2159           const GFC_COMPLEX_8 *restrict abase_x;
2160           const GFC_COMPLEX_8 *restrict bbase_y;
2161           GFC_COMPLEX_8 *restrict dest_y;
2162           GFC_COMPLEX_8 s;
2163
2164           for (y = 0; y < ycount; y++)
2165             {
2166               bbase_y = &bbase[y*bystride];
2167               dest_y = &dest[y*rystride];
2168               for (x = 0; x < xcount; x++)
2169                 {
2170                   abase_x = &abase[x*axstride];
2171                   s = (GFC_COMPLEX_8) 0;
2172                   for (n = 0; n < count; n++)
2173                     s += abase_x[n] * bbase_y[n];
2174                   dest_y[x] = s;
2175                 }
2176             }
2177         }
2178       else
2179         {
2180           const GFC_COMPLEX_8 *restrict bbase_y;
2181           GFC_COMPLEX_8 s;
2182
2183           for (y = 0; y < ycount; y++)
2184             {
2185               bbase_y = &bbase[y*bystride];
2186               s = (GFC_COMPLEX_8) 0;
2187               for (n = 0; n < count; n++)
2188                 s += abase[n*axstride] * bbase_y[n];
2189               dest[y*rystride] = s;
2190             }
2191         }
2192     }
2193   else if (axstride < aystride)
2194     {
2195       for (y = 0; y < ycount; y++)
2196         for (x = 0; x < xcount; x++)
2197           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2198
2199       for (y = 0; y < ycount; y++)
2200         for (n = 0; n < count; n++)
2201           for (x = 0; x < xcount; x++)
2202             /* dest[x,y] += a[x,n] * b[n,y] */
2203             dest[x*rxstride + y*rystride] +=
2204                                         abase[x*axstride + n*aystride] *
2205                                         bbase[n*bxstride + y*bystride];
2206     }
2207   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2208     {
2209       const GFC_COMPLEX_8 *restrict bbase_y;
2210       GFC_COMPLEX_8 s;
2211
2212       for (y = 0; y < ycount; y++)
2213         {
2214           bbase_y = &bbase[y*bystride];
2215           s = (GFC_COMPLEX_8) 0;
2216           for (n = 0; n < count; n++)
2217             s += abase[n*axstride] * bbase_y[n*bxstride];
2218           dest[y*rxstride] = s;
2219         }
2220     }
2221   else
2222     {
2223       const GFC_COMPLEX_8 *restrict abase_x;
2224       const GFC_COMPLEX_8 *restrict bbase_y;
2225       GFC_COMPLEX_8 *restrict dest_y;
2226       GFC_COMPLEX_8 s;
2227
2228       for (y = 0; y < ycount; y++)
2229         {
2230           bbase_y = &bbase[y*bystride];
2231           dest_y = &dest[y*rystride];
2232           for (x = 0; x < xcount; x++)
2233             {
2234               abase_x = &abase[x*axstride];
2235               s = (GFC_COMPLEX_8) 0;
2236               for (n = 0; n < count; n++)
2237                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238               dest_y[x*rxstride] = s;
2239             }
2240         }
2241     }
2242 }
2243 #undef POW3
2244 #undef min
2245 #undef max
2246
2247
2248 /* Compiling main function, with selection code for the processor.  */
2249
2250 /* Currently, this is i386 only.  Adjust for other architectures.  */
2251
2252 #include <config/i386/cpuinfo.h>
2253 void matmul_c8 (gfc_array_c8 * const restrict retarray,
2254         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2255         int blas_limit, blas_call gemm)
2256 {
2257   static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
2258         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2259         int blas_limit, blas_call gemm) = NULL;
2260
2261   if (matmul_p == NULL)
2262     {
2263       matmul_p = matmul_c8_vanilla;
2264       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2265         {
2266           /* Run down the available processors in order of preference.  */
2267 #ifdef HAVE_AVX512F
2268           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2269             {
2270               matmul_p = matmul_c8_avx512f;
2271               goto tailcall;
2272             }
2273
2274 #endif  /* HAVE_AVX512F */
2275
2276 #ifdef HAVE_AVX2
2277           if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2278              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2279             {
2280               matmul_p = matmul_c8_avx2;
2281               goto tailcall;
2282             }
2283
2284 #endif
2285
2286 #ifdef HAVE_AVX
2287           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2288             {
2289               matmul_p = matmul_c8_avx;
2290               goto tailcall;
2291             }
2292 #endif  /* HAVE_AVX */
2293         }
2294    }
2295
2296 tailcall:
2297    (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2298 }
2299
2300 #else  /* Just the vanilla function.  */
2301
2302 void
2303 matmul_c8 (gfc_array_c8 * const restrict retarray,
2304         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2305         int blas_limit, blas_call gemm)
2306 {
2307   const GFC_COMPLEX_8 * restrict abase;
2308   const GFC_COMPLEX_8 * restrict bbase;
2309   GFC_COMPLEX_8 * restrict dest;
2310
2311   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2312   index_type x, y, n, count, xcount, ycount;
2313
2314   assert (GFC_DESCRIPTOR_RANK (a) == 2
2315           || GFC_DESCRIPTOR_RANK (b) == 2);
2316
2317 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2318
2319    Either A or B (but not both) can be rank 1:
2320
2321    o One-dimensional argument A is implicitly treated as a row matrix
2322      dimensioned [1,count], so xcount=1.
2323
2324    o One-dimensional argument B is implicitly treated as a column matrix
2325      dimensioned [count, 1], so ycount=1.
2326 */
2327
2328   if (retarray->base_addr == NULL)
2329     {
2330       if (GFC_DESCRIPTOR_RANK (a) == 1)
2331         {
2332           GFC_DIMENSION_SET(retarray->dim[0], 0,
2333                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2334         }
2335       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2336         {
2337           GFC_DIMENSION_SET(retarray->dim[0], 0,
2338                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2339         }
2340       else
2341         {
2342           GFC_DIMENSION_SET(retarray->dim[0], 0,
2343                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2344
2345           GFC_DIMENSION_SET(retarray->dim[1], 0,
2346                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2347                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2348         }
2349
2350       retarray->base_addr
2351         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
2352       retarray->offset = 0;
2353     }
2354   else if (unlikely (compile_options.bounds_check))
2355     {
2356       index_type ret_extent, arg_extent;
2357
2358       if (GFC_DESCRIPTOR_RANK (a) == 1)
2359         {
2360           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2361           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2362           if (arg_extent != ret_extent)
2363             runtime_error ("Incorrect extent in return array in"
2364                            " MATMUL intrinsic: is %ld, should be %ld",
2365                            (long int) ret_extent, (long int) arg_extent);
2366         }
2367       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2368         {
2369           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2370           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2371           if (arg_extent != ret_extent)
2372             runtime_error ("Incorrect extent in return array in"
2373                            " MATMUL intrinsic: is %ld, should be %ld",
2374                            (long int) ret_extent, (long int) arg_extent);
2375         }
2376       else
2377         {
2378           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2379           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2380           if (arg_extent != ret_extent)
2381             runtime_error ("Incorrect extent in return array in"
2382                            " MATMUL intrinsic for dimension 1:"
2383                            " is %ld, should be %ld",
2384                            (long int) ret_extent, (long int) arg_extent);
2385
2386           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2387           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2388           if (arg_extent != ret_extent)
2389             runtime_error ("Incorrect extent in return array in"
2390                            " MATMUL intrinsic for dimension 2:"
2391                            " is %ld, should be %ld",
2392                            (long int) ret_extent, (long int) arg_extent);
2393         }
2394     }
2395
2396
2397   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2398     {
2399       /* One-dimensional result may be addressed in the code below
2400          either as a row or a column matrix. We want both cases to
2401          work. */
2402       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2403     }
2404   else
2405     {
2406       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2407       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2408     }
2409
2410
2411   if (GFC_DESCRIPTOR_RANK (a) == 1)
2412     {
2413       /* Treat it as a a row matrix A[1,count]. */
2414       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2415       aystride = 1;
2416
2417       xcount = 1;
2418       count = GFC_DESCRIPTOR_EXTENT(a,0);
2419     }
2420   else
2421     {
2422       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2423       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2424
2425       count = GFC_DESCRIPTOR_EXTENT(a,1);
2426       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2427     }
2428
2429   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2430     {
2431       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2432         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2433     }
2434
2435   if (GFC_DESCRIPTOR_RANK (b) == 1)
2436     {
2437       /* Treat it as a column matrix B[count,1] */
2438       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2439
2440       /* bystride should never be used for 1-dimensional b.
2441          in case it is we want it to cause a segfault, rather than
2442          an incorrect result. */
2443       bystride = 0xDEADBEEF;
2444       ycount = 1;
2445     }
2446   else
2447     {
2448       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2449       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2450       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2451     }
2452
2453   abase = a->base_addr;
2454   bbase = b->base_addr;
2455   dest = retarray->base_addr;
2456
2457   /* Now that everything is set up, we perform the multiplication
2458      itself.  */
2459
2460 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2461 #define min(a,b) ((a) <= (b) ? (a) : (b))
2462 #define max(a,b) ((a) >= (b) ? (a) : (b))
2463
2464   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2465       && (bxstride == 1 || bystride == 1)
2466       && (((float) xcount) * ((float) ycount) * ((float) count)
2467           > POW3(blas_limit)))
2468     {
2469       const int m = xcount, n = ycount, k = count, ldc = rystride;
2470       const GFC_COMPLEX_8 one = 1, zero = 0;
2471       const int lda = (axstride == 1) ? aystride : axstride,
2472                 ldb = (bxstride == 1) ? bystride : bxstride;
2473
2474       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2475         {
2476           assert (gemm != NULL);
2477           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2478                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2479                 &ldc, 1, 1);
2480           return;
2481         }
2482     }
2483
2484   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2485     {
2486       /* This block of code implements a tuned matmul, derived from
2487          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2488
2489                Bo Kagstrom and Per Ling
2490                Department of Computing Science
2491                Umea University
2492                S-901 87 Umea, Sweden
2493
2494          from netlib.org, translated to C, and modified for matmul.m4.  */
2495
2496       const GFC_COMPLEX_8 *a, *b;
2497       GFC_COMPLEX_8 *c;
2498       const index_type m = xcount, n = ycount, k = count;
2499
2500       /* System generated locals */
2501       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2502                  i1, i2, i3, i4, i5, i6;
2503
2504       /* Local variables */
2505       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
2506                  f11, f12, f21, f22, f31, f32, f41, f42,
2507                  f13, f14, f23, f24, f33, f34, f43, f44;
2508       index_type i, j, l, ii, jj, ll;
2509       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2510
2511       a = abase;
2512       b = bbase;
2513       c = retarray->base_addr;
2514
2515       /* Parameter adjustments */
2516       c_dim1 = rystride;
2517       c_offset = 1 + c_dim1;
2518       c -= c_offset;
2519       a_dim1 = aystride;
2520       a_offset = 1 + a_dim1;
2521       a -= a_offset;
2522       b_dim1 = bystride;
2523       b_offset = 1 + b_dim1;
2524       b -= b_offset;
2525
2526       /* Early exit if possible */
2527       if (m == 0 || n == 0 || k == 0)
2528         return;
2529
2530       /* Empty c first.  */
2531       for (j=1; j<=n; j++)
2532         for (i=1; i<=m; i++)
2533           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
2534
2535       /* Start turning the crank. */
2536       i1 = n;
2537       for (jj = 1; jj <= i1; jj += 512)
2538         {
2539           /* Computing MIN */
2540           i2 = 512;
2541           i3 = n - jj + 1;
2542           jsec = min(i2,i3);
2543           ujsec = jsec - jsec % 4;
2544           i2 = k;
2545           for (ll = 1; ll <= i2; ll += 256)
2546             {
2547               /* Computing MIN */
2548               i3 = 256;
2549               i4 = k - ll + 1;
2550               lsec = min(i3,i4);
2551               ulsec = lsec - lsec % 2;
2552
2553               i3 = m;
2554               for (ii = 1; ii <= i3; ii += 256)
2555                 {
2556                   /* Computing MIN */
2557                   i4 = 256;
2558                   i5 = m - ii + 1;
2559                   isec = min(i4,i5);
2560                   uisec = isec - isec % 2;
2561                   i4 = ll + ulsec - 1;
2562                   for (l = ll; l <= i4; l += 2)
2563                     {
2564                       i5 = ii + uisec - 1;
2565                       for (i = ii; i <= i5; i += 2)
2566                         {
2567                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2568                                         a[i + l * a_dim1];
2569                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2570                                         a[i + (l + 1) * a_dim1];
2571                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2572                                         a[i + 1 + l * a_dim1];
2573                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2574                                         a[i + 1 + (l + 1) * a_dim1];
2575                         }
2576                       if (uisec < isec)
2577                         {
2578                           t1[l - ll + 1 + (isec << 8) - 257] =
2579                                     a[ii + isec - 1 + l * a_dim1];
2580                           t1[l - ll + 2 + (isec << 8) - 257] =
2581                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2582                         }
2583                     }
2584                   if (ulsec < lsec)
2585                     {
2586                       i4 = ii + isec - 1;
2587                       for (i = ii; i<= i4; ++i)
2588                         {
2589                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2590                                     a[i + (ll + lsec - 1) * a_dim1];
2591                         }
2592                     }
2593
2594                   uisec = isec - isec % 4;
2595                   i4 = jj + ujsec - 1;
2596                   for (j = jj; j <= i4; j += 4)
2597                     {
2598                       i5 = ii + uisec - 1;
2599                       for (i = ii; i <= i5; i += 4)
2600                         {
2601                           f11 = c[i + j * c_dim1];
2602                           f21 = c[i + 1 + j * c_dim1];
2603                           f12 = c[i + (j + 1) * c_dim1];
2604                           f22 = c[i + 1 + (j + 1) * c_dim1];
2605                           f13 = c[i + (j + 2) * c_dim1];
2606                           f23 = c[i + 1 + (j + 2) * c_dim1];
2607                           f14 = c[i + (j + 3) * c_dim1];
2608                           f24 = c[i + 1 + (j + 3) * c_dim1];
2609                           f31 = c[i + 2 + j * c_dim1];
2610                           f41 = c[i + 3 + j * c_dim1];
2611                           f32 = c[i + 2 + (j + 1) * c_dim1];
2612                           f42 = c[i + 3 + (j + 1) * c_dim1];
2613                           f33 = c[i + 2 + (j + 2) * c_dim1];
2614                           f43 = c[i + 3 + (j + 2) * c_dim1];
2615                           f34 = c[i + 2 + (j + 3) * c_dim1];
2616                           f44 = c[i + 3 + (j + 3) * c_dim1];
2617                           i6 = ll + lsec - 1;
2618                           for (l = ll; l <= i6; ++l)
2619                             {
2620                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2621                                       * b[l + j * b_dim1];
2622                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2623                                       * b[l + j * b_dim1];
2624                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2625                                       * b[l + (j + 1) * b_dim1];
2626                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2627                                       * b[l + (j + 1) * b_dim1];
2628                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2629                                       * b[l + (j + 2) * b_dim1];
2630                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2631                                       * b[l + (j + 2) * b_dim1];
2632                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2633                                       * b[l + (j + 3) * b_dim1];
2634                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2635                                       * b[l + (j + 3) * b_dim1];
2636                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2637                                       * b[l + j * b_dim1];
2638                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2639                                       * b[l + j * b_dim1];
2640                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2641                                       * b[l + (j + 1) * b_dim1];
2642                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2643                                       * b[l + (j + 1) * b_dim1];
2644                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2645                                       * b[l + (j + 2) * b_dim1];
2646                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2647                                       * b[l + (j + 2) * b_dim1];
2648                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2649                                       * b[l + (j + 3) * b_dim1];
2650                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2651                                       * b[l + (j + 3) * b_dim1];
2652                             }
2653                           c[i + j * c_dim1] = f11;
2654                           c[i + 1 + j * c_dim1] = f21;
2655                           c[i + (j + 1) * c_dim1] = f12;
2656                           c[i + 1 + (j + 1) * c_dim1] = f22;
2657                           c[i + (j + 2) * c_dim1] = f13;
2658                           c[i + 1 + (j + 2) * c_dim1] = f23;
2659                           c[i + (j + 3) * c_dim1] = f14;
2660                           c[i + 1 + (j + 3) * c_dim1] = f24;
2661                           c[i + 2 + j * c_dim1] = f31;
2662                           c[i + 3 + j * c_dim1] = f41;
2663                           c[i + 2 + (j + 1) * c_dim1] = f32;
2664                           c[i + 3 + (j + 1) * c_dim1] = f42;
2665                           c[i + 2 + (j + 2) * c_dim1] = f33;
2666                           c[i + 3 + (j + 2) * c_dim1] = f43;
2667                           c[i + 2 + (j + 3) * c_dim1] = f34;
2668                           c[i + 3 + (j + 3) * c_dim1] = f44;
2669                         }
2670                       if (uisec < isec)
2671                         {
2672                           i5 = ii + isec - 1;
2673                           for (i = ii + uisec; i <= i5; ++i)
2674                             {
2675                               f11 = c[i + j * c_dim1];
2676                               f12 = c[i + (j + 1) * c_dim1];
2677                               f13 = c[i + (j + 2) * c_dim1];
2678                               f14 = c[i + (j + 3) * c_dim1];
2679                               i6 = ll + lsec - 1;
2680                               for (l = ll; l <= i6; ++l)
2681                                 {
2682                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2683                                           257] * b[l + j * b_dim1];
2684                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685                                           257] * b[l + (j + 1) * b_dim1];
2686                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687                                           257] * b[l + (j + 2) * b_dim1];
2688                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689                                           257] * b[l + (j + 3) * b_dim1];
2690                                 }
2691                               c[i + j * c_dim1] = f11;
2692                               c[i + (j + 1) * c_dim1] = f12;
2693                               c[i + (j + 2) * c_dim1] = f13;
2694                               c[i + (j + 3) * c_dim1] = f14;
2695                             }
2696                         }
2697                     }
2698                   if (ujsec < jsec)
2699                     {
2700                       i4 = jj + jsec - 1;
2701                       for (j = jj + ujsec; j <= i4; ++j)
2702                         {
2703                           i5 = ii + uisec - 1;
2704                           for (i = ii; i <= i5; i += 4)
2705                             {
2706                               f11 = c[i + j * c_dim1];
2707                               f21 = c[i + 1 + j * c_dim1];
2708                               f31 = c[i + 2 + j * c_dim1];
2709                               f41 = c[i + 3 + j * c_dim1];
2710                               i6 = ll + lsec - 1;
2711                               for (l = ll; l <= i6; ++l)
2712                                 {
2713                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2714                                           257] * b[l + j * b_dim1];
2715                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2716                                           257] * b[l + j * b_dim1];
2717                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2718                                           257] * b[l + j * b_dim1];
2719                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2720                                           257] * b[l + j * b_dim1];
2721                                 }
2722                               c[i + j * c_dim1] = f11;
2723                               c[i + 1 + j * c_dim1] = f21;
2724                               c[i + 2 + j * c_dim1] = f31;
2725                               c[i + 3 + j * c_dim1] = f41;
2726                             }
2727                           i5 = ii + isec - 1;
2728                           for (i = ii + uisec; i <= i5; ++i)
2729                             {
2730                               f11 = c[i + j * c_dim1];
2731                               i6 = ll + lsec - 1;
2732                               for (l = ll; l <= i6; ++l)
2733                                 {
2734                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2735                                           257] * b[l + j * b_dim1];
2736                                 }
2737                               c[i + j * c_dim1] = f11;
2738                             }
2739                         }
2740                     }
2741                 }
2742             }
2743         }
2744       return;
2745     }
2746   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2747     {
2748       if (GFC_DESCRIPTOR_RANK (a) != 1)
2749         {
2750           const GFC_COMPLEX_8 *restrict abase_x;
2751           const GFC_COMPLEX_8 *restrict bbase_y;
2752           GFC_COMPLEX_8 *restrict dest_y;
2753           GFC_COMPLEX_8 s;
2754
2755           for (y = 0; y < ycount; y++)
2756             {
2757               bbase_y = &bbase[y*bystride];
2758               dest_y = &dest[y*rystride];
2759               for (x = 0; x < xcount; x++)
2760                 {
2761                   abase_x = &abase[x*axstride];
2762                   s = (GFC_COMPLEX_8) 0;
2763                   for (n = 0; n < count; n++)
2764                     s += abase_x[n] * bbase_y[n];
2765                   dest_y[x] = s;
2766                 }
2767             }
2768         }
2769       else
2770         {
2771           const GFC_COMPLEX_8 *restrict bbase_y;
2772           GFC_COMPLEX_8 s;
2773
2774           for (y = 0; y < ycount; y++)
2775             {
2776               bbase_y = &bbase[y*bystride];
2777               s = (GFC_COMPLEX_8) 0;
2778               for (n = 0; n < count; n++)
2779                 s += abase[n*axstride] * bbase_y[n];
2780               dest[y*rystride] = s;
2781             }
2782         }
2783     }
2784   else if (axstride < aystride)
2785     {
2786       for (y = 0; y < ycount; y++)
2787         for (x = 0; x < xcount; x++)
2788           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2789
2790       for (y = 0; y < ycount; y++)
2791         for (n = 0; n < count; n++)
2792           for (x = 0; x < xcount; x++)
2793             /* dest[x,y] += a[x,n] * b[n,y] */
2794             dest[x*rxstride + y*rystride] +=
2795                                         abase[x*axstride + n*aystride] *
2796                                         bbase[n*bxstride + y*bystride];
2797     }
2798   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2799     {
2800       const GFC_COMPLEX_8 *restrict bbase_y;
2801       GFC_COMPLEX_8 s;
2802
2803       for (y = 0; y < ycount; y++)
2804         {
2805           bbase_y = &bbase[y*bystride];
2806           s = (GFC_COMPLEX_8) 0;
2807           for (n = 0; n < count; n++)
2808             s += abase[n*axstride] * bbase_y[n*bxstride];
2809           dest[y*rxstride] = s;
2810         }
2811     }
2812   else
2813     {
2814       const GFC_COMPLEX_8 *restrict abase_x;
2815       const GFC_COMPLEX_8 *restrict bbase_y;
2816       GFC_COMPLEX_8 *restrict dest_y;
2817       GFC_COMPLEX_8 s;
2818
2819       for (y = 0; y < ycount; y++)
2820         {
2821           bbase_y = &bbase[y*bystride];
2822           dest_y = &dest[y*rystride];
2823           for (x = 0; x < xcount; x++)
2824             {
2825               abase_x = &abase[x*axstride];
2826               s = (GFC_COMPLEX_8) 0;
2827               for (n = 0; n < count; n++)
2828                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2829               dest_y[x*rxstride] = s;
2830             }
2831         }
2832     }
2833 }
2834 #undef POW3
2835 #undef min
2836 #undef max
2837
2838 #endif
2839 #endif
2840