libgfortran/generated/matmul_c8.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_COMPLEX_8)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
  39                           const int *, const GFC_COMPLEX_8 *, const int *,
  40                           const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
  73         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_c8);
  76
  77
  78
  79
  80 /* Put exhaustive list of possible architectures here here, ORed together.  */
  81
  82 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  83
  84 #ifdef HAVE_AVX
  85 static void
  86 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
  87         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  88         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  89 static void
  90 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
  91         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
  92         int blas_limit, blas_call gemm)
  93 {
  94   const GFC_COMPLEX_8 * restrict abase;
  95   const GFC_COMPLEX_8 * restrict bbase;
  96   GFC_COMPLEX_8 * restrict dest;
  97
  98   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  99   index_type x, y, n, count, xcount, ycount;
 100
 101   assert (GFC_DESCRIPTOR_RANK (a) == 2
 102           || GFC_DESCRIPTOR_RANK (b) == 2);
 103
 104 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 105
 106    Either A or B (but not both) can be rank 1:
 107
 108    o One-dimensional argument A is implicitly treated as a row matrix
 109      dimensioned [1,count], so xcount=1.
 110
 111    o One-dimensional argument B is implicitly treated as a column matrix
 112      dimensioned [count, 1], so ycount=1.
 113 */
 114
 115   if (retarray->base_addr == NULL)
 116     {
 117       if (GFC_DESCRIPTOR_RANK (a) == 1)
 118         {
 119           GFC_DIMENSION_SET(retarray->dim[0], 0,
 120                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 121         }
 122       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 123         {
 124           GFC_DIMENSION_SET(retarray->dim[0], 0,
 125                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 126         }
 127       else
 128         {
 129           GFC_DIMENSION_SET(retarray->dim[0], 0,
 130                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 131
 132           GFC_DIMENSION_SET(retarray->dim[1], 0,
 133                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 134                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 135         }
 136
 137       retarray->base_addr
 138         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
 139       retarray->offset = 0;
 140     }
 141   else if (unlikely (compile_options.bounds_check))
 142     {
 143       index_type ret_extent, arg_extent;
 144
 145       if (GFC_DESCRIPTOR_RANK (a) == 1)
 146         {
 147           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 148           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 149           if (arg_extent != ret_extent)
 150             runtime_error ("Incorrect extent in return array in"
 151                            " MATMUL intrinsic: is %ld, should be %ld",
 152                            (long int) ret_extent, (long int) arg_extent);
 153         }
 154       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 155         {
 156           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 157           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 158           if (arg_extent != ret_extent)
 159             runtime_error ("Incorrect extent in return array in"
 160                            " MATMUL intrinsic: is %ld, should be %ld",
 161                            (long int) ret_extent, (long int) arg_extent);
 162         }
 163       else
 164         {
 165           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 166           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 167           if (arg_extent != ret_extent)
 168             runtime_error ("Incorrect extent in return array in"
 169                            " MATMUL intrinsic for dimension 1:"
 170                            " is %ld, should be %ld",
 171                            (long int) ret_extent, (long int) arg_extent);
 172
 173           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 174           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 175           if (arg_extent != ret_extent)
 176             runtime_error ("Incorrect extent in return array in"
 177                            " MATMUL intrinsic for dimension 2:"
 178                            " is %ld, should be %ld",
 179                            (long int) ret_extent, (long int) arg_extent);
 180         }
 181     }
 182
 183
 184   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 185     {
 186       /* One-dimensional result may be addressed in the code below
 187          either as a row or a column matrix. We want both cases to
 188          work. */
 189       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 190     }
 191   else
 192     {
 193       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 194       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 195     }
 196
 197
 198   if (GFC_DESCRIPTOR_RANK (a) == 1)
 199     {
 200       /* Treat it as a a row matrix A[1,count]. */
 201       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 202       aystride = 1;
 203
 204       xcount = 1;
 205       count = GFC_DESCRIPTOR_EXTENT(a,0);
 206     }
 207   else
 208     {
 209       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 210       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 211
 212       count = GFC_DESCRIPTOR_EXTENT(a,1);
 213       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 214     }
 215
 216   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 217     {
 218       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 219         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 220     }
 221
 222   if (GFC_DESCRIPTOR_RANK (b) == 1)
 223     {
 224       /* Treat it as a column matrix B[count,1] */
 225       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 226
 227       /* bystride should never be used for 1-dimensional b.
 228          in case it is we want it to cause a segfault, rather than
 229          an incorrect result. */
 230       bystride = 0xDEADBEEF;
 231       ycount = 1;
 232     }
 233   else
 234     {
 235       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 236       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 237       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 238     }
 239
 240   abase = a->base_addr;
 241   bbase = b->base_addr;
 242   dest = retarray->base_addr;
 243
 244   /* Now that everything is set up, we perform the multiplication
 245      itself.  */
 246
 247 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 248 #define min(a,b) ((a) <= (b) ? (a) : (b))
 249 #define max(a,b) ((a) >= (b) ? (a) : (b))
 250
 251   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 252       && (bxstride == 1 || bystride == 1)
 253       && (((float) xcount) * ((float) ycount) * ((float) count)
 254           > POW3(blas_limit)))
 255     {
 256       const int m = xcount, n = ycount, k = count, ldc = rystride;
 257       const GFC_COMPLEX_8 one = 1, zero = 0;
 258       const int lda = (axstride == 1) ? aystride : axstride,
 259                 ldb = (bxstride == 1) ? bystride : bxstride;
 260
 261       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 262         {
 263           assert (gemm != NULL);
 264           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 265                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 266                 &ldc, 1, 1);
 267           return;
 268         }
 269     }
 270
 271   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 272     {
 273       /* This block of code implements a tuned matmul, derived from
 274          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 275
 276                Bo Kagstrom and Per Ling
 277                Department of Computing Science
 278                Umea University
 279                S-901 87 Umea, Sweden
 280
 281          from netlib.org, translated to C, and modified for matmul.m4.  */
 282
 283       const GFC_COMPLEX_8 *a, *b;
 284       GFC_COMPLEX_8 *c;
 285       const index_type m = xcount, n = ycount, k = count;
 286
 287       /* System generated locals */
 288       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 289                  i1, i2, i3, i4, i5, i6;
 290
 291       /* Local variables */
 292       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
 293                  f11, f12, f21, f22, f31, f32, f41, f42,
 294                  f13, f14, f23, f24, f33, f34, f43, f44;
 295       index_type i, j, l, ii, jj, ll;
 296       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 297
 298       a = abase;
 299       b = bbase;
 300       c = retarray->base_addr;
 301
 302       /* Parameter adjustments */
 303       c_dim1 = rystride;
 304       c_offset = 1 + c_dim1;
 305       c -= c_offset;
 306       a_dim1 = aystride;
 307       a_offset = 1 + a_dim1;
 308       a -= a_offset;
 309       b_dim1 = bystride;
 310       b_offset = 1 + b_dim1;
 311       b -= b_offset;
 312
 313       /* Early exit if possible */
 314       if (m == 0 || n == 0 || k == 0)
 315         return;
 316
 317       /* Empty c first.  */
 318       for (j=1; j<=n; j++)
 319         for (i=1; i<=m; i++)
 320           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
 321
 322       /* Start turning the crank. */
 323       i1 = n;
 324       for (jj = 1; jj <= i1; jj += 512)
 325         {
 326           /* Computing MIN */
 327           i2 = 512;
 328           i3 = n - jj + 1;
 329           jsec = min(i2,i3);
 330           ujsec = jsec - jsec % 4;
 331           i2 = k;
 332           for (ll = 1; ll <= i2; ll += 256)
 333             {
 334               /* Computing MIN */
 335               i3 = 256;
 336               i4 = k - ll + 1;
 337               lsec = min(i3,i4);
 338               ulsec = lsec - lsec % 2;
 339
 340               i3 = m;
 341               for (ii = 1; ii <= i3; ii += 256)
 342                 {
 343                   /* Computing MIN */
 344                   i4 = 256;
 345                   i5 = m - ii + 1;
 346                   isec = min(i4,i5);
 347                   uisec = isec - isec % 2;
 348                   i4 = ll + ulsec - 1;
 349                   for (l = ll; l <= i4; l += 2)
 350                     {
 351                       i5 = ii + uisec - 1;
 352                       for (i = ii; i <= i5; i += 2)
 353                         {
 354                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 355                                         a[i + l * a_dim1];
 356                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 357                                         a[i + (l + 1) * a_dim1];
 358                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 359                                         a[i + 1 + l * a_dim1];
 360                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 361                                         a[i + 1 + (l + 1) * a_dim1];
 362                         }
 363                       if (uisec < isec)
 364                         {
 365                           t1[l - ll + 1 + (isec << 8) - 257] =
 366                                     a[ii + isec - 1 + l * a_dim1];
 367                           t1[l - ll + 2 + (isec << 8) - 257] =
 368                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 369                         }
 370                     }
 371                   if (ulsec < lsec)
 372                     {
 373                       i4 = ii + isec - 1;
 374                       for (i = ii; i<= i4; ++i)
 375                         {
 376                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 377                                     a[i + (ll + lsec - 1) * a_dim1];
 378                         }
 379                     }
 380
 381                   uisec = isec - isec % 4;
 382                   i4 = jj + ujsec - 1;
 383                   for (j = jj; j <= i4; j += 4)
 384                     {
 385                       i5 = ii + uisec - 1;
 386                       for (i = ii; i <= i5; i += 4)
 387                         {
 388                           f11 = c[i + j * c_dim1];
 389                           f21 = c[i + 1 + j * c_dim1];
 390                           f12 = c[i + (j + 1) * c_dim1];
 391                           f22 = c[i + 1 + (j + 1) * c_dim1];
 392                           f13 = c[i + (j + 2) * c_dim1];
 393                           f23 = c[i + 1 + (j + 2) * c_dim1];
 394                           f14 = c[i + (j + 3) * c_dim1];
 395                           f24 = c[i + 1 + (j + 3) * c_dim1];
 396                           f31 = c[i + 2 + j * c_dim1];
 397                           f41 = c[i + 3 + j * c_dim1];
 398                           f32 = c[i + 2 + (j + 1) * c_dim1];
 399                           f42 = c[i + 3 + (j + 1) * c_dim1];
 400                           f33 = c[i + 2 + (j + 2) * c_dim1];
 401                           f43 = c[i + 3 + (j + 2) * c_dim1];
 402                           f34 = c[i + 2 + (j + 3) * c_dim1];
 403                           f44 = c[i + 3 + (j + 3) * c_dim1];
 404                           i6 = ll + lsec - 1;
 405                           for (l = ll; l <= i6; ++l)
 406                             {
 407                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 408                                       * b[l + j * b_dim1];
 409                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 410                                       * b[l + j * b_dim1];
 411                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 412                                       * b[l + (j + 1) * b_dim1];
 413                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 414                                       * b[l + (j + 1) * b_dim1];
 415                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 416                                       * b[l + (j + 2) * b_dim1];
 417                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 418                                       * b[l + (j + 2) * b_dim1];
 419                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 420                                       * b[l + (j + 3) * b_dim1];
 421                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 422                                       * b[l + (j + 3) * b_dim1];
 423                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 424                                       * b[l + j * b_dim1];
 425                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 426                                       * b[l + j * b_dim1];
 427                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 428                                       * b[l + (j + 1) * b_dim1];
 429                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 430                                       * b[l + (j + 1) * b_dim1];
 431                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 432                                       * b[l + (j + 2) * b_dim1];
 433                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 434                                       * b[l + (j + 2) * b_dim1];
 435                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 436                                       * b[l + (j + 3) * b_dim1];
 437                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 438                                       * b[l + (j + 3) * b_dim1];
 439                             }
 440                           c[i + j * c_dim1] = f11;
 441                           c[i + 1 + j * c_dim1] = f21;
 442                           c[i + (j + 1) * c_dim1] = f12;
 443                           c[i + 1 + (j + 1) * c_dim1] = f22;
 444                           c[i + (j + 2) * c_dim1] = f13;
 445                           c[i + 1 + (j + 2) * c_dim1] = f23;
 446                           c[i + (j + 3) * c_dim1] = f14;
 447                           c[i + 1 + (j + 3) * c_dim1] = f24;
 448                           c[i + 2 + j * c_dim1] = f31;
 449                           c[i + 3 + j * c_dim1] = f41;
 450                           c[i + 2 + (j + 1) * c_dim1] = f32;
 451                           c[i + 3 + (j + 1) * c_dim1] = f42;
 452                           c[i + 2 + (j + 2) * c_dim1] = f33;
 453                           c[i + 3 + (j + 2) * c_dim1] = f43;
 454                           c[i + 2 + (j + 3) * c_dim1] = f34;
 455                           c[i + 3 + (j + 3) * c_dim1] = f44;
 456                         }
 457                       if (uisec < isec)
 458                         {
 459                           i5 = ii + isec - 1;
 460                           for (i = ii + uisec; i <= i5; ++i)
 461                             {
 462                               f11 = c[i + j * c_dim1];
 463                               f12 = c[i + (j + 1) * c_dim1];
 464                               f13 = c[i + (j + 2) * c_dim1];
 465                               f14 = c[i + (j + 3) * c_dim1];
 466                               i6 = ll + lsec - 1;
 467                               for (l = ll; l <= i6; ++l)
 468                                 {
 469                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 470                                           257] * b[l + j * b_dim1];
 471                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 472                                           257] * b[l + (j + 1) * b_dim1];
 473                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 474                                           257] * b[l + (j + 2) * b_dim1];
 475                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 476                                           257] * b[l + (j + 3) * b_dim1];
 477                                 }
 478                               c[i + j * c_dim1] = f11;
 479                               c[i + (j + 1) * c_dim1] = f12;
 480                               c[i + (j + 2) * c_dim1] = f13;
 481                               c[i + (j + 3) * c_dim1] = f14;
 482                             }
 483                         }
 484                     }
 485                   if (ujsec < jsec)
 486                     {
 487                       i4 = jj + jsec - 1;
 488                       for (j = jj + ujsec; j <= i4; ++j)
 489                         {
 490                           i5 = ii + uisec - 1;
 491                           for (i = ii; i <= i5; i += 4)
 492                             {
 493                               f11 = c[i + j * c_dim1];
 494                               f21 = c[i + 1 + j * c_dim1];
 495                               f31 = c[i + 2 + j * c_dim1];
 496                               f41 = c[i + 3 + j * c_dim1];
 497                               i6 = ll + lsec - 1;
 498                               for (l = ll; l <= i6; ++l)
 499                                 {
 500                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 501                                           257] * b[l + j * b_dim1];
 502                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 503                                           257] * b[l + j * b_dim1];
 504                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 505                                           257] * b[l + j * b_dim1];
 506                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 507                                           257] * b[l + j * b_dim1];
 508                                 }
 509                               c[i + j * c_dim1] = f11;
 510                               c[i + 1 + j * c_dim1] = f21;
 511                               c[i + 2 + j * c_dim1] = f31;
 512                               c[i + 3 + j * c_dim1] = f41;
 513                             }
 514                           i5 = ii + isec - 1;
 515                           for (i = ii + uisec; i <= i5; ++i)
 516                             {
 517                               f11 = c[i + j * c_dim1];
 518                               i6 = ll + lsec - 1;
 519                               for (l = ll; l <= i6; ++l)
 520                                 {
 521                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 522                                           257] * b[l + j * b_dim1];
 523                                 }
 524                               c[i + j * c_dim1] = f11;
 525                             }
 526                         }
 527                     }
 528                 }
 529             }
 530         }
 531       return;
 532     }
 533   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 534     {
 535       if (GFC_DESCRIPTOR_RANK (a) != 1)
 536         {
 537           const GFC_COMPLEX_8 *restrict abase_x;
 538           const GFC_COMPLEX_8 *restrict bbase_y;
 539           GFC_COMPLEX_8 *restrict dest_y;
 540           GFC_COMPLEX_8 s;
 541
 542           for (y = 0; y < ycount; y++)
 543             {
 544               bbase_y = &bbase[y*bystride];
 545               dest_y = &dest[y*rystride];
 546               for (x = 0; x < xcount; x++)
 547                 {
 548                   abase_x = &abase[x*axstride];
 549                   s = (GFC_COMPLEX_8) 0;
 550                   for (n = 0; n < count; n++)
 551                     s += abase_x[n] * bbase_y[n];
 552                   dest_y[x] = s;
 553                 }
 554             }
 555         }
 556       else
 557         {
 558           const GFC_COMPLEX_8 *restrict bbase_y;
 559           GFC_COMPLEX_8 s;
 560
 561           for (y = 0; y < ycount; y++)
 562             {
 563               bbase_y = &bbase[y*bystride];
 564               s = (GFC_COMPLEX_8) 0;
 565               for (n = 0; n < count; n++)
 566                 s += abase[n*axstride] * bbase_y[n];
 567               dest[y*rystride] = s;
 568             }
 569         }
 570     }
 571   else if (axstride < aystride)
 572     {
 573       for (y = 0; y < ycount; y++)
 574         for (x = 0; x < xcount; x++)
 575           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
 576
 577       for (y = 0; y < ycount; y++)
 578         for (n = 0; n < count; n++)
 579           for (x = 0; x < xcount; x++)
 580             /* dest[x,y] += a[x,n] * b[n,y] */
 581             dest[x*rxstride + y*rystride] +=
 582                                         abase[x*axstride + n*aystride] *
 583                                         bbase[n*bxstride + y*bystride];
 584     }
 585   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 586     {
 587       const GFC_COMPLEX_8 *restrict bbase_y;
 588       GFC_COMPLEX_8 s;
 589
 590       for (y = 0; y < ycount; y++)
 591         {
 592           bbase_y = &bbase[y*bystride];
 593           s = (GFC_COMPLEX_8) 0;
 594           for (n = 0; n < count; n++)
 595             s += abase[n*axstride] * bbase_y[n*bxstride];
 596           dest[y*rxstride] = s;
 597         }
 598     }
 599   else
 600     {
 601       const GFC_COMPLEX_8 *restrict abase_x;
 602       const GFC_COMPLEX_8 *restrict bbase_y;
 603       GFC_COMPLEX_8 *restrict dest_y;
 604       GFC_COMPLEX_8 s;
 605
 606       for (y = 0; y < ycount; y++)
 607         {
 608           bbase_y = &bbase[y*bystride];
 609           dest_y = &dest[y*rystride];
 610           for (x = 0; x < xcount; x++)
 611             {
 612               abase_x = &abase[x*axstride];
 613               s = (GFC_COMPLEX_8) 0;
 614               for (n = 0; n < count; n++)
 615                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 616               dest_y[x*rxstride] = s;
 617             }
 618         }
 619     }
 620 }
 621 #undef POW3
 622 #undef min
 623 #undef max
 624
 625 #endif /* HAVE_AVX */
 626
 627 #ifdef HAVE_AVX2
 628 static void
 629 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
 630         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 631         int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
 632 static void
 633 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
 634         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 635         int blas_limit, blas_call gemm)
 636 {
 637   const GFC_COMPLEX_8 * restrict abase;
 638   const GFC_COMPLEX_8 * restrict bbase;
 639   GFC_COMPLEX_8 * restrict dest;
 640
 641   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 642   index_type x, y, n, count, xcount, ycount;
 643
 644   assert (GFC_DESCRIPTOR_RANK (a) == 2
 645           || GFC_DESCRIPTOR_RANK (b) == 2);
 646
 647 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 648
 649    Either A or B (but not both) can be rank 1:
 650
 651    o One-dimensional argument A is implicitly treated as a row matrix
 652      dimensioned [1,count], so xcount=1.
 653
 654    o One-dimensional argument B is implicitly treated as a column matrix
 655      dimensioned [count, 1], so ycount=1.
 656 */
 657
 658   if (retarray->base_addr == NULL)
 659     {
 660       if (GFC_DESCRIPTOR_RANK (a) == 1)
 661         {
 662           GFC_DIMENSION_SET(retarray->dim[0], 0,
 663                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 664         }
 665       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 666         {
 667           GFC_DIMENSION_SET(retarray->dim[0], 0,
 668                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 669         }
 670       else
 671         {
 672           GFC_DIMENSION_SET(retarray->dim[0], 0,
 673                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 674
 675           GFC_DIMENSION_SET(retarray->dim[1], 0,
 676                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 677                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 678         }
 679
 680       retarray->base_addr
 681         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
 682       retarray->offset = 0;
 683     }
 684   else if (unlikely (compile_options.bounds_check))
 685     {
 686       index_type ret_extent, arg_extent;
 687
 688       if (GFC_DESCRIPTOR_RANK (a) == 1)
 689         {
 690           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 691           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 692           if (arg_extent != ret_extent)
 693             runtime_error ("Incorrect extent in return array in"
 694                            " MATMUL intrinsic: is %ld, should be %ld",
 695                            (long int) ret_extent, (long int) arg_extent);
 696         }
 697       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 698         {
 699           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 700           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 701           if (arg_extent != ret_extent)
 702             runtime_error ("Incorrect extent in return array in"
 703                            " MATMUL intrinsic: is %ld, should be %ld",
 704                            (long int) ret_extent, (long int) arg_extent);
 705         }
 706       else
 707         {
 708           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 709           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 710           if (arg_extent != ret_extent)
 711             runtime_error ("Incorrect extent in return array in"
 712                            " MATMUL intrinsic for dimension 1:"
 713                            " is %ld, should be %ld",
 714                            (long int) ret_extent, (long int) arg_extent);
 715
 716           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 717           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 718           if (arg_extent != ret_extent)
 719             runtime_error ("Incorrect extent in return array in"
 720                            " MATMUL intrinsic for dimension 2:"
 721                            " is %ld, should be %ld",
 722                            (long int) ret_extent, (long int) arg_extent);
 723         }
 724     }
 725
 726
 727   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 728     {
 729       /* One-dimensional result may be addressed in the code below
 730          either as a row or a column matrix. We want both cases to
 731          work. */
 732       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 733     }
 734   else
 735     {
 736       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 737       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 738     }
 739
 740
 741   if (GFC_DESCRIPTOR_RANK (a) == 1)
 742     {
 743       /* Treat it as a a row matrix A[1,count]. */
 744       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 745       aystride = 1;
 746
 747       xcount = 1;
 748       count = GFC_DESCRIPTOR_EXTENT(a,0);
 749     }
 750   else
 751     {
 752       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 753       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 754
 755       count = GFC_DESCRIPTOR_EXTENT(a,1);
 756       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 757     }
 758
 759   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 760     {
 761       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 762         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 763     }
 764
 765   if (GFC_DESCRIPTOR_RANK (b) == 1)
 766     {
 767       /* Treat it as a column matrix B[count,1] */
 768       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 769
 770       /* bystride should never be used for 1-dimensional b.
 771          in case it is we want it to cause a segfault, rather than
 772          an incorrect result. */
 773       bystride = 0xDEADBEEF;
 774       ycount = 1;
 775     }
 776   else
 777     {
 778       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 779       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 780       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 781     }
 782
 783   abase = a->base_addr;
 784   bbase = b->base_addr;
 785   dest = retarray->base_addr;
 786
 787   /* Now that everything is set up, we perform the multiplication
 788      itself.  */
 789
 790 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 791 #define min(a,b) ((a) <= (b) ? (a) : (b))
 792 #define max(a,b) ((a) >= (b) ? (a) : (b))
 793
 794   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 795       && (bxstride == 1 || bystride == 1)
 796       && (((float) xcount) * ((float) ycount) * ((float) count)
 797           > POW3(blas_limit)))
 798     {
 799       const int m = xcount, n = ycount, k = count, ldc = rystride;
 800       const GFC_COMPLEX_8 one = 1, zero = 0;
 801       const int lda = (axstride == 1) ? aystride : axstride,
 802                 ldb = (bxstride == 1) ? bystride : bxstride;
 803
 804       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 805         {
 806           assert (gemm != NULL);
 807           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 808                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 809                 &ldc, 1, 1);
 810           return;
 811         }
 812     }
 813
 814   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 815     {
 816       /* This block of code implements a tuned matmul, derived from
 817          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 818
 819                Bo Kagstrom and Per Ling
 820                Department of Computing Science
 821                Umea University
 822                S-901 87 Umea, Sweden
 823
 824          from netlib.org, translated to C, and modified for matmul.m4.  */
 825
 826       const GFC_COMPLEX_8 *a, *b;
 827       GFC_COMPLEX_8 *c;
 828       const index_type m = xcount, n = ycount, k = count;
 829
 830       /* System generated locals */
 831       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 832                  i1, i2, i3, i4, i5, i6;
 833
 834       /* Local variables */
 835       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
 836                  f11, f12, f21, f22, f31, f32, f41, f42,
 837                  f13, f14, f23, f24, f33, f34, f43, f44;
 838       index_type i, j, l, ii, jj, ll;
 839       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 840
 841       a = abase;
 842       b = bbase;
 843       c = retarray->base_addr;
 844
 845       /* Parameter adjustments */
 846       c_dim1 = rystride;
 847       c_offset = 1 + c_dim1;
 848       c -= c_offset;
 849       a_dim1 = aystride;
 850       a_offset = 1 + a_dim1;
 851       a -= a_offset;
 852       b_dim1 = bystride;
 853       b_offset = 1 + b_dim1;
 854       b -= b_offset;
 855
 856       /* Early exit if possible */
 857       if (m == 0 || n == 0 || k == 0)
 858         return;
 859
 860       /* Empty c first.  */
 861       for (j=1; j<=n; j++)
 862         for (i=1; i<=m; i++)
 863           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
 864
 865       /* Start turning the crank. */
 866       i1 = n;
 867       for (jj = 1; jj <= i1; jj += 512)
 868         {
 869           /* Computing MIN */
 870           i2 = 512;
 871           i3 = n - jj + 1;
 872           jsec = min(i2,i3);
 873           ujsec = jsec - jsec % 4;
 874           i2 = k;
 875           for (ll = 1; ll <= i2; ll += 256)
 876             {
 877               /* Computing MIN */
 878               i3 = 256;
 879               i4 = k - ll + 1;
 880               lsec = min(i3,i4);
 881               ulsec = lsec - lsec % 2;
 882
 883               i3 = m;
 884               for (ii = 1; ii <= i3; ii += 256)
 885                 {
 886                   /* Computing MIN */
 887                   i4 = 256;
 888                   i5 = m - ii + 1;
 889                   isec = min(i4,i5);
 890                   uisec = isec - isec % 2;
 891                   i4 = ll + ulsec - 1;
 892                   for (l = ll; l <= i4; l += 2)
 893                     {
 894                       i5 = ii + uisec - 1;
 895                       for (i = ii; i <= i5; i += 2)
 896                         {
 897                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 898                                         a[i + l * a_dim1];
 899                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 900                                         a[i + (l + 1) * a_dim1];
 901                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 902                                         a[i + 1 + l * a_dim1];
 903                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 904                                         a[i + 1 + (l + 1) * a_dim1];
 905                         }
 906                       if (uisec < isec)
 907                         {
 908                           t1[l - ll + 1 + (isec << 8) - 257] =
 909                                     a[ii + isec - 1 + l * a_dim1];
 910                           t1[l - ll + 2 + (isec << 8) - 257] =
 911                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 912                         }
 913                     }
 914                   if (ulsec < lsec)
 915                     {
 916                       i4 = ii + isec - 1;
 917                       for (i = ii; i<= i4; ++i)
 918                         {
 919                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 920                                     a[i + (ll + lsec - 1) * a_dim1];
 921                         }
 922                     }
 923
 924                   uisec = isec - isec % 4;
 925                   i4 = jj + ujsec - 1;
 926                   for (j = jj; j <= i4; j += 4)
 927                     {
 928                       i5 = ii + uisec - 1;
 929                       for (i = ii; i <= i5; i += 4)
 930                         {
 931                           f11 = c[i + j * c_dim1];
 932                           f21 = c[i + 1 + j * c_dim1];
 933                           f12 = c[i + (j + 1) * c_dim1];
 934                           f22 = c[i + 1 + (j + 1) * c_dim1];
 935                           f13 = c[i + (j + 2) * c_dim1];
 936                           f23 = c[i + 1 + (j + 2) * c_dim1];
 937                           f14 = c[i + (j + 3) * c_dim1];
 938                           f24 = c[i + 1 + (j + 3) * c_dim1];
 939                           f31 = c[i + 2 + j * c_dim1];
 940                           f41 = c[i + 3 + j * c_dim1];
 941                           f32 = c[i + 2 + (j + 1) * c_dim1];
 942                           f42 = c[i + 3 + (j + 1) * c_dim1];
 943                           f33 = c[i + 2 + (j + 2) * c_dim1];
 944                           f43 = c[i + 3 + (j + 2) * c_dim1];
 945                           f34 = c[i + 2 + (j + 3) * c_dim1];
 946                           f44 = c[i + 3 + (j + 3) * c_dim1];
 947                           i6 = ll + lsec - 1;
 948                           for (l = ll; l <= i6; ++l)
 949                             {
 950                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 951                                       * b[l + j * b_dim1];
 952                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 953                                       * b[l + j * b_dim1];
 954                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 955                                       * b[l + (j + 1) * b_dim1];
 956                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 957                                       * b[l + (j + 1) * b_dim1];
 958                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 959                                       * b[l + (j + 2) * b_dim1];
 960                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 961                                       * b[l + (j + 2) * b_dim1];
 962                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 963                                       * b[l + (j + 3) * b_dim1];
 964                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 965                                       * b[l + (j + 3) * b_dim1];
 966                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 967                                       * b[l + j * b_dim1];
 968                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 969                                       * b[l + j * b_dim1];
 970                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 971                                       * b[l + (j + 1) * b_dim1];
 972                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 973                                       * b[l + (j + 1) * b_dim1];
 974                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 975                                       * b[l + (j + 2) * b_dim1];
 976                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 977                                       * b[l + (j + 2) * b_dim1];
 978                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 979                                       * b[l + (j + 3) * b_dim1];
 980                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 981                                       * b[l + (j + 3) * b_dim1];
 982                             }
 983                           c[i + j * c_dim1] = f11;
 984                           c[i + 1 + j * c_dim1] = f21;
 985                           c[i + (j + 1) * c_dim1] = f12;
 986                           c[i + 1 + (j + 1) * c_dim1] = f22;
 987                           c[i + (j + 2) * c_dim1] = f13;
 988                           c[i + 1 + (j + 2) * c_dim1] = f23;
 989                           c[i + (j + 3) * c_dim1] = f14;
 990                           c[i + 1 + (j + 3) * c_dim1] = f24;
 991                           c[i + 2 + j * c_dim1] = f31;
 992                           c[i + 3 + j * c_dim1] = f41;
 993                           c[i + 2 + (j + 1) * c_dim1] = f32;
 994                           c[i + 3 + (j + 1) * c_dim1] = f42;
 995                           c[i + 2 + (j + 2) * c_dim1] = f33;
 996                           c[i + 3 + (j + 2) * c_dim1] = f43;
 997                           c[i + 2 + (j + 3) * c_dim1] = f34;
 998                           c[i + 3 + (j + 3) * c_dim1] = f44;
 999                         }
1000                       if (uisec < isec)
1001                         {
1002                           i5 = ii + isec - 1;
1003                           for (i = ii + uisec; i <= i5; ++i)
1004                             {
1005                               f11 = c[i + j * c_dim1];
1006                               f12 = c[i + (j + 1) * c_dim1];
1007                               f13 = c[i + (j + 2) * c_dim1];
1008                               f14 = c[i + (j + 3) * c_dim1];
1009                               i6 = ll + lsec - 1;
1010                               for (l = ll; l <= i6; ++l)
1011                                 {
1012                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1013                                           257] * b[l + j * b_dim1];
1014                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1015                                           257] * b[l + (j + 1) * b_dim1];
1016                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1017                                           257] * b[l + (j + 2) * b_dim1];
1018                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1019                                           257] * b[l + (j + 3) * b_dim1];
1020                                 }
1021                               c[i + j * c_dim1] = f11;
1022                               c[i + (j + 1) * c_dim1] = f12;
1023                               c[i + (j + 2) * c_dim1] = f13;
1024                               c[i + (j + 3) * c_dim1] = f14;
1025                             }
1026                         }
1027                     }
1028                   if (ujsec < jsec)
1029                     {
1030                       i4 = jj + jsec - 1;
1031                       for (j = jj + ujsec; j <= i4; ++j)
1032                         {
1033                           i5 = ii + uisec - 1;
1034                           for (i = ii; i <= i5; i += 4)
1035                             {
1036                               f11 = c[i + j * c_dim1];
1037                               f21 = c[i + 1 + j * c_dim1];
1038                               f31 = c[i + 2 + j * c_dim1];
1039                               f41 = c[i + 3 + j * c_dim1];
1040                               i6 = ll + lsec - 1;
1041                               for (l = ll; l <= i6; ++l)
1042                                 {
1043                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1044                                           257] * b[l + j * b_dim1];
1045                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1046                                           257] * b[l + j * b_dim1];
1047                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1048                                           257] * b[l + j * b_dim1];
1049                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1050                                           257] * b[l + j * b_dim1];
1051                                 }
1052                               c[i + j * c_dim1] = f11;
1053                               c[i + 1 + j * c_dim1] = f21;
1054                               c[i + 2 + j * c_dim1] = f31;
1055                               c[i + 3 + j * c_dim1] = f41;
1056                             }
1057                           i5 = ii + isec - 1;
1058                           for (i = ii + uisec; i <= i5; ++i)
1059                             {
1060                               f11 = c[i + j * c_dim1];
1061                               i6 = ll + lsec - 1;
1062                               for (l = ll; l <= i6; ++l)
1063                                 {
1064                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065                                           257] * b[l + j * b_dim1];
1066                                 }
1067                               c[i + j * c_dim1] = f11;
1068                             }
1069                         }
1070                     }
1071                 }
1072             }
1073         }
1074       return;
1075     }
1076   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1077     {
1078       if (GFC_DESCRIPTOR_RANK (a) != 1)
1079         {
1080           const GFC_COMPLEX_8 *restrict abase_x;
1081           const GFC_COMPLEX_8 *restrict bbase_y;
1082           GFC_COMPLEX_8 *restrict dest_y;
1083           GFC_COMPLEX_8 s;
1084
1085           for (y = 0; y < ycount; y++)
1086             {
1087               bbase_y = &bbase[y*bystride];
1088               dest_y = &dest[y*rystride];
1089               for (x = 0; x < xcount; x++)
1090                 {
1091                   abase_x = &abase[x*axstride];
1092                   s = (GFC_COMPLEX_8) 0;
1093                   for (n = 0; n < count; n++)
1094                     s += abase_x[n] * bbase_y[n];
1095                   dest_y[x] = s;
1096                 }
1097             }
1098         }
1099       else
1100         {
1101           const GFC_COMPLEX_8 *restrict bbase_y;
1102           GFC_COMPLEX_8 s;
1103
1104           for (y = 0; y < ycount; y++)
1105             {
1106               bbase_y = &bbase[y*bystride];
1107               s = (GFC_COMPLEX_8) 0;
1108               for (n = 0; n < count; n++)
1109                 s += abase[n*axstride] * bbase_y[n];
1110               dest[y*rystride] = s;
1111             }
1112         }
1113     }
1114   else if (axstride < aystride)
1115     {
1116       for (y = 0; y < ycount; y++)
1117         for (x = 0; x < xcount; x++)
1118           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1119
1120       for (y = 0; y < ycount; y++)
1121         for (n = 0; n < count; n++)
1122           for (x = 0; x < xcount; x++)
1123             /* dest[x,y] += a[x,n] * b[n,y] */
1124             dest[x*rxstride + y*rystride] +=
1125                                         abase[x*axstride + n*aystride] *
1126                                         bbase[n*bxstride + y*bystride];
1127     }
1128   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1129     {
1130       const GFC_COMPLEX_8 *restrict bbase_y;
1131       GFC_COMPLEX_8 s;
1132
1133       for (y = 0; y < ycount; y++)
1134         {
1135           bbase_y = &bbase[y*bystride];
1136           s = (GFC_COMPLEX_8) 0;
1137           for (n = 0; n < count; n++)
1138             s += abase[n*axstride] * bbase_y[n*bxstride];
1139           dest[y*rxstride] = s;
1140         }
1141     }
1142   else
1143     {
1144       const GFC_COMPLEX_8 *restrict abase_x;
1145       const GFC_COMPLEX_8 *restrict bbase_y;
1146       GFC_COMPLEX_8 *restrict dest_y;
1147       GFC_COMPLEX_8 s;
1148
1149       for (y = 0; y < ycount; y++)
1150         {
1151           bbase_y = &bbase[y*bystride];
1152           dest_y = &dest[y*rystride];
1153           for (x = 0; x < xcount; x++)
1154             {
1155               abase_x = &abase[x*axstride];
1156               s = (GFC_COMPLEX_8) 0;
1157               for (n = 0; n < count; n++)
1158                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1159               dest_y[x*rxstride] = s;
1160             }
1161         }
1162     }
1163 }
1164 #undef POW3
1165 #undef min
1166 #undef max
1167
1168 #endif /* HAVE_AVX2 */
1169
1170 #ifdef HAVE_AVX512F
1171 static void
1172 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1173         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1174         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1175 static void
1176 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1177         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1178         int blas_limit, blas_call gemm)
1179 {
1180   const GFC_COMPLEX_8 * restrict abase;
1181   const GFC_COMPLEX_8 * restrict bbase;
1182   GFC_COMPLEX_8 * restrict dest;
1183
1184   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1185   index_type x, y, n, count, xcount, ycount;
1186
1187   assert (GFC_DESCRIPTOR_RANK (a) == 2
1188           || GFC_DESCRIPTOR_RANK (b) == 2);
1189
1190 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1191
1192    Either A or B (but not both) can be rank 1:
1193
1194    o One-dimensional argument A is implicitly treated as a row matrix
1195      dimensioned [1,count], so xcount=1.
1196
1197    o One-dimensional argument B is implicitly treated as a column matrix
1198      dimensioned [count, 1], so ycount=1.
1199 */
1200
1201   if (retarray->base_addr == NULL)
1202     {
1203       if (GFC_DESCRIPTOR_RANK (a) == 1)
1204         {
1205           GFC_DIMENSION_SET(retarray->dim[0], 0,
1206                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1207         }
1208       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1209         {
1210           GFC_DIMENSION_SET(retarray->dim[0], 0,
1211                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1212         }
1213       else
1214         {
1215           GFC_DIMENSION_SET(retarray->dim[0], 0,
1216                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1217
1218           GFC_DIMENSION_SET(retarray->dim[1], 0,
1219                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1220                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1221         }
1222
1223       retarray->base_addr
1224         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1225       retarray->offset = 0;
1226     }
1227   else if (unlikely (compile_options.bounds_check))
1228     {
1229       index_type ret_extent, arg_extent;
1230
1231       if (GFC_DESCRIPTOR_RANK (a) == 1)
1232         {
1233           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1234           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1235           if (arg_extent != ret_extent)
1236             runtime_error ("Incorrect extent in return array in"
1237                            " MATMUL intrinsic: is %ld, should be %ld",
1238                            (long int) ret_extent, (long int) arg_extent);
1239         }
1240       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1241         {
1242           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1243           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1244           if (arg_extent != ret_extent)
1245             runtime_error ("Incorrect extent in return array in"
1246                            " MATMUL intrinsic: is %ld, should be %ld",
1247                            (long int) ret_extent, (long int) arg_extent);
1248         }
1249       else
1250         {
1251           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1252           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1253           if (arg_extent != ret_extent)
1254             runtime_error ("Incorrect extent in return array in"
1255                            " MATMUL intrinsic for dimension 1:"
1256                            " is %ld, should be %ld",
1257                            (long int) ret_extent, (long int) arg_extent);
1258
1259           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1260           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1261           if (arg_extent != ret_extent)
1262             runtime_error ("Incorrect extent in return array in"
1263                            " MATMUL intrinsic for dimension 2:"
1264                            " is %ld, should be %ld",
1265                            (long int) ret_extent, (long int) arg_extent);
1266         }
1267     }
1268
1269
1270   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1271     {
1272       /* One-dimensional result may be addressed in the code below
1273          either as a row or a column matrix. We want both cases to
1274          work. */
1275       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1276     }
1277   else
1278     {
1279       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1280       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1281     }
1282
1283
1284   if (GFC_DESCRIPTOR_RANK (a) == 1)
1285     {
1286       /* Treat it as a a row matrix A[1,count]. */
1287       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1288       aystride = 1;
1289
1290       xcount = 1;
1291       count = GFC_DESCRIPTOR_EXTENT(a,0);
1292     }
1293   else
1294     {
1295       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1296       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1297
1298       count = GFC_DESCRIPTOR_EXTENT(a,1);
1299       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1300     }
1301
1302   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1303     {
1304       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1305         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1306     }
1307
1308   if (GFC_DESCRIPTOR_RANK (b) == 1)
1309     {
1310       /* Treat it as a column matrix B[count,1] */
1311       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1312
1313       /* bystride should never be used for 1-dimensional b.
1314          in case it is we want it to cause a segfault, rather than
1315          an incorrect result. */
1316       bystride = 0xDEADBEEF;
1317       ycount = 1;
1318     }
1319   else
1320     {
1321       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1322       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1323       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1324     }
1325
1326   abase = a->base_addr;
1327   bbase = b->base_addr;
1328   dest = retarray->base_addr;
1329
1330   /* Now that everything is set up, we perform the multiplication
1331      itself.  */
1332
1333 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1334 #define min(a,b) ((a) <= (b) ? (a) : (b))
1335 #define max(a,b) ((a) >= (b) ? (a) : (b))
1336
1337   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1338       && (bxstride == 1 || bystride == 1)
1339       && (((float) xcount) * ((float) ycount) * ((float) count)
1340           > POW3(blas_limit)))
1341     {
1342       const int m = xcount, n = ycount, k = count, ldc = rystride;
1343       const GFC_COMPLEX_8 one = 1, zero = 0;
1344       const int lda = (axstride == 1) ? aystride : axstride,
1345                 ldb = (bxstride == 1) ? bystride : bxstride;
1346
1347       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1348         {
1349           assert (gemm != NULL);
1350           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1351                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1352                 &ldc, 1, 1);
1353           return;
1354         }
1355     }
1356
1357   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1358     {
1359       /* This block of code implements a tuned matmul, derived from
1360          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1361
1362                Bo Kagstrom and Per Ling
1363                Department of Computing Science
1364                Umea University
1365                S-901 87 Umea, Sweden
1366
1367          from netlib.org, translated to C, and modified for matmul.m4.  */
1368
1369       const GFC_COMPLEX_8 *a, *b;
1370       GFC_COMPLEX_8 *c;
1371       const index_type m = xcount, n = ycount, k = count;
1372
1373       /* System generated locals */
1374       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1375                  i1, i2, i3, i4, i5, i6;
1376
1377       /* Local variables */
1378       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1379                  f11, f12, f21, f22, f31, f32, f41, f42,
1380                  f13, f14, f23, f24, f33, f34, f43, f44;
1381       index_type i, j, l, ii, jj, ll;
1382       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1383
1384       a = abase;
1385       b = bbase;
1386       c = retarray->base_addr;
1387
1388       /* Parameter adjustments */
1389       c_dim1 = rystride;
1390       c_offset = 1 + c_dim1;
1391       c -= c_offset;
1392       a_dim1 = aystride;
1393       a_offset = 1 + a_dim1;
1394       a -= a_offset;
1395       b_dim1 = bystride;
1396       b_offset = 1 + b_dim1;
1397       b -= b_offset;
1398
1399       /* Early exit if possible */
1400       if (m == 0 || n == 0 || k == 0)
1401         return;
1402
1403       /* Empty c first.  */
1404       for (j=1; j<=n; j++)
1405         for (i=1; i<=m; i++)
1406           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1407
1408       /* Start turning the crank. */
1409       i1 = n;
1410       for (jj = 1; jj <= i1; jj += 512)
1411         {
1412           /* Computing MIN */
1413           i2 = 512;
1414           i3 = n - jj + 1;
1415           jsec = min(i2,i3);
1416           ujsec = jsec - jsec % 4;
1417           i2 = k;
1418           for (ll = 1; ll <= i2; ll += 256)
1419             {
1420               /* Computing MIN */
1421               i3 = 256;
1422               i4 = k - ll + 1;
1423               lsec = min(i3,i4);
1424               ulsec = lsec - lsec % 2;
1425
1426               i3 = m;
1427               for (ii = 1; ii <= i3; ii += 256)
1428                 {
1429                   /* Computing MIN */
1430                   i4 = 256;
1431                   i5 = m - ii + 1;
1432                   isec = min(i4,i5);
1433                   uisec = isec - isec % 2;
1434                   i4 = ll + ulsec - 1;
1435                   for (l = ll; l <= i4; l += 2)
1436                     {
1437                       i5 = ii + uisec - 1;
1438                       for (i = ii; i <= i5; i += 2)
1439                         {
1440                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1441                                         a[i + l * a_dim1];
1442                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1443                                         a[i + (l + 1) * a_dim1];
1444                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1445                                         a[i + 1 + l * a_dim1];
1446                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1447                                         a[i + 1 + (l + 1) * a_dim1];
1448                         }
1449                       if (uisec < isec)
1450                         {
1451                           t1[l - ll + 1 + (isec << 8) - 257] =
1452                                     a[ii + isec - 1 + l * a_dim1];
1453                           t1[l - ll + 2 + (isec << 8) - 257] =
1454                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1455                         }
1456                     }
1457                   if (ulsec < lsec)
1458                     {
1459                       i4 = ii + isec - 1;
1460                       for (i = ii; i<= i4; ++i)
1461                         {
1462                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1463                                     a[i + (ll + lsec - 1) * a_dim1];
1464                         }
1465                     }
1466
1467                   uisec = isec - isec % 4;
1468                   i4 = jj + ujsec - 1;
1469                   for (j = jj; j <= i4; j += 4)
1470                     {
1471                       i5 = ii + uisec - 1;
1472                       for (i = ii; i <= i5; i += 4)
1473                         {
1474                           f11 = c[i + j * c_dim1];
1475                           f21 = c[i + 1 + j * c_dim1];
1476                           f12 = c[i + (j + 1) * c_dim1];
1477                           f22 = c[i + 1 + (j + 1) * c_dim1];
1478                           f13 = c[i + (j + 2) * c_dim1];
1479                           f23 = c[i + 1 + (j + 2) * c_dim1];
1480                           f14 = c[i + (j + 3) * c_dim1];
1481                           f24 = c[i + 1 + (j + 3) * c_dim1];
1482                           f31 = c[i + 2 + j * c_dim1];
1483                           f41 = c[i + 3 + j * c_dim1];
1484                           f32 = c[i + 2 + (j + 1) * c_dim1];
1485                           f42 = c[i + 3 + (j + 1) * c_dim1];
1486                           f33 = c[i + 2 + (j + 2) * c_dim1];
1487                           f43 = c[i + 3 + (j + 2) * c_dim1];
1488                           f34 = c[i + 2 + (j + 3) * c_dim1];
1489                           f44 = c[i + 3 + (j + 3) * c_dim1];
1490                           i6 = ll + lsec - 1;
1491                           for (l = ll; l <= i6; ++l)
1492                             {
1493                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1494                                       * b[l + j * b_dim1];
1495                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1496                                       * b[l + j * b_dim1];
1497                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1498                                       * b[l + (j + 1) * b_dim1];
1499                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1500                                       * b[l + (j + 1) * b_dim1];
1501                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1502                                       * b[l + (j + 2) * b_dim1];
1503                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1504                                       * b[l + (j + 2) * b_dim1];
1505                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1506                                       * b[l + (j + 3) * b_dim1];
1507                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1508                                       * b[l + (j + 3) * b_dim1];
1509                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1510                                       * b[l + j * b_dim1];
1511                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1512                                       * b[l + j * b_dim1];
1513                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1514                                       * b[l + (j + 1) * b_dim1];
1515                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1516                                       * b[l + (j + 1) * b_dim1];
1517                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1518                                       * b[l + (j + 2) * b_dim1];
1519                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1520                                       * b[l + (j + 2) * b_dim1];
1521                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1522                                       * b[l + (j + 3) * b_dim1];
1523                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1524                                       * b[l + (j + 3) * b_dim1];
1525                             }
1526                           c[i + j * c_dim1] = f11;
1527                           c[i + 1 + j * c_dim1] = f21;
1528                           c[i + (j + 1) * c_dim1] = f12;
1529                           c[i + 1 + (j + 1) * c_dim1] = f22;
1530                           c[i + (j + 2) * c_dim1] = f13;
1531                           c[i + 1 + (j + 2) * c_dim1] = f23;
1532                           c[i + (j + 3) * c_dim1] = f14;
1533                           c[i + 1 + (j + 3) * c_dim1] = f24;
1534                           c[i + 2 + j * c_dim1] = f31;
1535                           c[i + 3 + j * c_dim1] = f41;
1536                           c[i + 2 + (j + 1) * c_dim1] = f32;
1537                           c[i + 3 + (j + 1) * c_dim1] = f42;
1538                           c[i + 2 + (j + 2) * c_dim1] = f33;
1539                           c[i + 3 + (j + 2) * c_dim1] = f43;
1540                           c[i + 2 + (j + 3) * c_dim1] = f34;
1541                           c[i + 3 + (j + 3) * c_dim1] = f44;
1542                         }
1543                       if (uisec < isec)
1544                         {
1545                           i5 = ii + isec - 1;
1546                           for (i = ii + uisec; i <= i5; ++i)
1547                             {
1548                               f11 = c[i + j * c_dim1];
1549                               f12 = c[i + (j + 1) * c_dim1];
1550                               f13 = c[i + (j + 2) * c_dim1];
1551                               f14 = c[i + (j + 3) * c_dim1];
1552                               i6 = ll + lsec - 1;
1553                               for (l = ll; l <= i6; ++l)
1554                                 {
1555                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1556                                           257] * b[l + j * b_dim1];
1557                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1558                                           257] * b[l + (j + 1) * b_dim1];
1559                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1560                                           257] * b[l + (j + 2) * b_dim1];
1561                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1562                                           257] * b[l + (j + 3) * b_dim1];
1563                                 }
1564                               c[i + j * c_dim1] = f11;
1565                               c[i + (j + 1) * c_dim1] = f12;
1566                               c[i + (j + 2) * c_dim1] = f13;
1567                               c[i + (j + 3) * c_dim1] = f14;
1568                             }
1569                         }
1570                     }
1571                   if (ujsec < jsec)
1572                     {
1573                       i4 = jj + jsec - 1;
1574                       for (j = jj + ujsec; j <= i4; ++j)
1575                         {
1576                           i5 = ii + uisec - 1;
1577                           for (i = ii; i <= i5; i += 4)
1578                             {
1579                               f11 = c[i + j * c_dim1];
1580                               f21 = c[i + 1 + j * c_dim1];
1581                               f31 = c[i + 2 + j * c_dim1];
1582                               f41 = c[i + 3 + j * c_dim1];
1583                               i6 = ll + lsec - 1;
1584                               for (l = ll; l <= i6; ++l)
1585                                 {
1586                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1587                                           257] * b[l + j * b_dim1];
1588                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1589                                           257] * b[l + j * b_dim1];
1590                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1591                                           257] * b[l + j * b_dim1];
1592                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1593                                           257] * b[l + j * b_dim1];
1594                                 }
1595                               c[i + j * c_dim1] = f11;
1596                               c[i + 1 + j * c_dim1] = f21;
1597                               c[i + 2 + j * c_dim1] = f31;
1598                               c[i + 3 + j * c_dim1] = f41;
1599                             }
1600                           i5 = ii + isec - 1;
1601                           for (i = ii + uisec; i <= i5; ++i)
1602                             {
1603                               f11 = c[i + j * c_dim1];
1604                               i6 = ll + lsec - 1;
1605                               for (l = ll; l <= i6; ++l)
1606                                 {
1607                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1608                                           257] * b[l + j * b_dim1];
1609                                 }
1610                               c[i + j * c_dim1] = f11;
1611                             }
1612                         }
1613                     }
1614                 }
1615             }
1616         }
1617       return;
1618     }
1619   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1620     {
1621       if (GFC_DESCRIPTOR_RANK (a) != 1)
1622         {
1623           const GFC_COMPLEX_8 *restrict abase_x;
1624           const GFC_COMPLEX_8 *restrict bbase_y;
1625           GFC_COMPLEX_8 *restrict dest_y;
1626           GFC_COMPLEX_8 s;
1627
1628           for (y = 0; y < ycount; y++)
1629             {
1630               bbase_y = &bbase[y*bystride];
1631               dest_y = &dest[y*rystride];
1632               for (x = 0; x < xcount; x++)
1633                 {
1634                   abase_x = &abase[x*axstride];
1635                   s = (GFC_COMPLEX_8) 0;
1636                   for (n = 0; n < count; n++)
1637                     s += abase_x[n] * bbase_y[n];
1638                   dest_y[x] = s;
1639                 }
1640             }
1641         }
1642       else
1643         {
1644           const GFC_COMPLEX_8 *restrict bbase_y;
1645           GFC_COMPLEX_8 s;
1646
1647           for (y = 0; y < ycount; y++)
1648             {
1649               bbase_y = &bbase[y*bystride];
1650               s = (GFC_COMPLEX_8) 0;
1651               for (n = 0; n < count; n++)
1652                 s += abase[n*axstride] * bbase_y[n];
1653               dest[y*rystride] = s;
1654             }
1655         }
1656     }
1657   else if (axstride < aystride)
1658     {
1659       for (y = 0; y < ycount; y++)
1660         for (x = 0; x < xcount; x++)
1661           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1662
1663       for (y = 0; y < ycount; y++)
1664         for (n = 0; n < count; n++)
1665           for (x = 0; x < xcount; x++)
1666             /* dest[x,y] += a[x,n] * b[n,y] */
1667             dest[x*rxstride + y*rystride] +=
1668                                         abase[x*axstride + n*aystride] *
1669                                         bbase[n*bxstride + y*bystride];
1670     }
1671   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1672     {
1673       const GFC_COMPLEX_8 *restrict bbase_y;
1674       GFC_COMPLEX_8 s;
1675
1676       for (y = 0; y < ycount; y++)
1677         {
1678           bbase_y = &bbase[y*bystride];
1679           s = (GFC_COMPLEX_8) 0;
1680           for (n = 0; n < count; n++)
1681             s += abase[n*axstride] * bbase_y[n*bxstride];
1682           dest[y*rxstride] = s;
1683         }
1684     }
1685   else
1686     {
1687       const GFC_COMPLEX_8 *restrict abase_x;
1688       const GFC_COMPLEX_8 *restrict bbase_y;
1689       GFC_COMPLEX_8 *restrict dest_y;
1690       GFC_COMPLEX_8 s;
1691
1692       for (y = 0; y < ycount; y++)
1693         {
1694           bbase_y = &bbase[y*bystride];
1695           dest_y = &dest[y*rystride];
1696           for (x = 0; x < xcount; x++)
1697             {
1698               abase_x = &abase[x*axstride];
1699               s = (GFC_COMPLEX_8) 0;
1700               for (n = 0; n < count; n++)
1701                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1702               dest_y[x*rxstride] = s;
1703             }
1704         }
1705     }
1706 }
1707 #undef POW3
1708 #undef min
1709 #undef max
1710
1711 #endif  /* HAVE_AVX512F */
1712
1713 /* Function to fall back to if there is no special processor-specific version.  */
1714 static void
1715 matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
1716         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1717         int blas_limit, blas_call gemm)
1718 {
1719   const GFC_COMPLEX_8 * restrict abase;
1720   const GFC_COMPLEX_8 * restrict bbase;
1721   GFC_COMPLEX_8 * restrict dest;
1722
1723   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1724   index_type x, y, n, count, xcount, ycount;
1725
1726   assert (GFC_DESCRIPTOR_RANK (a) == 2
1727           || GFC_DESCRIPTOR_RANK (b) == 2);
1728
1729 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1730
1731    Either A or B (but not both) can be rank 1:
1732
1733    o One-dimensional argument A is implicitly treated as a row matrix
1734      dimensioned [1,count], so xcount=1.
1735
1736    o One-dimensional argument B is implicitly treated as a column matrix
1737      dimensioned [count, 1], so ycount=1.
1738 */
1739
1740   if (retarray->base_addr == NULL)
1741     {
1742       if (GFC_DESCRIPTOR_RANK (a) == 1)
1743         {
1744           GFC_DIMENSION_SET(retarray->dim[0], 0,
1745                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1746         }
1747       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1748         {
1749           GFC_DIMENSION_SET(retarray->dim[0], 0,
1750                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1751         }
1752       else
1753         {
1754           GFC_DIMENSION_SET(retarray->dim[0], 0,
1755                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1756
1757           GFC_DIMENSION_SET(retarray->dim[1], 0,
1758                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1759                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1760         }
1761
1762       retarray->base_addr
1763         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1764       retarray->offset = 0;
1765     }
1766   else if (unlikely (compile_options.bounds_check))
1767     {
1768       index_type ret_extent, arg_extent;
1769
1770       if (GFC_DESCRIPTOR_RANK (a) == 1)
1771         {
1772           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1773           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1774           if (arg_extent != ret_extent)
1775             runtime_error ("Incorrect extent in return array in"
1776                            " MATMUL intrinsic: is %ld, should be %ld",
1777                            (long int) ret_extent, (long int) arg_extent);
1778         }
1779       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1780         {
1781           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1782           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1783           if (arg_extent != ret_extent)
1784             runtime_error ("Incorrect extent in return array in"
1785                            " MATMUL intrinsic: is %ld, should be %ld",
1786                            (long int) ret_extent, (long int) arg_extent);
1787         }
1788       else
1789         {
1790           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1791           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1792           if (arg_extent != ret_extent)
1793             runtime_error ("Incorrect extent in return array in"
1794                            " MATMUL intrinsic for dimension 1:"
1795                            " is %ld, should be %ld",
1796                            (long int) ret_extent, (long int) arg_extent);
1797
1798           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1799           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1800           if (arg_extent != ret_extent)
1801             runtime_error ("Incorrect extent in return array in"
1802                            " MATMUL intrinsic for dimension 2:"
1803                            " is %ld, should be %ld",
1804                            (long int) ret_extent, (long int) arg_extent);
1805         }
1806     }
1807
1808
1809   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1810     {
1811       /* One-dimensional result may be addressed in the code below
1812          either as a row or a column matrix. We want both cases to
1813          work. */
1814       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1815     }
1816   else
1817     {
1818       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1819       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1820     }
1821
1822
1823   if (GFC_DESCRIPTOR_RANK (a) == 1)
1824     {
1825       /* Treat it as a a row matrix A[1,count]. */
1826       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1827       aystride = 1;
1828
1829       xcount = 1;
1830       count = GFC_DESCRIPTOR_EXTENT(a,0);
1831     }
1832   else
1833     {
1834       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1835       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1836
1837       count = GFC_DESCRIPTOR_EXTENT(a,1);
1838       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1839     }
1840
1841   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1842     {
1843       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1844         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1845     }
1846
1847   if (GFC_DESCRIPTOR_RANK (b) == 1)
1848     {
1849       /* Treat it as a column matrix B[count,1] */
1850       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1851
1852       /* bystride should never be used for 1-dimensional b.
1853          in case it is we want it to cause a segfault, rather than
1854          an incorrect result. */
1855       bystride = 0xDEADBEEF;
1856       ycount = 1;
1857     }
1858   else
1859     {
1860       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1861       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1862       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1863     }
1864
1865   abase = a->base_addr;
1866   bbase = b->base_addr;
1867   dest = retarray->base_addr;
1868
1869   /* Now that everything is set up, we perform the multiplication
1870      itself.  */
1871
1872 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1873 #define min(a,b) ((a) <= (b) ? (a) : (b))
1874 #define max(a,b) ((a) >= (b) ? (a) : (b))
1875
1876   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1877       && (bxstride == 1 || bystride == 1)
1878       && (((float) xcount) * ((float) ycount) * ((float) count)
1879           > POW3(blas_limit)))
1880     {
1881       const int m = xcount, n = ycount, k = count, ldc = rystride;
1882       const GFC_COMPLEX_8 one = 1, zero = 0;
1883       const int lda = (axstride == 1) ? aystride : axstride,
1884                 ldb = (bxstride == 1) ? bystride : bxstride;
1885
1886       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1887         {
1888           assert (gemm != NULL);
1889           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1890                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1891                 &ldc, 1, 1);
1892           return;
1893         }
1894     }
1895
1896   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1897     {
1898       /* This block of code implements a tuned matmul, derived from
1899          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1900
1901                Bo Kagstrom and Per Ling
1902                Department of Computing Science
1903                Umea University
1904                S-901 87 Umea, Sweden
1905
1906          from netlib.org, translated to C, and modified for matmul.m4.  */
1907
1908       const GFC_COMPLEX_8 *a, *b;
1909       GFC_COMPLEX_8 *c;
1910       const index_type m = xcount, n = ycount, k = count;
1911
1912       /* System generated locals */
1913       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1914                  i1, i2, i3, i4, i5, i6;
1915
1916       /* Local variables */
1917       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1918                  f11, f12, f21, f22, f31, f32, f41, f42,
1919                  f13, f14, f23, f24, f33, f34, f43, f44;
1920       index_type i, j, l, ii, jj, ll;
1921       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1922
1923       a = abase;
1924       b = bbase;
1925       c = retarray->base_addr;
1926
1927       /* Parameter adjustments */
1928       c_dim1 = rystride;
1929       c_offset = 1 + c_dim1;
1930       c -= c_offset;
1931       a_dim1 = aystride;
1932       a_offset = 1 + a_dim1;
1933       a -= a_offset;
1934       b_dim1 = bystride;
1935       b_offset = 1 + b_dim1;
1936       b -= b_offset;
1937
1938       /* Early exit if possible */
1939       if (m == 0 || n == 0 || k == 0)
1940         return;
1941
1942       /* Empty c first.  */
1943       for (j=1; j<=n; j++)
1944         for (i=1; i<=m; i++)
1945           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1946
1947       /* Start turning the crank. */
1948       i1 = n;
1949       for (jj = 1; jj <= i1; jj += 512)
1950         {
1951           /* Computing MIN */
1952           i2 = 512;
1953           i3 = n - jj + 1;
1954           jsec = min(i2,i3);
1955           ujsec = jsec - jsec % 4;
1956           i2 = k;
1957           for (ll = 1; ll <= i2; ll += 256)
1958             {
1959               /* Computing MIN */
1960               i3 = 256;
1961               i4 = k - ll + 1;
1962               lsec = min(i3,i4);
1963               ulsec = lsec - lsec % 2;
1964
1965               i3 = m;
1966               for (ii = 1; ii <= i3; ii += 256)
1967                 {
1968                   /* Computing MIN */
1969                   i4 = 256;
1970                   i5 = m - ii + 1;
1971                   isec = min(i4,i5);
1972                   uisec = isec - isec % 2;
1973                   i4 = ll + ulsec - 1;
1974                   for (l = ll; l <= i4; l += 2)
1975                     {
1976                       i5 = ii + uisec - 1;
1977                       for (i = ii; i <= i5; i += 2)
1978                         {
1979                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1980                                         a[i + l * a_dim1];
1981                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1982                                         a[i + (l + 1) * a_dim1];
1983                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1984                                         a[i + 1 + l * a_dim1];
1985                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1986                                         a[i + 1 + (l + 1) * a_dim1];
1987                         }
1988                       if (uisec < isec)
1989                         {
1990                           t1[l - ll + 1 + (isec << 8) - 257] =
1991                                     a[ii + isec - 1 + l * a_dim1];
1992                           t1[l - ll + 2 + (isec << 8) - 257] =
1993                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1994                         }
1995                     }
1996                   if (ulsec < lsec)
1997                     {
1998                       i4 = ii + isec - 1;
1999                       for (i = ii; i<= i4; ++i)
2000                         {
2001                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2002                                     a[i + (ll + lsec - 1) * a_dim1];
2003                         }
2004                     }
2005
2006                   uisec = isec - isec % 4;
2007                   i4 = jj + ujsec - 1;
2008                   for (j = jj; j <= i4; j += 4)
2009                     {
2010                       i5 = ii + uisec - 1;
2011                       for (i = ii; i <= i5; i += 4)
2012                         {
2013                           f11 = c[i + j * c_dim1];
2014                           f21 = c[i + 1 + j * c_dim1];
2015                           f12 = c[i + (j + 1) * c_dim1];
2016                           f22 = c[i + 1 + (j + 1) * c_dim1];
2017                           f13 = c[i + (j + 2) * c_dim1];
2018                           f23 = c[i + 1 + (j + 2) * c_dim1];
2019                           f14 = c[i + (j + 3) * c_dim1];
2020                           f24 = c[i + 1 + (j + 3) * c_dim1];
2021                           f31 = c[i + 2 + j * c_dim1];
2022                           f41 = c[i + 3 + j * c_dim1];
2023                           f32 = c[i + 2 + (j + 1) * c_dim1];
2024                           f42 = c[i + 3 + (j + 1) * c_dim1];
2025                           f33 = c[i + 2 + (j + 2) * c_dim1];
2026                           f43 = c[i + 3 + (j + 2) * c_dim1];
2027                           f34 = c[i + 2 + (j + 3) * c_dim1];
2028                           f44 = c[i + 3 + (j + 3) * c_dim1];
2029                           i6 = ll + lsec - 1;
2030                           for (l = ll; l <= i6; ++l)
2031                             {
2032                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2033                                       * b[l + j * b_dim1];
2034                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2035                                       * b[l + j * b_dim1];
2036                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2037                                       * b[l + (j + 1) * b_dim1];
2038                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2039                                       * b[l + (j + 1) * b_dim1];
2040                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2041                                       * b[l + (j + 2) * b_dim1];
2042                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2043                                       * b[l + (j + 2) * b_dim1];
2044                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2045                                       * b[l + (j + 3) * b_dim1];
2046                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2047                                       * b[l + (j + 3) * b_dim1];
2048                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2049                                       * b[l + j * b_dim1];
2050                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2051                                       * b[l + j * b_dim1];
2052                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2053                                       * b[l + (j + 1) * b_dim1];
2054                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2055                                       * b[l + (j + 1) * b_dim1];
2056                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2057                                       * b[l + (j + 2) * b_dim1];
2058                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2059                                       * b[l + (j + 2) * b_dim1];
2060                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2061                                       * b[l + (j + 3) * b_dim1];
2062                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2063                                       * b[l + (j + 3) * b_dim1];
2064                             }
2065                           c[i + j * c_dim1] = f11;
2066                           c[i + 1 + j * c_dim1] = f21;
2067                           c[i + (j + 1) * c_dim1] = f12;
2068                           c[i + 1 + (j + 1) * c_dim1] = f22;
2069                           c[i + (j + 2) * c_dim1] = f13;
2070                           c[i + 1 + (j + 2) * c_dim1] = f23;
2071                           c[i + (j + 3) * c_dim1] = f14;
2072                           c[i + 1 + (j + 3) * c_dim1] = f24;
2073                           c[i + 2 + j * c_dim1] = f31;
2074                           c[i + 3 + j * c_dim1] = f41;
2075                           c[i + 2 + (j + 1) * c_dim1] = f32;
2076                           c[i + 3 + (j + 1) * c_dim1] = f42;
2077                           c[i + 2 + (j + 2) * c_dim1] = f33;
2078                           c[i + 3 + (j + 2) * c_dim1] = f43;
2079                           c[i + 2 + (j + 3) * c_dim1] = f34;
2080                           c[i + 3 + (j + 3) * c_dim1] = f44;
2081                         }
2082                       if (uisec < isec)
2083                         {
2084                           i5 = ii + isec - 1;
2085                           for (i = ii + uisec; i <= i5; ++i)
2086                             {
2087                               f11 = c[i + j * c_dim1];
2088                               f12 = c[i + (j + 1) * c_dim1];
2089                               f13 = c[i + (j + 2) * c_dim1];
2090                               f14 = c[i + (j + 3) * c_dim1];
2091                               i6 = ll + lsec - 1;
2092                               for (l = ll; l <= i6; ++l)
2093                                 {
2094                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2095                                           257] * b[l + j * b_dim1];
2096                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2097                                           257] * b[l + (j + 1) * b_dim1];
2098                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2099                                           257] * b[l + (j + 2) * b_dim1];
2100                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2101                                           257] * b[l + (j + 3) * b_dim1];
2102                                 }
2103                               c[i + j * c_dim1] = f11;
2104                               c[i + (j + 1) * c_dim1] = f12;
2105                               c[i + (j + 2) * c_dim1] = f13;
2106                               c[i + (j + 3) * c_dim1] = f14;
2107                             }
2108                         }
2109                     }
2110                   if (ujsec < jsec)
2111                     {
2112                       i4 = jj + jsec - 1;
2113                       for (j = jj + ujsec; j <= i4; ++j)
2114                         {
2115                           i5 = ii + uisec - 1;
2116                           for (i = ii; i <= i5; i += 4)
2117                             {
2118                               f11 = c[i + j * c_dim1];
2119                               f21 = c[i + 1 + j * c_dim1];
2120                               f31 = c[i + 2 + j * c_dim1];
2121                               f41 = c[i + 3 + j * c_dim1];
2122                               i6 = ll + lsec - 1;
2123                               for (l = ll; l <= i6; ++l)
2124                                 {
2125                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2126                                           257] * b[l + j * b_dim1];
2127                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2128                                           257] * b[l + j * b_dim1];
2129                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2130                                           257] * b[l + j * b_dim1];
2131                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2132                                           257] * b[l + j * b_dim1];
2133                                 }
2134                               c[i + j * c_dim1] = f11;
2135                               c[i + 1 + j * c_dim1] = f21;
2136                               c[i + 2 + j * c_dim1] = f31;
2137                               c[i + 3 + j * c_dim1] = f41;
2138                             }
2139                           i5 = ii + isec - 1;
2140                           for (i = ii + uisec; i <= i5; ++i)
2141                             {
2142                               f11 = c[i + j * c_dim1];
2143                               i6 = ll + lsec - 1;
2144                               for (l = ll; l <= i6; ++l)
2145                                 {
2146                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2147                                           257] * b[l + j * b_dim1];
2148                                 }
2149                               c[i + j * c_dim1] = f11;
2150                             }
2151                         }
2152                     }
2153                 }
2154             }
2155         }
2156       return;
2157     }
2158   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2159     {
2160       if (GFC_DESCRIPTOR_RANK (a) != 1)
2161         {
2162           const GFC_COMPLEX_8 *restrict abase_x;
2163           const GFC_COMPLEX_8 *restrict bbase_y;
2164           GFC_COMPLEX_8 *restrict dest_y;
2165           GFC_COMPLEX_8 s;
2166
2167           for (y = 0; y < ycount; y++)
2168             {
2169               bbase_y = &bbase[y*bystride];
2170               dest_y = &dest[y*rystride];
2171               for (x = 0; x < xcount; x++)
2172                 {
2173                   abase_x = &abase[x*axstride];
2174                   s = (GFC_COMPLEX_8) 0;
2175                   for (n = 0; n < count; n++)
2176                     s += abase_x[n] * bbase_y[n];
2177                   dest_y[x] = s;
2178                 }
2179             }
2180         }
2181       else
2182         {
2183           const GFC_COMPLEX_8 *restrict bbase_y;
2184           GFC_COMPLEX_8 s;
2185
2186           for (y = 0; y < ycount; y++)
2187             {
2188               bbase_y = &bbase[y*bystride];
2189               s = (GFC_COMPLEX_8) 0;
2190               for (n = 0; n < count; n++)
2191                 s += abase[n*axstride] * bbase_y[n];
2192               dest[y*rystride] = s;
2193             }
2194         }
2195     }
2196   else if (axstride < aystride)
2197     {
2198       for (y = 0; y < ycount; y++)
2199         for (x = 0; x < xcount; x++)
2200           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2201
2202       for (y = 0; y < ycount; y++)
2203         for (n = 0; n < count; n++)
2204           for (x = 0; x < xcount; x++)
2205             /* dest[x,y] += a[x,n] * b[n,y] */
2206             dest[x*rxstride + y*rystride] +=
2207                                         abase[x*axstride + n*aystride] *
2208                                         bbase[n*bxstride + y*bystride];
2209     }
2210   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2211     {
2212       const GFC_COMPLEX_8 *restrict bbase_y;
2213       GFC_COMPLEX_8 s;
2214
2215       for (y = 0; y < ycount; y++)
2216         {
2217           bbase_y = &bbase[y*bystride];
2218           s = (GFC_COMPLEX_8) 0;
2219           for (n = 0; n < count; n++)
2220             s += abase[n*axstride] * bbase_y[n*bxstride];
2221           dest[y*rxstride] = s;
2222         }
2223     }
2224   else
2225     {
2226       const GFC_COMPLEX_8 *restrict abase_x;
2227       const GFC_COMPLEX_8 *restrict bbase_y;
2228       GFC_COMPLEX_8 *restrict dest_y;
2229       GFC_COMPLEX_8 s;
2230
2231       for (y = 0; y < ycount; y++)
2232         {
2233           bbase_y = &bbase[y*bystride];
2234           dest_y = &dest[y*rystride];
2235           for (x = 0; x < xcount; x++)
2236             {
2237               abase_x = &abase[x*axstride];
2238               s = (GFC_COMPLEX_8) 0;
2239               for (n = 0; n < count; n++)
2240                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2241               dest_y[x*rxstride] = s;
2242             }
2243         }
2244     }
2245 }
2246 #undef POW3
2247 #undef min
2248 #undef max
2249
2250
2251 /* Compiling main function, with selection code for the processor.  */
2252
2253 /* Currently, this is i386 only.  Adjust for other architectures.  */
2254
2255 #include <config/i386/cpuinfo.h>
2256 void matmul_c8 (gfc_array_c8 * const restrict retarray,
2257         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2258         int blas_limit, blas_call gemm)
2259 {
2260   static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
2261         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2262         int blas_limit, blas_call gemm) = NULL;
2263
2264   if (matmul_p == NULL)
2265     {
2266       matmul_p = matmul_c8_vanilla;
2267       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2268         {
2269           /* Run down the available processors in order of preference.  */
2270 #ifdef HAVE_AVX512F
2271           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2272             {
2273               matmul_p = matmul_c8_avx512f;
2274               goto tailcall;
2275             }
2276
2277 #endif  /* HAVE_AVX512F */
2278
2279 #ifdef HAVE_AVX2
2280           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2281             {
2282               matmul_p = matmul_c8_avx2;
2283               goto tailcall;
2284             }
2285
2286 #endif
2287
2288 #ifdef HAVE_AVX
2289           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2290             {
2291               matmul_p = matmul_c8_avx;
2292               goto tailcall;
2293             }
2294 #endif  /* HAVE_AVX */
2295         }
2296    }
2297
2298 tailcall:
2299    (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2300 }
2301
2302 #else  /* Just the vanilla function.  */
2303
2304 void
2305 matmul_c8 (gfc_array_c8 * const restrict retarray,
2306         gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2307         int blas_limit, blas_call gemm)
2308 {
2309   const GFC_COMPLEX_8 * restrict abase;
2310   const GFC_COMPLEX_8 * restrict bbase;
2311   GFC_COMPLEX_8 * restrict dest;
2312
2313   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2314   index_type x, y, n, count, xcount, ycount;
2315
2316   assert (GFC_DESCRIPTOR_RANK (a) == 2
2317           || GFC_DESCRIPTOR_RANK (b) == 2);
2318
2319 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2320
2321    Either A or B (but not both) can be rank 1:
2322
2323    o One-dimensional argument A is implicitly treated as a row matrix
2324      dimensioned [1,count], so xcount=1.
2325
2326    o One-dimensional argument B is implicitly treated as a column matrix
2327      dimensioned [count, 1], so ycount=1.
2328 */
2329
2330   if (retarray->base_addr == NULL)
2331     {
2332       if (GFC_DESCRIPTOR_RANK (a) == 1)
2333         {
2334           GFC_DIMENSION_SET(retarray->dim[0], 0,
2335                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2336         }
2337       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2338         {
2339           GFC_DIMENSION_SET(retarray->dim[0], 0,
2340                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2341         }
2342       else
2343         {
2344           GFC_DIMENSION_SET(retarray->dim[0], 0,
2345                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2346
2347           GFC_DIMENSION_SET(retarray->dim[1], 0,
2348                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2349                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2350         }
2351
2352       retarray->base_addr
2353         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
2354       retarray->offset = 0;
2355     }
2356   else if (unlikely (compile_options.bounds_check))
2357     {
2358       index_type ret_extent, arg_extent;
2359
2360       if (GFC_DESCRIPTOR_RANK (a) == 1)
2361         {
2362           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2363           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2364           if (arg_extent != ret_extent)
2365             runtime_error ("Incorrect extent in return array in"
2366                            " MATMUL intrinsic: is %ld, should be %ld",
2367                            (long int) ret_extent, (long int) arg_extent);
2368         }
2369       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2370         {
2371           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2372           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2373           if (arg_extent != ret_extent)
2374             runtime_error ("Incorrect extent in return array in"
2375                            " MATMUL intrinsic: is %ld, should be %ld",
2376                            (long int) ret_extent, (long int) arg_extent);
2377         }
2378       else
2379         {
2380           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2381           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2382           if (arg_extent != ret_extent)
2383             runtime_error ("Incorrect extent in return array in"
2384                            " MATMUL intrinsic for dimension 1:"
2385                            " is %ld, should be %ld",
2386                            (long int) ret_extent, (long int) arg_extent);
2387
2388           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2389           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2390           if (arg_extent != ret_extent)
2391             runtime_error ("Incorrect extent in return array in"
2392                            " MATMUL intrinsic for dimension 2:"
2393                            " is %ld, should be %ld",
2394                            (long int) ret_extent, (long int) arg_extent);
2395         }
2396     }
2397
2398
2399   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2400     {
2401       /* One-dimensional result may be addressed in the code below
2402          either as a row or a column matrix. We want both cases to
2403          work. */
2404       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2405     }
2406   else
2407     {
2408       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2409       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2410     }
2411
2412
2413   if (GFC_DESCRIPTOR_RANK (a) == 1)
2414     {
2415       /* Treat it as a a row matrix A[1,count]. */
2416       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2417       aystride = 1;
2418
2419       xcount = 1;
2420       count = GFC_DESCRIPTOR_EXTENT(a,0);
2421     }
2422   else
2423     {
2424       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2425       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2426
2427       count = GFC_DESCRIPTOR_EXTENT(a,1);
2428       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2429     }
2430
2431   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2432     {
2433       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2434         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2435     }
2436
2437   if (GFC_DESCRIPTOR_RANK (b) == 1)
2438     {
2439       /* Treat it as a column matrix B[count,1] */
2440       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2441
2442       /* bystride should never be used for 1-dimensional b.
2443          in case it is we want it to cause a segfault, rather than
2444          an incorrect result. */
2445       bystride = 0xDEADBEEF;
2446       ycount = 1;
2447     }
2448   else
2449     {
2450       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2451       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2452       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2453     }
2454
2455   abase = a->base_addr;
2456   bbase = b->base_addr;
2457   dest = retarray->base_addr;
2458
2459   /* Now that everything is set up, we perform the multiplication
2460      itself.  */
2461
2462 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2463 #define min(a,b) ((a) <= (b) ? (a) : (b))
2464 #define max(a,b) ((a) >= (b) ? (a) : (b))
2465
2466   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2467       && (bxstride == 1 || bystride == 1)
2468       && (((float) xcount) * ((float) ycount) * ((float) count)
2469           > POW3(blas_limit)))
2470     {
2471       const int m = xcount, n = ycount, k = count, ldc = rystride;
2472       const GFC_COMPLEX_8 one = 1, zero = 0;
2473       const int lda = (axstride == 1) ? aystride : axstride,
2474                 ldb = (bxstride == 1) ? bystride : bxstride;
2475
2476       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2477         {
2478           assert (gemm != NULL);
2479           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2480                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2481                 &ldc, 1, 1);
2482           return;
2483         }
2484     }
2485
2486   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2487     {
2488       /* This block of code implements a tuned matmul, derived from
2489          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2490
2491                Bo Kagstrom and Per Ling
2492                Department of Computing Science
2493                Umea University
2494                S-901 87 Umea, Sweden
2495
2496          from netlib.org, translated to C, and modified for matmul.m4.  */
2497
2498       const GFC_COMPLEX_8 *a, *b;
2499       GFC_COMPLEX_8 *c;
2500       const index_type m = xcount, n = ycount, k = count;
2501
2502       /* System generated locals */
2503       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2504                  i1, i2, i3, i4, i5, i6;
2505
2506       /* Local variables */
2507       GFC_COMPLEX_8 t1[65536], /* was [256][256] */
2508                  f11, f12, f21, f22, f31, f32, f41, f42,
2509                  f13, f14, f23, f24, f33, f34, f43, f44;
2510       index_type i, j, l, ii, jj, ll;
2511       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2512
2513       a = abase;
2514       b = bbase;
2515       c = retarray->base_addr;
2516
2517       /* Parameter adjustments */
2518       c_dim1 = rystride;
2519       c_offset = 1 + c_dim1;
2520       c -= c_offset;
2521       a_dim1 = aystride;
2522       a_offset = 1 + a_dim1;
2523       a -= a_offset;
2524       b_dim1 = bystride;
2525       b_offset = 1 + b_dim1;
2526       b -= b_offset;
2527
2528       /* Early exit if possible */
2529       if (m == 0 || n == 0 || k == 0)
2530         return;
2531
2532       /* Empty c first.  */
2533       for (j=1; j<=n; j++)
2534         for (i=1; i<=m; i++)
2535           c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
2536
2537       /* Start turning the crank. */
2538       i1 = n;
2539       for (jj = 1; jj <= i1; jj += 512)
2540         {
2541           /* Computing MIN */
2542           i2 = 512;
2543           i3 = n - jj + 1;
2544           jsec = min(i2,i3);
2545           ujsec = jsec - jsec % 4;
2546           i2 = k;
2547           for (ll = 1; ll <= i2; ll += 256)
2548             {
2549               /* Computing MIN */
2550               i3 = 256;
2551               i4 = k - ll + 1;
2552               lsec = min(i3,i4);
2553               ulsec = lsec - lsec % 2;
2554
2555               i3 = m;
2556               for (ii = 1; ii <= i3; ii += 256)
2557                 {
2558                   /* Computing MIN */
2559                   i4 = 256;
2560                   i5 = m - ii + 1;
2561                   isec = min(i4,i5);
2562                   uisec = isec - isec % 2;
2563                   i4 = ll + ulsec - 1;
2564                   for (l = ll; l <= i4; l += 2)
2565                     {
2566                       i5 = ii + uisec - 1;
2567                       for (i = ii; i <= i5; i += 2)
2568                         {
2569                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2570                                         a[i + l * a_dim1];
2571                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2572                                         a[i + (l + 1) * a_dim1];
2573                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2574                                         a[i + 1 + l * a_dim1];
2575                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2576                                         a[i + 1 + (l + 1) * a_dim1];
2577                         }
2578                       if (uisec < isec)
2579                         {
2580                           t1[l - ll + 1 + (isec << 8) - 257] =
2581                                     a[ii + isec - 1 + l * a_dim1];
2582                           t1[l - ll + 2 + (isec << 8) - 257] =
2583                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2584                         }
2585                     }
2586                   if (ulsec < lsec)
2587                     {
2588                       i4 = ii + isec - 1;
2589                       for (i = ii; i<= i4; ++i)
2590                         {
2591                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2592                                     a[i + (ll + lsec - 1) * a_dim1];
2593                         }
2594                     }
2595
2596                   uisec = isec - isec % 4;
2597                   i4 = jj + ujsec - 1;
2598                   for (j = jj; j <= i4; j += 4)
2599                     {
2600                       i5 = ii + uisec - 1;
2601                       for (i = ii; i <= i5; i += 4)
2602                         {
2603                           f11 = c[i + j * c_dim1];
2604                           f21 = c[i + 1 + j * c_dim1];
2605                           f12 = c[i + (j + 1) * c_dim1];
2606                           f22 = c[i + 1 + (j + 1) * c_dim1];
2607                           f13 = c[i + (j + 2) * c_dim1];
2608                           f23 = c[i + 1 + (j + 2) * c_dim1];
2609                           f14 = c[i + (j + 3) * c_dim1];
2610                           f24 = c[i + 1 + (j + 3) * c_dim1];
2611                           f31 = c[i + 2 + j * c_dim1];
2612                           f41 = c[i + 3 + j * c_dim1];
2613                           f32 = c[i + 2 + (j + 1) * c_dim1];
2614                           f42 = c[i + 3 + (j + 1) * c_dim1];
2615                           f33 = c[i + 2 + (j + 2) * c_dim1];
2616                           f43 = c[i + 3 + (j + 2) * c_dim1];
2617                           f34 = c[i + 2 + (j + 3) * c_dim1];
2618                           f44 = c[i + 3 + (j + 3) * c_dim1];
2619                           i6 = ll + lsec - 1;
2620                           for (l = ll; l <= i6; ++l)
2621                             {
2622                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2623                                       * b[l + j * b_dim1];
2624                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2625                                       * b[l + j * b_dim1];
2626                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627                                       * b[l + (j + 1) * b_dim1];
2628                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629                                       * b[l + (j + 1) * b_dim1];
2630                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631                                       * b[l + (j + 2) * b_dim1];
2632                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633                                       * b[l + (j + 2) * b_dim1];
2634                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635                                       * b[l + (j + 3) * b_dim1];
2636                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637                                       * b[l + (j + 3) * b_dim1];
2638                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2639                                       * b[l + j * b_dim1];
2640                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2641                                       * b[l + j * b_dim1];
2642                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643                                       * b[l + (j + 1) * b_dim1];
2644                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645                                       * b[l + (j + 1) * b_dim1];
2646                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647                                       * b[l + (j + 2) * b_dim1];
2648                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649                                       * b[l + (j + 2) * b_dim1];
2650                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651                                       * b[l + (j + 3) * b_dim1];
2652                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653                                       * b[l + (j + 3) * b_dim1];
2654                             }
2655                           c[i + j * c_dim1] = f11;
2656                           c[i + 1 + j * c_dim1] = f21;
2657                           c[i + (j + 1) * c_dim1] = f12;
2658                           c[i + 1 + (j + 1) * c_dim1] = f22;
2659                           c[i + (j + 2) * c_dim1] = f13;
2660                           c[i + 1 + (j + 2) * c_dim1] = f23;
2661                           c[i + (j + 3) * c_dim1] = f14;
2662                           c[i + 1 + (j + 3) * c_dim1] = f24;
2663                           c[i + 2 + j * c_dim1] = f31;
2664                           c[i + 3 + j * c_dim1] = f41;
2665                           c[i + 2 + (j + 1) * c_dim1] = f32;
2666                           c[i + 3 + (j + 1) * c_dim1] = f42;
2667                           c[i + 2 + (j + 2) * c_dim1] = f33;
2668                           c[i + 3 + (j + 2) * c_dim1] = f43;
2669                           c[i + 2 + (j + 3) * c_dim1] = f34;
2670                           c[i + 3 + (j + 3) * c_dim1] = f44;
2671                         }
2672                       if (uisec < isec)
2673                         {
2674                           i5 = ii + isec - 1;
2675                           for (i = ii + uisec; i <= i5; ++i)
2676                             {
2677                               f11 = c[i + j * c_dim1];
2678                               f12 = c[i + (j + 1) * c_dim1];
2679                               f13 = c[i + (j + 2) * c_dim1];
2680                               f14 = c[i + (j + 3) * c_dim1];
2681                               i6 = ll + lsec - 1;
2682                               for (l = ll; l <= i6; ++l)
2683                                 {
2684                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685                                           257] * b[l + j * b_dim1];
2686                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687                                           257] * b[l + (j + 1) * b_dim1];
2688                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689                                           257] * b[l + (j + 2) * b_dim1];
2690                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691                                           257] * b[l + (j + 3) * b_dim1];
2692                                 }
2693                               c[i + j * c_dim1] = f11;
2694                               c[i + (j + 1) * c_dim1] = f12;
2695                               c[i + (j + 2) * c_dim1] = f13;
2696                               c[i + (j + 3) * c_dim1] = f14;
2697                             }
2698                         }
2699                     }
2700                   if (ujsec < jsec)
2701                     {
2702                       i4 = jj + jsec - 1;
2703                       for (j = jj + ujsec; j <= i4; ++j)
2704                         {
2705                           i5 = ii + uisec - 1;
2706                           for (i = ii; i <= i5; i += 4)
2707                             {
2708                               f11 = c[i + j * c_dim1];
2709                               f21 = c[i + 1 + j * c_dim1];
2710                               f31 = c[i + 2 + j * c_dim1];
2711                               f41 = c[i + 3 + j * c_dim1];
2712                               i6 = ll + lsec - 1;
2713                               for (l = ll; l <= i6; ++l)
2714                                 {
2715                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2716                                           257] * b[l + j * b_dim1];
2717                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2718                                           257] * b[l + j * b_dim1];
2719                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2720                                           257] * b[l + j * b_dim1];
2721                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2722                                           257] * b[l + j * b_dim1];
2723                                 }
2724                               c[i + j * c_dim1] = f11;
2725                               c[i + 1 + j * c_dim1] = f21;
2726                               c[i + 2 + j * c_dim1] = f31;
2727                               c[i + 3 + j * c_dim1] = f41;
2728                             }
2729                           i5 = ii + isec - 1;
2730                           for (i = ii + uisec; i <= i5; ++i)
2731                             {
2732                               f11 = c[i + j * c_dim1];
2733                               i6 = ll + lsec - 1;
2734                               for (l = ll; l <= i6; ++l)
2735                                 {
2736                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2737                                           257] * b[l + j * b_dim1];
2738                                 }
2739                               c[i + j * c_dim1] = f11;
2740                             }
2741                         }
2742                     }
2743                 }
2744             }
2745         }
2746       return;
2747     }
2748   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2749     {
2750       if (GFC_DESCRIPTOR_RANK (a) != 1)
2751         {
2752           const GFC_COMPLEX_8 *restrict abase_x;
2753           const GFC_COMPLEX_8 *restrict bbase_y;
2754           GFC_COMPLEX_8 *restrict dest_y;
2755           GFC_COMPLEX_8 s;
2756
2757           for (y = 0; y < ycount; y++)
2758             {
2759               bbase_y = &bbase[y*bystride];
2760               dest_y = &dest[y*rystride];
2761               for (x = 0; x < xcount; x++)
2762                 {
2763                   abase_x = &abase[x*axstride];
2764                   s = (GFC_COMPLEX_8) 0;
2765                   for (n = 0; n < count; n++)
2766                     s += abase_x[n] * bbase_y[n];
2767                   dest_y[x] = s;
2768                 }
2769             }
2770         }
2771       else
2772         {
2773           const GFC_COMPLEX_8 *restrict bbase_y;
2774           GFC_COMPLEX_8 s;
2775
2776           for (y = 0; y < ycount; y++)
2777             {
2778               bbase_y = &bbase[y*bystride];
2779               s = (GFC_COMPLEX_8) 0;
2780               for (n = 0; n < count; n++)
2781                 s += abase[n*axstride] * bbase_y[n];
2782               dest[y*rystride] = s;
2783             }
2784         }
2785     }
2786   else if (axstride < aystride)
2787     {
2788       for (y = 0; y < ycount; y++)
2789         for (x = 0; x < xcount; x++)
2790           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2791
2792       for (y = 0; y < ycount; y++)
2793         for (n = 0; n < count; n++)
2794           for (x = 0; x < xcount; x++)
2795             /* dest[x,y] += a[x,n] * b[n,y] */
2796             dest[x*rxstride + y*rystride] +=
2797                                         abase[x*axstride + n*aystride] *
2798                                         bbase[n*bxstride + y*bystride];
2799     }
2800   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2801     {
2802       const GFC_COMPLEX_8 *restrict bbase_y;
2803       GFC_COMPLEX_8 s;
2804
2805       for (y = 0; y < ycount; y++)
2806         {
2807           bbase_y = &bbase[y*bystride];
2808           s = (GFC_COMPLEX_8) 0;
2809           for (n = 0; n < count; n++)
2810             s += abase[n*axstride] * bbase_y[n*bxstride];
2811           dest[y*rxstride] = s;
2812         }
2813     }
2814   else
2815     {
2816       const GFC_COMPLEX_8 *restrict abase_x;
2817       const GFC_COMPLEX_8 *restrict bbase_y;
2818       GFC_COMPLEX_8 *restrict dest_y;
2819       GFC_COMPLEX_8 s;
2820
2821       for (y = 0; y < ycount; y++)
2822         {
2823           bbase_y = &bbase[y*bystride];
2824           dest_y = &dest[y*rystride];
2825           for (x = 0; x < xcount; x++)
2826             {
2827               abase_x = &abase[x*axstride];
2828               s = (GFC_COMPLEX_8) 0;
2829               for (n = 0; n < count; n++)
2830                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2831               dest_y[x*rxstride] = s;
2832             }
2833         }
2834     }
2835 }
2836 #undef POW3
2837 #undef min
2838 #undef max
2839
2840 #endif
2841 #endif
2842