libgfortran/generated/matmul_i4.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_INTEGER_4)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
  39                           const int *, const GFC_INTEGER_4 *, const int *,
  40                           const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
  73         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_i4);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_i4_avx (gfc_array_i4 * const restrict retarray,
  84         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_i4_avx (gfc_array_i4 * const restrict retarray,
  88         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_INTEGER_4 * restrict abase;
  92   const GFC_INTEGER_4 * restrict bbase;
  93   GFC_INTEGER_4 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Incorrect extent in return array in"
 148                            " MATMUL intrinsic: is %ld, should be %ld",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Incorrect extent in return array in"
 157                            " MATMUL intrinsic: is %ld, should be %ld",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 1:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 2:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177         }
 178     }
 179
 180
 181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 182     {
 183       /* One-dimensional result may be addressed in the code below
 184          either as a row or a column matrix. We want both cases to
 185          work. */
 186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 187     }
 188   else
 189     {
 190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 192     }
 193
 194
 195   if (GFC_DESCRIPTOR_RANK (a) == 1)
 196     {
 197       /* Treat it as a a row matrix A[1,count]. */
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = 1;
 200
 201       xcount = 1;
 202       count = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204   else
 205     {
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 208
 209       count = GFC_DESCRIPTOR_EXTENT(a,1);
 210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212
 213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 214     {
 215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 216         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          in case it is we want it to cause a segfault, rather than
 226          an incorrect result. */
 227       bystride = 0xDEADBEEF;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_INTEGER_4 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 262                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 263                 &ldc, 1, 1);
 264           return;
 265         }
 266     }
 267
 268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 269     {
 270       /* This block of code implements a tuned matmul, derived from
 271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 272
 273                Bo Kagstrom and Per Ling
 274                Department of Computing Science
 275                Umea University
 276                S-901 87 Umea, Sweden
 277
 278          from netlib.org, translated to C, and modified for matmul.m4.  */
 279
 280       const GFC_INTEGER_4 *a, *b;
 281       GFC_INTEGER_4 *c;
 282       const index_type m = xcount, n = ycount, k = count;
 283
 284       /* System generated locals */
 285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 286                  i1, i2, i3, i4, i5, i6;
 287
 288       /* Local variables */
 289       GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 290                  f13, f14, f23, f24, f33, f34, f43, f44;
 291       index_type i, j, l, ii, jj, ll;
 292       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 293       GFC_INTEGER_4 *t1;
 294
 295       a = abase;
 296       b = bbase;
 297       c = retarray->base_addr;
 298
 299       /* Parameter adjustments */
 300       c_dim1 = rystride;
 301       c_offset = 1 + c_dim1;
 302       c -= c_offset;
 303       a_dim1 = aystride;
 304       a_offset = 1 + a_dim1;
 305       a -= a_offset;
 306       b_dim1 = bystride;
 307       b_offset = 1 + b_dim1;
 308       b -= b_offset;
 309
 310       /* Early exit if possible */
 311       if (m == 0 || n == 0 || k == 0)
 312         return;
 313
 314       /* Adjust size of t1 to what is needed.  */
 315       index_type t1_dim;
 316       t1_dim = (a_dim1-1) * 256 + b_dim1;
 317       if (t1_dim > 65536)
 318         t1_dim = 65536;
 319
 320       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 321
 322       /* Empty c first.  */
 323       for (j=1; j<=n; j++)
 324         for (i=1; i<=m; i++)
 325           c[i + j * c_dim1] = (GFC_INTEGER_4)0;
 326
 327       /* Start turning the crank. */
 328       i1 = n;
 329       for (jj = 1; jj <= i1; jj += 512)
 330         {
 331           /* Computing MIN */
 332           i2 = 512;
 333           i3 = n - jj + 1;
 334           jsec = min(i2,i3);
 335           ujsec = jsec - jsec % 4;
 336           i2 = k;
 337           for (ll = 1; ll <= i2; ll += 256)
 338             {
 339               /* Computing MIN */
 340               i3 = 256;
 341               i4 = k - ll + 1;
 342               lsec = min(i3,i4);
 343               ulsec = lsec - lsec % 2;
 344
 345               i3 = m;
 346               for (ii = 1; ii <= i3; ii += 256)
 347                 {
 348                   /* Computing MIN */
 349                   i4 = 256;
 350                   i5 = m - ii + 1;
 351                   isec = min(i4,i5);
 352                   uisec = isec - isec % 2;
 353                   i4 = ll + ulsec - 1;
 354                   for (l = ll; l <= i4; l += 2)
 355                     {
 356                       i5 = ii + uisec - 1;
 357                       for (i = ii; i <= i5; i += 2)
 358                         {
 359                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 360                                         a[i + l * a_dim1];
 361                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 362                                         a[i + (l + 1) * a_dim1];
 363                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 364                                         a[i + 1 + l * a_dim1];
 365                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 366                                         a[i + 1 + (l + 1) * a_dim1];
 367                         }
 368                       if (uisec < isec)
 369                         {
 370                           t1[l - ll + 1 + (isec << 8) - 257] =
 371                                     a[ii + isec - 1 + l * a_dim1];
 372                           t1[l - ll + 2 + (isec << 8) - 257] =
 373                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 374                         }
 375                     }
 376                   if (ulsec < lsec)
 377                     {
 378                       i4 = ii + isec - 1;
 379                       for (i = ii; i<= i4; ++i)
 380                         {
 381                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 382                                     a[i + (ll + lsec - 1) * a_dim1];
 383                         }
 384                     }
 385
 386                   uisec = isec - isec % 4;
 387                   i4 = jj + ujsec - 1;
 388                   for (j = jj; j <= i4; j += 4)
 389                     {
 390                       i5 = ii + uisec - 1;
 391                       for (i = ii; i <= i5; i += 4)
 392                         {
 393                           f11 = c[i + j * c_dim1];
 394                           f21 = c[i + 1 + j * c_dim1];
 395                           f12 = c[i + (j + 1) * c_dim1];
 396                           f22 = c[i + 1 + (j + 1) * c_dim1];
 397                           f13 = c[i + (j + 2) * c_dim1];
 398                           f23 = c[i + 1 + (j + 2) * c_dim1];
 399                           f14 = c[i + (j + 3) * c_dim1];
 400                           f24 = c[i + 1 + (j + 3) * c_dim1];
 401                           f31 = c[i + 2 + j * c_dim1];
 402                           f41 = c[i + 3 + j * c_dim1];
 403                           f32 = c[i + 2 + (j + 1) * c_dim1];
 404                           f42 = c[i + 3 + (j + 1) * c_dim1];
 405                           f33 = c[i + 2 + (j + 2) * c_dim1];
 406                           f43 = c[i + 3 + (j + 2) * c_dim1];
 407                           f34 = c[i + 2 + (j + 3) * c_dim1];
 408                           f44 = c[i + 3 + (j + 3) * c_dim1];
 409                           i6 = ll + lsec - 1;
 410                           for (l = ll; l <= i6; ++l)
 411                             {
 412                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 413                                       * b[l + j * b_dim1];
 414                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 415                                       * b[l + j * b_dim1];
 416                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 417                                       * b[l + (j + 1) * b_dim1];
 418                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 419                                       * b[l + (j + 1) * b_dim1];
 420                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 421                                       * b[l + (j + 2) * b_dim1];
 422                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 423                                       * b[l + (j + 2) * b_dim1];
 424                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 425                                       * b[l + (j + 3) * b_dim1];
 426                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 427                                       * b[l + (j + 3) * b_dim1];
 428                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 429                                       * b[l + j * b_dim1];
 430                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 431                                       * b[l + j * b_dim1];
 432                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 433                                       * b[l + (j + 1) * b_dim1];
 434                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 435                                       * b[l + (j + 1) * b_dim1];
 436                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 437                                       * b[l + (j + 2) * b_dim1];
 438                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 439                                       * b[l + (j + 2) * b_dim1];
 440                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 441                                       * b[l + (j + 3) * b_dim1];
 442                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 443                                       * b[l + (j + 3) * b_dim1];
 444                             }
 445                           c[i + j * c_dim1] = f11;
 446                           c[i + 1 + j * c_dim1] = f21;
 447                           c[i + (j + 1) * c_dim1] = f12;
 448                           c[i + 1 + (j + 1) * c_dim1] = f22;
 449                           c[i + (j + 2) * c_dim1] = f13;
 450                           c[i + 1 + (j + 2) * c_dim1] = f23;
 451                           c[i + (j + 3) * c_dim1] = f14;
 452                           c[i + 1 + (j + 3) * c_dim1] = f24;
 453                           c[i + 2 + j * c_dim1] = f31;
 454                           c[i + 3 + j * c_dim1] = f41;
 455                           c[i + 2 + (j + 1) * c_dim1] = f32;
 456                           c[i + 3 + (j + 1) * c_dim1] = f42;
 457                           c[i + 2 + (j + 2) * c_dim1] = f33;
 458                           c[i + 3 + (j + 2) * c_dim1] = f43;
 459                           c[i + 2 + (j + 3) * c_dim1] = f34;
 460                           c[i + 3 + (j + 3) * c_dim1] = f44;
 461                         }
 462                       if (uisec < isec)
 463                         {
 464                           i5 = ii + isec - 1;
 465                           for (i = ii + uisec; i <= i5; ++i)
 466                             {
 467                               f11 = c[i + j * c_dim1];
 468                               f12 = c[i + (j + 1) * c_dim1];
 469                               f13 = c[i + (j + 2) * c_dim1];
 470                               f14 = c[i + (j + 3) * c_dim1];
 471                               i6 = ll + lsec - 1;
 472                               for (l = ll; l <= i6; ++l)
 473                                 {
 474                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 475                                           257] * b[l + j * b_dim1];
 476                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 477                                           257] * b[l + (j + 1) * b_dim1];
 478                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 479                                           257] * b[l + (j + 2) * b_dim1];
 480                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 481                                           257] * b[l + (j + 3) * b_dim1];
 482                                 }
 483                               c[i + j * c_dim1] = f11;
 484                               c[i + (j + 1) * c_dim1] = f12;
 485                               c[i + (j + 2) * c_dim1] = f13;
 486                               c[i + (j + 3) * c_dim1] = f14;
 487                             }
 488                         }
 489                     }
 490                   if (ujsec < jsec)
 491                     {
 492                       i4 = jj + jsec - 1;
 493                       for (j = jj + ujsec; j <= i4; ++j)
 494                         {
 495                           i5 = ii + uisec - 1;
 496                           for (i = ii; i <= i5; i += 4)
 497                             {
 498                               f11 = c[i + j * c_dim1];
 499                               f21 = c[i + 1 + j * c_dim1];
 500                               f31 = c[i + 2 + j * c_dim1];
 501                               f41 = c[i + 3 + j * c_dim1];
 502                               i6 = ll + lsec - 1;
 503                               for (l = ll; l <= i6; ++l)
 504                                 {
 505                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 506                                           257] * b[l + j * b_dim1];
 507                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 508                                           257] * b[l + j * b_dim1];
 509                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 510                                           257] * b[l + j * b_dim1];
 511                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 512                                           257] * b[l + j * b_dim1];
 513                                 }
 514                               c[i + j * c_dim1] = f11;
 515                               c[i + 1 + j * c_dim1] = f21;
 516                               c[i + 2 + j * c_dim1] = f31;
 517                               c[i + 3 + j * c_dim1] = f41;
 518                             }
 519                           i5 = ii + isec - 1;
 520                           for (i = ii + uisec; i <= i5; ++i)
 521                             {
 522                               f11 = c[i + j * c_dim1];
 523                               i6 = ll + lsec - 1;
 524                               for (l = ll; l <= i6; ++l)
 525                                 {
 526                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 527                                           257] * b[l + j * b_dim1];
 528                                 }
 529                               c[i + j * c_dim1] = f11;
 530                             }
 531                         }
 532                     }
 533                 }
 534             }
 535         }
 536       free(t1);
 537       return;
 538     }
 539   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 540     {
 541       if (GFC_DESCRIPTOR_RANK (a) != 1)
 542         {
 543           const GFC_INTEGER_4 *restrict abase_x;
 544           const GFC_INTEGER_4 *restrict bbase_y;
 545           GFC_INTEGER_4 *restrict dest_y;
 546           GFC_INTEGER_4 s;
 547
 548           for (y = 0; y < ycount; y++)
 549             {
 550               bbase_y = &bbase[y*bystride];
 551               dest_y = &dest[y*rystride];
 552               for (x = 0; x < xcount; x++)
 553                 {
 554                   abase_x = &abase[x*axstride];
 555                   s = (GFC_INTEGER_4) 0;
 556                   for (n = 0; n < count; n++)
 557                     s += abase_x[n] * bbase_y[n];
 558                   dest_y[x] = s;
 559                 }
 560             }
 561         }
 562       else
 563         {
 564           const GFC_INTEGER_4 *restrict bbase_y;
 565           GFC_INTEGER_4 s;
 566
 567           for (y = 0; y < ycount; y++)
 568             {
 569               bbase_y = &bbase[y*bystride];
 570               s = (GFC_INTEGER_4) 0;
 571               for (n = 0; n < count; n++)
 572                 s += abase[n*axstride] * bbase_y[n];
 573               dest[y*rystride] = s;
 574             }
 575         }
 576     }
 577   else if (axstride < aystride)
 578     {
 579       for (y = 0; y < ycount; y++)
 580         for (x = 0; x < xcount; x++)
 581           dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
 582
 583       for (y = 0; y < ycount; y++)
 584         for (n = 0; n < count; n++)
 585           for (x = 0; x < xcount; x++)
 586             /* dest[x,y] += a[x,n] * b[n,y] */
 587             dest[x*rxstride + y*rystride] +=
 588                                         abase[x*axstride + n*aystride] *
 589                                         bbase[n*bxstride + y*bystride];
 590     }
 591   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 592     {
 593       const GFC_INTEGER_4 *restrict bbase_y;
 594       GFC_INTEGER_4 s;
 595
 596       for (y = 0; y < ycount; y++)
 597         {
 598           bbase_y = &bbase[y*bystride];
 599           s = (GFC_INTEGER_4) 0;
 600           for (n = 0; n < count; n++)
 601             s += abase[n*axstride] * bbase_y[n*bxstride];
 602           dest[y*rxstride] = s;
 603         }
 604     }
 605   else
 606     {
 607       const GFC_INTEGER_4 *restrict abase_x;
 608       const GFC_INTEGER_4 *restrict bbase_y;
 609       GFC_INTEGER_4 *restrict dest_y;
 610       GFC_INTEGER_4 s;
 611
 612       for (y = 0; y < ycount; y++)
 613         {
 614           bbase_y = &bbase[y*bystride];
 615           dest_y = &dest[y*rystride];
 616           for (x = 0; x < xcount; x++)
 617             {
 618               abase_x = &abase[x*axstride];
 619               s = (GFC_INTEGER_4) 0;
 620               for (n = 0; n < count; n++)
 621                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 622               dest_y[x*rxstride] = s;
 623             }
 624         }
 625     }
 626 }
 627 #undef POW3
 628 #undef min
 629 #undef max
 630
 631 #endif /* HAVE_AVX */
 632
 633 #ifdef HAVE_AVX2
 634 static void
 635 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
 636         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 637         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 638 static void
 639 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
 640         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 641         int blas_limit, blas_call gemm)
 642 {
 643   const GFC_INTEGER_4 * restrict abase;
 644   const GFC_INTEGER_4 * restrict bbase;
 645   GFC_INTEGER_4 * restrict dest;
 646
 647   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 648   index_type x, y, n, count, xcount, ycount;
 649
 650   assert (GFC_DESCRIPTOR_RANK (a) == 2
 651           || GFC_DESCRIPTOR_RANK (b) == 2);
 652
 653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 654
 655    Either A or B (but not both) can be rank 1:
 656
 657    o One-dimensional argument A is implicitly treated as a row matrix
 658      dimensioned [1,count], so xcount=1.
 659
 660    o One-dimensional argument B is implicitly treated as a column matrix
 661      dimensioned [count, 1], so ycount=1.
 662 */
 663
 664   if (retarray->base_addr == NULL)
 665     {
 666       if (GFC_DESCRIPTOR_RANK (a) == 1)
 667         {
 668           GFC_DIMENSION_SET(retarray->dim[0], 0,
 669                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 670         }
 671       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 672         {
 673           GFC_DIMENSION_SET(retarray->dim[0], 0,
 674                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 675         }
 676       else
 677         {
 678           GFC_DIMENSION_SET(retarray->dim[0], 0,
 679                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 680
 681           GFC_DIMENSION_SET(retarray->dim[1], 0,
 682                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 683                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 684         }
 685
 686       retarray->base_addr
 687         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
 688       retarray->offset = 0;
 689     }
 690   else if (unlikely (compile_options.bounds_check))
 691     {
 692       index_type ret_extent, arg_extent;
 693
 694       if (GFC_DESCRIPTOR_RANK (a) == 1)
 695         {
 696           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 697           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 698           if (arg_extent != ret_extent)
 699             runtime_error ("Incorrect extent in return array in"
 700                            " MATMUL intrinsic: is %ld, should be %ld",
 701                            (long int) ret_extent, (long int) arg_extent);
 702         }
 703       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 704         {
 705           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 706           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 707           if (arg_extent != ret_extent)
 708             runtime_error ("Incorrect extent in return array in"
 709                            " MATMUL intrinsic: is %ld, should be %ld",
 710                            (long int) ret_extent, (long int) arg_extent);
 711         }
 712       else
 713         {
 714           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 715           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 716           if (arg_extent != ret_extent)
 717             runtime_error ("Incorrect extent in return array in"
 718                            " MATMUL intrinsic for dimension 1:"
 719                            " is %ld, should be %ld",
 720                            (long int) ret_extent, (long int) arg_extent);
 721
 722           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 723           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 724           if (arg_extent != ret_extent)
 725             runtime_error ("Incorrect extent in return array in"
 726                            " MATMUL intrinsic for dimension 2:"
 727                            " is %ld, should be %ld",
 728                            (long int) ret_extent, (long int) arg_extent);
 729         }
 730     }
 731
 732
 733   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 734     {
 735       /* One-dimensional result may be addressed in the code below
 736          either as a row or a column matrix. We want both cases to
 737          work. */
 738       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 739     }
 740   else
 741     {
 742       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 743       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 744     }
 745
 746
 747   if (GFC_DESCRIPTOR_RANK (a) == 1)
 748     {
 749       /* Treat it as a a row matrix A[1,count]. */
 750       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 751       aystride = 1;
 752
 753       xcount = 1;
 754       count = GFC_DESCRIPTOR_EXTENT(a,0);
 755     }
 756   else
 757     {
 758       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 759       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 760
 761       count = GFC_DESCRIPTOR_EXTENT(a,1);
 762       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 763     }
 764
 765   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 766     {
 767       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 768         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 769     }
 770
 771   if (GFC_DESCRIPTOR_RANK (b) == 1)
 772     {
 773       /* Treat it as a column matrix B[count,1] */
 774       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 775
 776       /* bystride should never be used for 1-dimensional b.
 777          in case it is we want it to cause a segfault, rather than
 778          an incorrect result. */
 779       bystride = 0xDEADBEEF;
 780       ycount = 1;
 781     }
 782   else
 783     {
 784       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 785       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 786       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 787     }
 788
 789   abase = a->base_addr;
 790   bbase = b->base_addr;
 791   dest = retarray->base_addr;
 792
 793   /* Now that everything is set up, we perform the multiplication
 794      itself.  */
 795
 796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 797 #define min(a,b) ((a) <= (b) ? (a) : (b))
 798 #define max(a,b) ((a) >= (b) ? (a) : (b))
 799
 800   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 801       && (bxstride == 1 || bystride == 1)
 802       && (((float) xcount) * ((float) ycount) * ((float) count)
 803           > POW3(blas_limit)))
 804     {
 805       const int m = xcount, n = ycount, k = count, ldc = rystride;
 806       const GFC_INTEGER_4 one = 1, zero = 0;
 807       const int lda = (axstride == 1) ? aystride : axstride,
 808                 ldb = (bxstride == 1) ? bystride : bxstride;
 809
 810       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 811         {
 812           assert (gemm != NULL);
 813           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 814                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 815                 &ldc, 1, 1);
 816           return;
 817         }
 818     }
 819
 820   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 821     {
 822       /* This block of code implements a tuned matmul, derived from
 823          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 824
 825                Bo Kagstrom and Per Ling
 826                Department of Computing Science
 827                Umea University
 828                S-901 87 Umea, Sweden
 829
 830          from netlib.org, translated to C, and modified for matmul.m4.  */
 831
 832       const GFC_INTEGER_4 *a, *b;
 833       GFC_INTEGER_4 *c;
 834       const index_type m = xcount, n = ycount, k = count;
 835
 836       /* System generated locals */
 837       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 838                  i1, i2, i3, i4, i5, i6;
 839
 840       /* Local variables */
 841       GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 842                  f13, f14, f23, f24, f33, f34, f43, f44;
 843       index_type i, j, l, ii, jj, ll;
 844       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 845       GFC_INTEGER_4 *t1;
 846
 847       a = abase;
 848       b = bbase;
 849       c = retarray->base_addr;
 850
 851       /* Parameter adjustments */
 852       c_dim1 = rystride;
 853       c_offset = 1 + c_dim1;
 854       c -= c_offset;
 855       a_dim1 = aystride;
 856       a_offset = 1 + a_dim1;
 857       a -= a_offset;
 858       b_dim1 = bystride;
 859       b_offset = 1 + b_dim1;
 860       b -= b_offset;
 861
 862       /* Early exit if possible */
 863       if (m == 0 || n == 0 || k == 0)
 864         return;
 865
 866       /* Adjust size of t1 to what is needed.  */
 867       index_type t1_dim;
 868       t1_dim = (a_dim1-1) * 256 + b_dim1;
 869       if (t1_dim > 65536)
 870         t1_dim = 65536;
 871
 872       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
 873
 874       /* Empty c first.  */
 875       for (j=1; j<=n; j++)
 876         for (i=1; i<=m; i++)
 877           c[i + j * c_dim1] = (GFC_INTEGER_4)0;
 878
 879       /* Start turning the crank. */
 880       i1 = n;
 881       for (jj = 1; jj <= i1; jj += 512)
 882         {
 883           /* Computing MIN */
 884           i2 = 512;
 885           i3 = n - jj + 1;
 886           jsec = min(i2,i3);
 887           ujsec = jsec - jsec % 4;
 888           i2 = k;
 889           for (ll = 1; ll <= i2; ll += 256)
 890             {
 891               /* Computing MIN */
 892               i3 = 256;
 893               i4 = k - ll + 1;
 894               lsec = min(i3,i4);
 895               ulsec = lsec - lsec % 2;
 896
 897               i3 = m;
 898               for (ii = 1; ii <= i3; ii += 256)
 899                 {
 900                   /* Computing MIN */
 901                   i4 = 256;
 902                   i5 = m - ii + 1;
 903                   isec = min(i4,i5);
 904                   uisec = isec - isec % 2;
 905                   i4 = ll + ulsec - 1;
 906                   for (l = ll; l <= i4; l += 2)
 907                     {
 908                       i5 = ii + uisec - 1;
 909                       for (i = ii; i <= i5; i += 2)
 910                         {
 911                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 912                                         a[i + l * a_dim1];
 913                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 914                                         a[i + (l + 1) * a_dim1];
 915                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 916                                         a[i + 1 + l * a_dim1];
 917                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 918                                         a[i + 1 + (l + 1) * a_dim1];
 919                         }
 920                       if (uisec < isec)
 921                         {
 922                           t1[l - ll + 1 + (isec << 8) - 257] =
 923                                     a[ii + isec - 1 + l * a_dim1];
 924                           t1[l - ll + 2 + (isec << 8) - 257] =
 925                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 926                         }
 927                     }
 928                   if (ulsec < lsec)
 929                     {
 930                       i4 = ii + isec - 1;
 931                       for (i = ii; i<= i4; ++i)
 932                         {
 933                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 934                                     a[i + (ll + lsec - 1) * a_dim1];
 935                         }
 936                     }
 937
 938                   uisec = isec - isec % 4;
 939                   i4 = jj + ujsec - 1;
 940                   for (j = jj; j <= i4; j += 4)
 941                     {
 942                       i5 = ii + uisec - 1;
 943                       for (i = ii; i <= i5; i += 4)
 944                         {
 945                           f11 = c[i + j * c_dim1];
 946                           f21 = c[i + 1 + j * c_dim1];
 947                           f12 = c[i + (j + 1) * c_dim1];
 948                           f22 = c[i + 1 + (j + 1) * c_dim1];
 949                           f13 = c[i + (j + 2) * c_dim1];
 950                           f23 = c[i + 1 + (j + 2) * c_dim1];
 951                           f14 = c[i + (j + 3) * c_dim1];
 952                           f24 = c[i + 1 + (j + 3) * c_dim1];
 953                           f31 = c[i + 2 + j * c_dim1];
 954                           f41 = c[i + 3 + j * c_dim1];
 955                           f32 = c[i + 2 + (j + 1) * c_dim1];
 956                           f42 = c[i + 3 + (j + 1) * c_dim1];
 957                           f33 = c[i + 2 + (j + 2) * c_dim1];
 958                           f43 = c[i + 3 + (j + 2) * c_dim1];
 959                           f34 = c[i + 2 + (j + 3) * c_dim1];
 960                           f44 = c[i + 3 + (j + 3) * c_dim1];
 961                           i6 = ll + lsec - 1;
 962                           for (l = ll; l <= i6; ++l)
 963                             {
 964                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 965                                       * b[l + j * b_dim1];
 966                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 967                                       * b[l + j * b_dim1];
 968                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 969                                       * b[l + (j + 1) * b_dim1];
 970                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 971                                       * b[l + (j + 1) * b_dim1];
 972                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 973                                       * b[l + (j + 2) * b_dim1];
 974                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 975                                       * b[l + (j + 2) * b_dim1];
 976                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 977                                       * b[l + (j + 3) * b_dim1];
 978                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 979                                       * b[l + (j + 3) * b_dim1];
 980                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 981                                       * b[l + j * b_dim1];
 982                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 983                                       * b[l + j * b_dim1];
 984                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 985                                       * b[l + (j + 1) * b_dim1];
 986                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 987                                       * b[l + (j + 1) * b_dim1];
 988                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 989                                       * b[l + (j + 2) * b_dim1];
 990                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 991                                       * b[l + (j + 2) * b_dim1];
 992                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 993                                       * b[l + (j + 3) * b_dim1];
 994                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 995                                       * b[l + (j + 3) * b_dim1];
 996                             }
 997                           c[i + j * c_dim1] = f11;
 998                           c[i + 1 + j * c_dim1] = f21;
 999                           c[i + (j + 1) * c_dim1] = f12;
1000                           c[i + 1 + (j + 1) * c_dim1] = f22;
1001                           c[i + (j + 2) * c_dim1] = f13;
1002                           c[i + 1 + (j + 2) * c_dim1] = f23;
1003                           c[i + (j + 3) * c_dim1] = f14;
1004                           c[i + 1 + (j + 3) * c_dim1] = f24;
1005                           c[i + 2 + j * c_dim1] = f31;
1006                           c[i + 3 + j * c_dim1] = f41;
1007                           c[i + 2 + (j + 1) * c_dim1] = f32;
1008                           c[i + 3 + (j + 1) * c_dim1] = f42;
1009                           c[i + 2 + (j + 2) * c_dim1] = f33;
1010                           c[i + 3 + (j + 2) * c_dim1] = f43;
1011                           c[i + 2 + (j + 3) * c_dim1] = f34;
1012                           c[i + 3 + (j + 3) * c_dim1] = f44;
1013                         }
1014                       if (uisec < isec)
1015                         {
1016                           i5 = ii + isec - 1;
1017                           for (i = ii + uisec; i <= i5; ++i)
1018                             {
1019                               f11 = c[i + j * c_dim1];
1020                               f12 = c[i + (j + 1) * c_dim1];
1021                               f13 = c[i + (j + 2) * c_dim1];
1022                               f14 = c[i + (j + 3) * c_dim1];
1023                               i6 = ll + lsec - 1;
1024                               for (l = ll; l <= i6; ++l)
1025                                 {
1026                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027                                           257] * b[l + j * b_dim1];
1028                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029                                           257] * b[l + (j + 1) * b_dim1];
1030                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031                                           257] * b[l + (j + 2) * b_dim1];
1032                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033                                           257] * b[l + (j + 3) * b_dim1];
1034                                 }
1035                               c[i + j * c_dim1] = f11;
1036                               c[i + (j + 1) * c_dim1] = f12;
1037                               c[i + (j + 2) * c_dim1] = f13;
1038                               c[i + (j + 3) * c_dim1] = f14;
1039                             }
1040                         }
1041                     }
1042                   if (ujsec < jsec)
1043                     {
1044                       i4 = jj + jsec - 1;
1045                       for (j = jj + ujsec; j <= i4; ++j)
1046                         {
1047                           i5 = ii + uisec - 1;
1048                           for (i = ii; i <= i5; i += 4)
1049                             {
1050                               f11 = c[i + j * c_dim1];
1051                               f21 = c[i + 1 + j * c_dim1];
1052                               f31 = c[i + 2 + j * c_dim1];
1053                               f41 = c[i + 3 + j * c_dim1];
1054                               i6 = ll + lsec - 1;
1055                               for (l = ll; l <= i6; ++l)
1056                                 {
1057                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058                                           257] * b[l + j * b_dim1];
1059                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060                                           257] * b[l + j * b_dim1];
1061                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062                                           257] * b[l + j * b_dim1];
1063                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064                                           257] * b[l + j * b_dim1];
1065                                 }
1066                               c[i + j * c_dim1] = f11;
1067                               c[i + 1 + j * c_dim1] = f21;
1068                               c[i + 2 + j * c_dim1] = f31;
1069                               c[i + 3 + j * c_dim1] = f41;
1070                             }
1071                           i5 = ii + isec - 1;
1072                           for (i = ii + uisec; i <= i5; ++i)
1073                             {
1074                               f11 = c[i + j * c_dim1];
1075                               i6 = ll + lsec - 1;
1076                               for (l = ll; l <= i6; ++l)
1077                                 {
1078                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079                                           257] * b[l + j * b_dim1];
1080                                 }
1081                               c[i + j * c_dim1] = f11;
1082                             }
1083                         }
1084                     }
1085                 }
1086             }
1087         }
1088       free(t1);
1089       return;
1090     }
1091   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092     {
1093       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094         {
1095           const GFC_INTEGER_4 *restrict abase_x;
1096           const GFC_INTEGER_4 *restrict bbase_y;
1097           GFC_INTEGER_4 *restrict dest_y;
1098           GFC_INTEGER_4 s;
1099
1100           for (y = 0; y < ycount; y++)
1101             {
1102               bbase_y = &bbase[y*bystride];
1103               dest_y = &dest[y*rystride];
1104               for (x = 0; x < xcount; x++)
1105                 {
1106                   abase_x = &abase[x*axstride];
1107                   s = (GFC_INTEGER_4) 0;
1108                   for (n = 0; n < count; n++)
1109                     s += abase_x[n] * bbase_y[n];
1110                   dest_y[x] = s;
1111                 }
1112             }
1113         }
1114       else
1115         {
1116           const GFC_INTEGER_4 *restrict bbase_y;
1117           GFC_INTEGER_4 s;
1118
1119           for (y = 0; y < ycount; y++)
1120             {
1121               bbase_y = &bbase[y*bystride];
1122               s = (GFC_INTEGER_4) 0;
1123               for (n = 0; n < count; n++)
1124                 s += abase[n*axstride] * bbase_y[n];
1125               dest[y*rystride] = s;
1126             }
1127         }
1128     }
1129   else if (axstride < aystride)
1130     {
1131       for (y = 0; y < ycount; y++)
1132         for (x = 0; x < xcount; x++)
1133           dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
1134
1135       for (y = 0; y < ycount; y++)
1136         for (n = 0; n < count; n++)
1137           for (x = 0; x < xcount; x++)
1138             /* dest[x,y] += a[x,n] * b[n,y] */
1139             dest[x*rxstride + y*rystride] +=
1140                                         abase[x*axstride + n*aystride] *
1141                                         bbase[n*bxstride + y*bystride];
1142     }
1143   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1144     {
1145       const GFC_INTEGER_4 *restrict bbase_y;
1146       GFC_INTEGER_4 s;
1147
1148       for (y = 0; y < ycount; y++)
1149         {
1150           bbase_y = &bbase[y*bystride];
1151           s = (GFC_INTEGER_4) 0;
1152           for (n = 0; n < count; n++)
1153             s += abase[n*axstride] * bbase_y[n*bxstride];
1154           dest[y*rxstride] = s;
1155         }
1156     }
1157   else
1158     {
1159       const GFC_INTEGER_4 *restrict abase_x;
1160       const GFC_INTEGER_4 *restrict bbase_y;
1161       GFC_INTEGER_4 *restrict dest_y;
1162       GFC_INTEGER_4 s;
1163
1164       for (y = 0; y < ycount; y++)
1165         {
1166           bbase_y = &bbase[y*bystride];
1167           dest_y = &dest[y*rystride];
1168           for (x = 0; x < xcount; x++)
1169             {
1170               abase_x = &abase[x*axstride];
1171               s = (GFC_INTEGER_4) 0;
1172               for (n = 0; n < count; n++)
1173                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174               dest_y[x*rxstride] = s;
1175             }
1176         }
1177     }
1178 }
1179 #undef POW3
1180 #undef min
1181 #undef max
1182
1183 #endif /* HAVE_AVX2 */
1184
1185 #ifdef HAVE_AVX512F
1186 static void
1187 matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
1188         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1189         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1190 static void
1191 matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
1192         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1193         int blas_limit, blas_call gemm)
1194 {
1195   const GFC_INTEGER_4 * restrict abase;
1196   const GFC_INTEGER_4 * restrict bbase;
1197   GFC_INTEGER_4 * restrict dest;
1198
1199   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1200   index_type x, y, n, count, xcount, ycount;
1201
1202   assert (GFC_DESCRIPTOR_RANK (a) == 2
1203           || GFC_DESCRIPTOR_RANK (b) == 2);
1204
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1206
1207    Either A or B (but not both) can be rank 1:
1208
1209    o One-dimensional argument A is implicitly treated as a row matrix
1210      dimensioned [1,count], so xcount=1.
1211
1212    o One-dimensional argument B is implicitly treated as a column matrix
1213      dimensioned [count, 1], so ycount=1.
1214 */
1215
1216   if (retarray->base_addr == NULL)
1217     {
1218       if (GFC_DESCRIPTOR_RANK (a) == 1)
1219         {
1220           GFC_DIMENSION_SET(retarray->dim[0], 0,
1221                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1222         }
1223       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1224         {
1225           GFC_DIMENSION_SET(retarray->dim[0], 0,
1226                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1227         }
1228       else
1229         {
1230           GFC_DIMENSION_SET(retarray->dim[0], 0,
1231                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1232
1233           GFC_DIMENSION_SET(retarray->dim[1], 0,
1234                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1235                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1236         }
1237
1238       retarray->base_addr
1239         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
1240       retarray->offset = 0;
1241     }
1242   else if (unlikely (compile_options.bounds_check))
1243     {
1244       index_type ret_extent, arg_extent;
1245
1246       if (GFC_DESCRIPTOR_RANK (a) == 1)
1247         {
1248           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1249           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250           if (arg_extent != ret_extent)
1251             runtime_error ("Incorrect extent in return array in"
1252                            " MATMUL intrinsic: is %ld, should be %ld",
1253                            (long int) ret_extent, (long int) arg_extent);
1254         }
1255       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256         {
1257           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1258           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1259           if (arg_extent != ret_extent)
1260             runtime_error ("Incorrect extent in return array in"
1261                            " MATMUL intrinsic: is %ld, should be %ld",
1262                            (long int) ret_extent, (long int) arg_extent);
1263         }
1264       else
1265         {
1266           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1267           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1268           if (arg_extent != ret_extent)
1269             runtime_error ("Incorrect extent in return array in"
1270                            " MATMUL intrinsic for dimension 1:"
1271                            " is %ld, should be %ld",
1272                            (long int) ret_extent, (long int) arg_extent);
1273
1274           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1275           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1276           if (arg_extent != ret_extent)
1277             runtime_error ("Incorrect extent in return array in"
1278                            " MATMUL intrinsic for dimension 2:"
1279                            " is %ld, should be %ld",
1280                            (long int) ret_extent, (long int) arg_extent);
1281         }
1282     }
1283
1284
1285   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1286     {
1287       /* One-dimensional result may be addressed in the code below
1288          either as a row or a column matrix. We want both cases to
1289          work. */
1290       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1291     }
1292   else
1293     {
1294       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1295       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1296     }
1297
1298
1299   if (GFC_DESCRIPTOR_RANK (a) == 1)
1300     {
1301       /* Treat it as a a row matrix A[1,count]. */
1302       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1303       aystride = 1;
1304
1305       xcount = 1;
1306       count = GFC_DESCRIPTOR_EXTENT(a,0);
1307     }
1308   else
1309     {
1310       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1311       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1312
1313       count = GFC_DESCRIPTOR_EXTENT(a,1);
1314       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1315     }
1316
1317   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1318     {
1319       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1320         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1321     }
1322
1323   if (GFC_DESCRIPTOR_RANK (b) == 1)
1324     {
1325       /* Treat it as a column matrix B[count,1] */
1326       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1327
1328       /* bystride should never be used for 1-dimensional b.
1329          in case it is we want it to cause a segfault, rather than
1330          an incorrect result. */
1331       bystride = 0xDEADBEEF;
1332       ycount = 1;
1333     }
1334   else
1335     {
1336       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1338       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1339     }
1340
1341   abase = a->base_addr;
1342   bbase = b->base_addr;
1343   dest = retarray->base_addr;
1344
1345   /* Now that everything is set up, we perform the multiplication
1346      itself.  */
1347
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1351
1352   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1353       && (bxstride == 1 || bystride == 1)
1354       && (((float) xcount) * ((float) ycount) * ((float) count)
1355           > POW3(blas_limit)))
1356     {
1357       const int m = xcount, n = ycount, k = count, ldc = rystride;
1358       const GFC_INTEGER_4 one = 1, zero = 0;
1359       const int lda = (axstride == 1) ? aystride : axstride,
1360                 ldb = (bxstride == 1) ? bystride : bxstride;
1361
1362       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1363         {
1364           assert (gemm != NULL);
1365           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1366                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1367                 &ldc, 1, 1);
1368           return;
1369         }
1370     }
1371
1372   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1373     {
1374       /* This block of code implements a tuned matmul, derived from
1375          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1376
1377                Bo Kagstrom and Per Ling
1378                Department of Computing Science
1379                Umea University
1380                S-901 87 Umea, Sweden
1381
1382          from netlib.org, translated to C, and modified for matmul.m4.  */
1383
1384       const GFC_INTEGER_4 *a, *b;
1385       GFC_INTEGER_4 *c;
1386       const index_type m = xcount, n = ycount, k = count;
1387
1388       /* System generated locals */
1389       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1390                  i1, i2, i3, i4, i5, i6;
1391
1392       /* Local variables */
1393       GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
1394                  f13, f14, f23, f24, f33, f34, f43, f44;
1395       index_type i, j, l, ii, jj, ll;
1396       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1397       GFC_INTEGER_4 *t1;
1398
1399       a = abase;
1400       b = bbase;
1401       c = retarray->base_addr;
1402
1403       /* Parameter adjustments */
1404       c_dim1 = rystride;
1405       c_offset = 1 + c_dim1;
1406       c -= c_offset;
1407       a_dim1 = aystride;
1408       a_offset = 1 + a_dim1;
1409       a -= a_offset;
1410       b_dim1 = bystride;
1411       b_offset = 1 + b_dim1;
1412       b -= b_offset;
1413
1414       /* Early exit if possible */
1415       if (m == 0 || n == 0 || k == 0)
1416         return;
1417
1418       /* Adjust size of t1 to what is needed.  */
1419       index_type t1_dim;
1420       t1_dim = (a_dim1-1) * 256 + b_dim1;
1421       if (t1_dim > 65536)
1422         t1_dim = 65536;
1423
1424       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
1425
1426       /* Empty c first.  */
1427       for (j=1; j<=n; j++)
1428         for (i=1; i<=m; i++)
1429           c[i + j * c_dim1] = (GFC_INTEGER_4)0;
1430
1431       /* Start turning the crank. */
1432       i1 = n;
1433       for (jj = 1; jj <= i1; jj += 512)
1434         {
1435           /* Computing MIN */
1436           i2 = 512;
1437           i3 = n - jj + 1;
1438           jsec = min(i2,i3);
1439           ujsec = jsec - jsec % 4;
1440           i2 = k;
1441           for (ll = 1; ll <= i2; ll += 256)
1442             {
1443               /* Computing MIN */
1444               i3 = 256;
1445               i4 = k - ll + 1;
1446               lsec = min(i3,i4);
1447               ulsec = lsec - lsec % 2;
1448
1449               i3 = m;
1450               for (ii = 1; ii <= i3; ii += 256)
1451                 {
1452                   /* Computing MIN */
1453                   i4 = 256;
1454                   i5 = m - ii + 1;
1455                   isec = min(i4,i5);
1456                   uisec = isec - isec % 2;
1457                   i4 = ll + ulsec - 1;
1458                   for (l = ll; l <= i4; l += 2)
1459                     {
1460                       i5 = ii + uisec - 1;
1461                       for (i = ii; i <= i5; i += 2)
1462                         {
1463                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1464                                         a[i + l * a_dim1];
1465                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1466                                         a[i + (l + 1) * a_dim1];
1467                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1468                                         a[i + 1 + l * a_dim1];
1469                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1470                                         a[i + 1 + (l + 1) * a_dim1];
1471                         }
1472                       if (uisec < isec)
1473                         {
1474                           t1[l - ll + 1 + (isec << 8) - 257] =
1475                                     a[ii + isec - 1 + l * a_dim1];
1476                           t1[l - ll + 2 + (isec << 8) - 257] =
1477                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1478                         }
1479                     }
1480                   if (ulsec < lsec)
1481                     {
1482                       i4 = ii + isec - 1;
1483                       for (i = ii; i<= i4; ++i)
1484                         {
1485                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1486                                     a[i + (ll + lsec - 1) * a_dim1];
1487                         }
1488                     }
1489
1490                   uisec = isec - isec % 4;
1491                   i4 = jj + ujsec - 1;
1492                   for (j = jj; j <= i4; j += 4)
1493                     {
1494                       i5 = ii + uisec - 1;
1495                       for (i = ii; i <= i5; i += 4)
1496                         {
1497                           f11 = c[i + j * c_dim1];
1498                           f21 = c[i + 1 + j * c_dim1];
1499                           f12 = c[i + (j + 1) * c_dim1];
1500                           f22 = c[i + 1 + (j + 1) * c_dim1];
1501                           f13 = c[i + (j + 2) * c_dim1];
1502                           f23 = c[i + 1 + (j + 2) * c_dim1];
1503                           f14 = c[i + (j + 3) * c_dim1];
1504                           f24 = c[i + 1 + (j + 3) * c_dim1];
1505                           f31 = c[i + 2 + j * c_dim1];
1506                           f41 = c[i + 3 + j * c_dim1];
1507                           f32 = c[i + 2 + (j + 1) * c_dim1];
1508                           f42 = c[i + 3 + (j + 1) * c_dim1];
1509                           f33 = c[i + 2 + (j + 2) * c_dim1];
1510                           f43 = c[i + 3 + (j + 2) * c_dim1];
1511                           f34 = c[i + 2 + (j + 3) * c_dim1];
1512                           f44 = c[i + 3 + (j + 3) * c_dim1];
1513                           i6 = ll + lsec - 1;
1514                           for (l = ll; l <= i6; ++l)
1515                             {
1516                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1517                                       * b[l + j * b_dim1];
1518                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1519                                       * b[l + j * b_dim1];
1520                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521                                       * b[l + (j + 1) * b_dim1];
1522                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523                                       * b[l + (j + 1) * b_dim1];
1524                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525                                       * b[l + (j + 2) * b_dim1];
1526                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527                                       * b[l + (j + 2) * b_dim1];
1528                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529                                       * b[l + (j + 3) * b_dim1];
1530                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531                                       * b[l + (j + 3) * b_dim1];
1532                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1533                                       * b[l + j * b_dim1];
1534                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1535                                       * b[l + j * b_dim1];
1536                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537                                       * b[l + (j + 1) * b_dim1];
1538                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539                                       * b[l + (j + 1) * b_dim1];
1540                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541                                       * b[l + (j + 2) * b_dim1];
1542                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543                                       * b[l + (j + 2) * b_dim1];
1544                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545                                       * b[l + (j + 3) * b_dim1];
1546                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547                                       * b[l + (j + 3) * b_dim1];
1548                             }
1549                           c[i + j * c_dim1] = f11;
1550                           c[i + 1 + j * c_dim1] = f21;
1551                           c[i + (j + 1) * c_dim1] = f12;
1552                           c[i + 1 + (j + 1) * c_dim1] = f22;
1553                           c[i + (j + 2) * c_dim1] = f13;
1554                           c[i + 1 + (j + 2) * c_dim1] = f23;
1555                           c[i + (j + 3) * c_dim1] = f14;
1556                           c[i + 1 + (j + 3) * c_dim1] = f24;
1557                           c[i + 2 + j * c_dim1] = f31;
1558                           c[i + 3 + j * c_dim1] = f41;
1559                           c[i + 2 + (j + 1) * c_dim1] = f32;
1560                           c[i + 3 + (j + 1) * c_dim1] = f42;
1561                           c[i + 2 + (j + 2) * c_dim1] = f33;
1562                           c[i + 3 + (j + 2) * c_dim1] = f43;
1563                           c[i + 2 + (j + 3) * c_dim1] = f34;
1564                           c[i + 3 + (j + 3) * c_dim1] = f44;
1565                         }
1566                       if (uisec < isec)
1567                         {
1568                           i5 = ii + isec - 1;
1569                           for (i = ii + uisec; i <= i5; ++i)
1570                             {
1571                               f11 = c[i + j * c_dim1];
1572                               f12 = c[i + (j + 1) * c_dim1];
1573                               f13 = c[i + (j + 2) * c_dim1];
1574                               f14 = c[i + (j + 3) * c_dim1];
1575                               i6 = ll + lsec - 1;
1576                               for (l = ll; l <= i6; ++l)
1577                                 {
1578                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1579                                           257] * b[l + j * b_dim1];
1580                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1581                                           257] * b[l + (j + 1) * b_dim1];
1582                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583                                           257] * b[l + (j + 2) * b_dim1];
1584                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585                                           257] * b[l + (j + 3) * b_dim1];
1586                                 }
1587                               c[i + j * c_dim1] = f11;
1588                               c[i + (j + 1) * c_dim1] = f12;
1589                               c[i + (j + 2) * c_dim1] = f13;
1590                               c[i + (j + 3) * c_dim1] = f14;
1591                             }
1592                         }
1593                     }
1594                   if (ujsec < jsec)
1595                     {
1596                       i4 = jj + jsec - 1;
1597                       for (j = jj + ujsec; j <= i4; ++j)
1598                         {
1599                           i5 = ii + uisec - 1;
1600                           for (i = ii; i <= i5; i += 4)
1601                             {
1602                               f11 = c[i + j * c_dim1];
1603                               f21 = c[i + 1 + j * c_dim1];
1604                               f31 = c[i + 2 + j * c_dim1];
1605                               f41 = c[i + 3 + j * c_dim1];
1606                               i6 = ll + lsec - 1;
1607                               for (l = ll; l <= i6; ++l)
1608                                 {
1609                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1610                                           257] * b[l + j * b_dim1];
1611                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1612                                           257] * b[l + j * b_dim1];
1613                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1614                                           257] * b[l + j * b_dim1];
1615                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1616                                           257] * b[l + j * b_dim1];
1617                                 }
1618                               c[i + j * c_dim1] = f11;
1619                               c[i + 1 + j * c_dim1] = f21;
1620                               c[i + 2 + j * c_dim1] = f31;
1621                               c[i + 3 + j * c_dim1] = f41;
1622                             }
1623                           i5 = ii + isec - 1;
1624                           for (i = ii + uisec; i <= i5; ++i)
1625                             {
1626                               f11 = c[i + j * c_dim1];
1627                               i6 = ll + lsec - 1;
1628                               for (l = ll; l <= i6; ++l)
1629                                 {
1630                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631                                           257] * b[l + j * b_dim1];
1632                                 }
1633                               c[i + j * c_dim1] = f11;
1634                             }
1635                         }
1636                     }
1637                 }
1638             }
1639         }
1640       free(t1);
1641       return;
1642     }
1643   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1644     {
1645       if (GFC_DESCRIPTOR_RANK (a) != 1)
1646         {
1647           const GFC_INTEGER_4 *restrict abase_x;
1648           const GFC_INTEGER_4 *restrict bbase_y;
1649           GFC_INTEGER_4 *restrict dest_y;
1650           GFC_INTEGER_4 s;
1651
1652           for (y = 0; y < ycount; y++)
1653             {
1654               bbase_y = &bbase[y*bystride];
1655               dest_y = &dest[y*rystride];
1656               for (x = 0; x < xcount; x++)
1657                 {
1658                   abase_x = &abase[x*axstride];
1659                   s = (GFC_INTEGER_4) 0;
1660                   for (n = 0; n < count; n++)
1661                     s += abase_x[n] * bbase_y[n];
1662                   dest_y[x] = s;
1663                 }
1664             }
1665         }
1666       else
1667         {
1668           const GFC_INTEGER_4 *restrict bbase_y;
1669           GFC_INTEGER_4 s;
1670
1671           for (y = 0; y < ycount; y++)
1672             {
1673               bbase_y = &bbase[y*bystride];
1674               s = (GFC_INTEGER_4) 0;
1675               for (n = 0; n < count; n++)
1676                 s += abase[n*axstride] * bbase_y[n];
1677               dest[y*rystride] = s;
1678             }
1679         }
1680     }
1681   else if (axstride < aystride)
1682     {
1683       for (y = 0; y < ycount; y++)
1684         for (x = 0; x < xcount; x++)
1685           dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
1686
1687       for (y = 0; y < ycount; y++)
1688         for (n = 0; n < count; n++)
1689           for (x = 0; x < xcount; x++)
1690             /* dest[x,y] += a[x,n] * b[n,y] */
1691             dest[x*rxstride + y*rystride] +=
1692                                         abase[x*axstride + n*aystride] *
1693                                         bbase[n*bxstride + y*bystride];
1694     }
1695   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1696     {
1697       const GFC_INTEGER_4 *restrict bbase_y;
1698       GFC_INTEGER_4 s;
1699
1700       for (y = 0; y < ycount; y++)
1701         {
1702           bbase_y = &bbase[y*bystride];
1703           s = (GFC_INTEGER_4) 0;
1704           for (n = 0; n < count; n++)
1705             s += abase[n*axstride] * bbase_y[n*bxstride];
1706           dest[y*rxstride] = s;
1707         }
1708     }
1709   else
1710     {
1711       const GFC_INTEGER_4 *restrict abase_x;
1712       const GFC_INTEGER_4 *restrict bbase_y;
1713       GFC_INTEGER_4 *restrict dest_y;
1714       GFC_INTEGER_4 s;
1715
1716       for (y = 0; y < ycount; y++)
1717         {
1718           bbase_y = &bbase[y*bystride];
1719           dest_y = &dest[y*rystride];
1720           for (x = 0; x < xcount; x++)
1721             {
1722               abase_x = &abase[x*axstride];
1723               s = (GFC_INTEGER_4) 0;
1724               for (n = 0; n < count; n++)
1725                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1726               dest_y[x*rxstride] = s;
1727             }
1728         }
1729     }
1730 }
1731 #undef POW3
1732 #undef min
1733 #undef max
1734
1735 #endif  /* HAVE_AVX512F */
1736
1737 /* Function to fall back to if there is no special processor-specific version.  */
1738 static void
1739 matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
1740         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1741         int blas_limit, blas_call gemm)
1742 {
1743   const GFC_INTEGER_4 * restrict abase;
1744   const GFC_INTEGER_4 * restrict bbase;
1745   GFC_INTEGER_4 * restrict dest;
1746
1747   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1748   index_type x, y, n, count, xcount, ycount;
1749
1750   assert (GFC_DESCRIPTOR_RANK (a) == 2
1751           || GFC_DESCRIPTOR_RANK (b) == 2);
1752
1753 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1754
1755    Either A or B (but not both) can be rank 1:
1756
1757    o One-dimensional argument A is implicitly treated as a row matrix
1758      dimensioned [1,count], so xcount=1.
1759
1760    o One-dimensional argument B is implicitly treated as a column matrix
1761      dimensioned [count, 1], so ycount=1.
1762 */
1763
1764   if (retarray->base_addr == NULL)
1765     {
1766       if (GFC_DESCRIPTOR_RANK (a) == 1)
1767         {
1768           GFC_DIMENSION_SET(retarray->dim[0], 0,
1769                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1770         }
1771       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1772         {
1773           GFC_DIMENSION_SET(retarray->dim[0], 0,
1774                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1775         }
1776       else
1777         {
1778           GFC_DIMENSION_SET(retarray->dim[0], 0,
1779                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1780
1781           GFC_DIMENSION_SET(retarray->dim[1], 0,
1782                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1783                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1784         }
1785
1786       retarray->base_addr
1787         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
1788       retarray->offset = 0;
1789     }
1790   else if (unlikely (compile_options.bounds_check))
1791     {
1792       index_type ret_extent, arg_extent;
1793
1794       if (GFC_DESCRIPTOR_RANK (a) == 1)
1795         {
1796           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1797           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1798           if (arg_extent != ret_extent)
1799             runtime_error ("Incorrect extent in return array in"
1800                            " MATMUL intrinsic: is %ld, should be %ld",
1801                            (long int) ret_extent, (long int) arg_extent);
1802         }
1803       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1804         {
1805           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1806           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1807           if (arg_extent != ret_extent)
1808             runtime_error ("Incorrect extent in return array in"
1809                            " MATMUL intrinsic: is %ld, should be %ld",
1810                            (long int) ret_extent, (long int) arg_extent);
1811         }
1812       else
1813         {
1814           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1815           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1816           if (arg_extent != ret_extent)
1817             runtime_error ("Incorrect extent in return array in"
1818                            " MATMUL intrinsic for dimension 1:"
1819                            " is %ld, should be %ld",
1820                            (long int) ret_extent, (long int) arg_extent);
1821
1822           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1823           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1824           if (arg_extent != ret_extent)
1825             runtime_error ("Incorrect extent in return array in"
1826                            " MATMUL intrinsic for dimension 2:"
1827                            " is %ld, should be %ld",
1828                            (long int) ret_extent, (long int) arg_extent);
1829         }
1830     }
1831
1832
1833   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1834     {
1835       /* One-dimensional result may be addressed in the code below
1836          either as a row or a column matrix. We want both cases to
1837          work. */
1838       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1839     }
1840   else
1841     {
1842       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1843       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1844     }
1845
1846
1847   if (GFC_DESCRIPTOR_RANK (a) == 1)
1848     {
1849       /* Treat it as a a row matrix A[1,count]. */
1850       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1851       aystride = 1;
1852
1853       xcount = 1;
1854       count = GFC_DESCRIPTOR_EXTENT(a,0);
1855     }
1856   else
1857     {
1858       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1859       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1860
1861       count = GFC_DESCRIPTOR_EXTENT(a,1);
1862       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1863     }
1864
1865   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1866     {
1867       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1868         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1869     }
1870
1871   if (GFC_DESCRIPTOR_RANK (b) == 1)
1872     {
1873       /* Treat it as a column matrix B[count,1] */
1874       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1875
1876       /* bystride should never be used for 1-dimensional b.
1877          in case it is we want it to cause a segfault, rather than
1878          an incorrect result. */
1879       bystride = 0xDEADBEEF;
1880       ycount = 1;
1881     }
1882   else
1883     {
1884       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1885       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1886       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1887     }
1888
1889   abase = a->base_addr;
1890   bbase = b->base_addr;
1891   dest = retarray->base_addr;
1892
1893   /* Now that everything is set up, we perform the multiplication
1894      itself.  */
1895
1896 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1897 #define min(a,b) ((a) <= (b) ? (a) : (b))
1898 #define max(a,b) ((a) >= (b) ? (a) : (b))
1899
1900   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1901       && (bxstride == 1 || bystride == 1)
1902       && (((float) xcount) * ((float) ycount) * ((float) count)
1903           > POW3(blas_limit)))
1904     {
1905       const int m = xcount, n = ycount, k = count, ldc = rystride;
1906       const GFC_INTEGER_4 one = 1, zero = 0;
1907       const int lda = (axstride == 1) ? aystride : axstride,
1908                 ldb = (bxstride == 1) ? bystride : bxstride;
1909
1910       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1911         {
1912           assert (gemm != NULL);
1913           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1914                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1915                 &ldc, 1, 1);
1916           return;
1917         }
1918     }
1919
1920   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1921     {
1922       /* This block of code implements a tuned matmul, derived from
1923          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1924
1925                Bo Kagstrom and Per Ling
1926                Department of Computing Science
1927                Umea University
1928                S-901 87 Umea, Sweden
1929
1930          from netlib.org, translated to C, and modified for matmul.m4.  */
1931
1932       const GFC_INTEGER_4 *a, *b;
1933       GFC_INTEGER_4 *c;
1934       const index_type m = xcount, n = ycount, k = count;
1935
1936       /* System generated locals */
1937       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1938                  i1, i2, i3, i4, i5, i6;
1939
1940       /* Local variables */
1941       GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
1942                  f13, f14, f23, f24, f33, f34, f43, f44;
1943       index_type i, j, l, ii, jj, ll;
1944       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1945       GFC_INTEGER_4 *t1;
1946
1947       a = abase;
1948       b = bbase;
1949       c = retarray->base_addr;
1950
1951       /* Parameter adjustments */
1952       c_dim1 = rystride;
1953       c_offset = 1 + c_dim1;
1954       c -= c_offset;
1955       a_dim1 = aystride;
1956       a_offset = 1 + a_dim1;
1957       a -= a_offset;
1958       b_dim1 = bystride;
1959       b_offset = 1 + b_dim1;
1960       b -= b_offset;
1961
1962       /* Early exit if possible */
1963       if (m == 0 || n == 0 || k == 0)
1964         return;
1965
1966       /* Adjust size of t1 to what is needed.  */
1967       index_type t1_dim;
1968       t1_dim = (a_dim1-1) * 256 + b_dim1;
1969       if (t1_dim > 65536)
1970         t1_dim = 65536;
1971
1972       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
1973
1974       /* Empty c first.  */
1975       for (j=1; j<=n; j++)
1976         for (i=1; i<=m; i++)
1977           c[i + j * c_dim1] = (GFC_INTEGER_4)0;
1978
1979       /* Start turning the crank. */
1980       i1 = n;
1981       for (jj = 1; jj <= i1; jj += 512)
1982         {
1983           /* Computing MIN */
1984           i2 = 512;
1985           i3 = n - jj + 1;
1986           jsec = min(i2,i3);
1987           ujsec = jsec - jsec % 4;
1988           i2 = k;
1989           for (ll = 1; ll <= i2; ll += 256)
1990             {
1991               /* Computing MIN */
1992               i3 = 256;
1993               i4 = k - ll + 1;
1994               lsec = min(i3,i4);
1995               ulsec = lsec - lsec % 2;
1996
1997               i3 = m;
1998               for (ii = 1; ii <= i3; ii += 256)
1999                 {
2000                   /* Computing MIN */
2001                   i4 = 256;
2002                   i5 = m - ii + 1;
2003                   isec = min(i4,i5);
2004                   uisec = isec - isec % 2;
2005                   i4 = ll + ulsec - 1;
2006                   for (l = ll; l <= i4; l += 2)
2007                     {
2008                       i5 = ii + uisec - 1;
2009                       for (i = ii; i <= i5; i += 2)
2010                         {
2011                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2012                                         a[i + l * a_dim1];
2013                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2014                                         a[i + (l + 1) * a_dim1];
2015                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2016                                         a[i + 1 + l * a_dim1];
2017                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2018                                         a[i + 1 + (l + 1) * a_dim1];
2019                         }
2020                       if (uisec < isec)
2021                         {
2022                           t1[l - ll + 1 + (isec << 8) - 257] =
2023                                     a[ii + isec - 1 + l * a_dim1];
2024                           t1[l - ll + 2 + (isec << 8) - 257] =
2025                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2026                         }
2027                     }
2028                   if (ulsec < lsec)
2029                     {
2030                       i4 = ii + isec - 1;
2031                       for (i = ii; i<= i4; ++i)
2032                         {
2033                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2034                                     a[i + (ll + lsec - 1) * a_dim1];
2035                         }
2036                     }
2037
2038                   uisec = isec - isec % 4;
2039                   i4 = jj + ujsec - 1;
2040                   for (j = jj; j <= i4; j += 4)
2041                     {
2042                       i5 = ii + uisec - 1;
2043                       for (i = ii; i <= i5; i += 4)
2044                         {
2045                           f11 = c[i + j * c_dim1];
2046                           f21 = c[i + 1 + j * c_dim1];
2047                           f12 = c[i + (j + 1) * c_dim1];
2048                           f22 = c[i + 1 + (j + 1) * c_dim1];
2049                           f13 = c[i + (j + 2) * c_dim1];
2050                           f23 = c[i + 1 + (j + 2) * c_dim1];
2051                           f14 = c[i + (j + 3) * c_dim1];
2052                           f24 = c[i + 1 + (j + 3) * c_dim1];
2053                           f31 = c[i + 2 + j * c_dim1];
2054                           f41 = c[i + 3 + j * c_dim1];
2055                           f32 = c[i + 2 + (j + 1) * c_dim1];
2056                           f42 = c[i + 3 + (j + 1) * c_dim1];
2057                           f33 = c[i + 2 + (j + 2) * c_dim1];
2058                           f43 = c[i + 3 + (j + 2) * c_dim1];
2059                           f34 = c[i + 2 + (j + 3) * c_dim1];
2060                           f44 = c[i + 3 + (j + 3) * c_dim1];
2061                           i6 = ll + lsec - 1;
2062                           for (l = ll; l <= i6; ++l)
2063                             {
2064                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2065                                       * b[l + j * b_dim1];
2066                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2067                                       * b[l + j * b_dim1];
2068                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2069                                       * b[l + (j + 1) * b_dim1];
2070                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2071                                       * b[l + (j + 1) * b_dim1];
2072                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2073                                       * b[l + (j + 2) * b_dim1];
2074                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2075                                       * b[l + (j + 2) * b_dim1];
2076                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2077                                       * b[l + (j + 3) * b_dim1];
2078                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2079                                       * b[l + (j + 3) * b_dim1];
2080                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2081                                       * b[l + j * b_dim1];
2082                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2083                                       * b[l + j * b_dim1];
2084                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2085                                       * b[l + (j + 1) * b_dim1];
2086                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2087                                       * b[l + (j + 1) * b_dim1];
2088                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2089                                       * b[l + (j + 2) * b_dim1];
2090                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2091                                       * b[l + (j + 2) * b_dim1];
2092                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2093                                       * b[l + (j + 3) * b_dim1];
2094                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2095                                       * b[l + (j + 3) * b_dim1];
2096                             }
2097                           c[i + j * c_dim1] = f11;
2098                           c[i + 1 + j * c_dim1] = f21;
2099                           c[i + (j + 1) * c_dim1] = f12;
2100                           c[i + 1 + (j + 1) * c_dim1] = f22;
2101                           c[i + (j + 2) * c_dim1] = f13;
2102                           c[i + 1 + (j + 2) * c_dim1] = f23;
2103                           c[i + (j + 3) * c_dim1] = f14;
2104                           c[i + 1 + (j + 3) * c_dim1] = f24;
2105                           c[i + 2 + j * c_dim1] = f31;
2106                           c[i + 3 + j * c_dim1] = f41;
2107                           c[i + 2 + (j + 1) * c_dim1] = f32;
2108                           c[i + 3 + (j + 1) * c_dim1] = f42;
2109                           c[i + 2 + (j + 2) * c_dim1] = f33;
2110                           c[i + 3 + (j + 2) * c_dim1] = f43;
2111                           c[i + 2 + (j + 3) * c_dim1] = f34;
2112                           c[i + 3 + (j + 3) * c_dim1] = f44;
2113                         }
2114                       if (uisec < isec)
2115                         {
2116                           i5 = ii + isec - 1;
2117                           for (i = ii + uisec; i <= i5; ++i)
2118                             {
2119                               f11 = c[i + j * c_dim1];
2120                               f12 = c[i + (j + 1) * c_dim1];
2121                               f13 = c[i + (j + 2) * c_dim1];
2122                               f14 = c[i + (j + 3) * c_dim1];
2123                               i6 = ll + lsec - 1;
2124                               for (l = ll; l <= i6; ++l)
2125                                 {
2126                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2127                                           257] * b[l + j * b_dim1];
2128                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2129                                           257] * b[l + (j + 1) * b_dim1];
2130                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2131                                           257] * b[l + (j + 2) * b_dim1];
2132                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2133                                           257] * b[l + (j + 3) * b_dim1];
2134                                 }
2135                               c[i + j * c_dim1] = f11;
2136                               c[i + (j + 1) * c_dim1] = f12;
2137                               c[i + (j + 2) * c_dim1] = f13;
2138                               c[i + (j + 3) * c_dim1] = f14;
2139                             }
2140                         }
2141                     }
2142                   if (ujsec < jsec)
2143                     {
2144                       i4 = jj + jsec - 1;
2145                       for (j = jj + ujsec; j <= i4; ++j)
2146                         {
2147                           i5 = ii + uisec - 1;
2148                           for (i = ii; i <= i5; i += 4)
2149                             {
2150                               f11 = c[i + j * c_dim1];
2151                               f21 = c[i + 1 + j * c_dim1];
2152                               f31 = c[i + 2 + j * c_dim1];
2153                               f41 = c[i + 3 + j * c_dim1];
2154                               i6 = ll + lsec - 1;
2155                               for (l = ll; l <= i6; ++l)
2156                                 {
2157                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2158                                           257] * b[l + j * b_dim1];
2159                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2160                                           257] * b[l + j * b_dim1];
2161                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2162                                           257] * b[l + j * b_dim1];
2163                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2164                                           257] * b[l + j * b_dim1];
2165                                 }
2166                               c[i + j * c_dim1] = f11;
2167                               c[i + 1 + j * c_dim1] = f21;
2168                               c[i + 2 + j * c_dim1] = f31;
2169                               c[i + 3 + j * c_dim1] = f41;
2170                             }
2171                           i5 = ii + isec - 1;
2172                           for (i = ii + uisec; i <= i5; ++i)
2173                             {
2174                               f11 = c[i + j * c_dim1];
2175                               i6 = ll + lsec - 1;
2176                               for (l = ll; l <= i6; ++l)
2177                                 {
2178                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2179                                           257] * b[l + j * b_dim1];
2180                                 }
2181                               c[i + j * c_dim1] = f11;
2182                             }
2183                         }
2184                     }
2185                 }
2186             }
2187         }
2188       free(t1);
2189       return;
2190     }
2191   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2192     {
2193       if (GFC_DESCRIPTOR_RANK (a) != 1)
2194         {
2195           const GFC_INTEGER_4 *restrict abase_x;
2196           const GFC_INTEGER_4 *restrict bbase_y;
2197           GFC_INTEGER_4 *restrict dest_y;
2198           GFC_INTEGER_4 s;
2199
2200           for (y = 0; y < ycount; y++)
2201             {
2202               bbase_y = &bbase[y*bystride];
2203               dest_y = &dest[y*rystride];
2204               for (x = 0; x < xcount; x++)
2205                 {
2206                   abase_x = &abase[x*axstride];
2207                   s = (GFC_INTEGER_4) 0;
2208                   for (n = 0; n < count; n++)
2209                     s += abase_x[n] * bbase_y[n];
2210                   dest_y[x] = s;
2211                 }
2212             }
2213         }
2214       else
2215         {
2216           const GFC_INTEGER_4 *restrict bbase_y;
2217           GFC_INTEGER_4 s;
2218
2219           for (y = 0; y < ycount; y++)
2220             {
2221               bbase_y = &bbase[y*bystride];
2222               s = (GFC_INTEGER_4) 0;
2223               for (n = 0; n < count; n++)
2224                 s += abase[n*axstride] * bbase_y[n];
2225               dest[y*rystride] = s;
2226             }
2227         }
2228     }
2229   else if (axstride < aystride)
2230     {
2231       for (y = 0; y < ycount; y++)
2232         for (x = 0; x < xcount; x++)
2233           dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
2234
2235       for (y = 0; y < ycount; y++)
2236         for (n = 0; n < count; n++)
2237           for (x = 0; x < xcount; x++)
2238             /* dest[x,y] += a[x,n] * b[n,y] */
2239             dest[x*rxstride + y*rystride] +=
2240                                         abase[x*axstride + n*aystride] *
2241                                         bbase[n*bxstride + y*bystride];
2242     }
2243   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2244     {
2245       const GFC_INTEGER_4 *restrict bbase_y;
2246       GFC_INTEGER_4 s;
2247
2248       for (y = 0; y < ycount; y++)
2249         {
2250           bbase_y = &bbase[y*bystride];
2251           s = (GFC_INTEGER_4) 0;
2252           for (n = 0; n < count; n++)
2253             s += abase[n*axstride] * bbase_y[n*bxstride];
2254           dest[y*rxstride] = s;
2255         }
2256     }
2257   else
2258     {
2259       const GFC_INTEGER_4 *restrict abase_x;
2260       const GFC_INTEGER_4 *restrict bbase_y;
2261       GFC_INTEGER_4 *restrict dest_y;
2262       GFC_INTEGER_4 s;
2263
2264       for (y = 0; y < ycount; y++)
2265         {
2266           bbase_y = &bbase[y*bystride];
2267           dest_y = &dest[y*rystride];
2268           for (x = 0; x < xcount; x++)
2269             {
2270               abase_x = &abase[x*axstride];
2271               s = (GFC_INTEGER_4) 0;
2272               for (n = 0; n < count; n++)
2273                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2274               dest_y[x*rxstride] = s;
2275             }
2276         }
2277     }
2278 }
2279 #undef POW3
2280 #undef min
2281 #undef max
2282
2283
2284 /* Compiling main function, with selection code for the processor.  */
2285
2286 /* Currently, this is i386 only.  Adjust for other architectures.  */
2287
2288 #include <config/i386/cpuinfo.h>
2289 void matmul_i4 (gfc_array_i4 * const restrict retarray,
2290         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2291         int blas_limit, blas_call gemm)
2292 {
2293   static void (*matmul_p) (gfc_array_i4 * const restrict retarray,
2294         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2295         int blas_limit, blas_call gemm);
2296
2297   void (*matmul_fn) (gfc_array_i4 * const restrict retarray,
2298         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2299         int blas_limit, blas_call gemm);
2300
2301   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2302   if (matmul_fn == NULL)
2303     {
2304       matmul_fn = matmul_i4_vanilla;
2305       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2306         {
2307           /* Run down the available processors in order of preference.  */
2308 #ifdef HAVE_AVX512F
2309           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2310             {
2311               matmul_fn = matmul_i4_avx512f;
2312               goto store;
2313             }
2314
2315 #endif  /* HAVE_AVX512F */
2316
2317 #ifdef HAVE_AVX2
2318           if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2319              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2320             {
2321               matmul_fn = matmul_i4_avx2;
2322               goto store;
2323             }
2324
2325 #endif
2326
2327 #ifdef HAVE_AVX
2328           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2329             {
2330               matmul_fn = matmul_i4_avx;
2331               goto store;
2332             }
2333 #endif  /* HAVE_AVX */
2334         }
2335    store:
2336       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2337    }
2338
2339    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2340 }
2341
2342 #else  /* Just the vanilla function.  */
2343
2344 void
2345 matmul_i4 (gfc_array_i4 * const restrict retarray,
2346         gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2347         int blas_limit, blas_call gemm)
2348 {
2349   const GFC_INTEGER_4 * restrict abase;
2350   const GFC_INTEGER_4 * restrict bbase;
2351   GFC_INTEGER_4 * restrict dest;
2352
2353   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2354   index_type x, y, n, count, xcount, ycount;
2355
2356   assert (GFC_DESCRIPTOR_RANK (a) == 2
2357           || GFC_DESCRIPTOR_RANK (b) == 2);
2358
2359 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2360
2361    Either A or B (but not both) can be rank 1:
2362
2363    o One-dimensional argument A is implicitly treated as a row matrix
2364      dimensioned [1,count], so xcount=1.
2365
2366    o One-dimensional argument B is implicitly treated as a column matrix
2367      dimensioned [count, 1], so ycount=1.
2368 */
2369
2370   if (retarray->base_addr == NULL)
2371     {
2372       if (GFC_DESCRIPTOR_RANK (a) == 1)
2373         {
2374           GFC_DIMENSION_SET(retarray->dim[0], 0,
2375                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2376         }
2377       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2378         {
2379           GFC_DIMENSION_SET(retarray->dim[0], 0,
2380                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2381         }
2382       else
2383         {
2384           GFC_DIMENSION_SET(retarray->dim[0], 0,
2385                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2386
2387           GFC_DIMENSION_SET(retarray->dim[1], 0,
2388                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2389                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2390         }
2391
2392       retarray->base_addr
2393         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
2394       retarray->offset = 0;
2395     }
2396   else if (unlikely (compile_options.bounds_check))
2397     {
2398       index_type ret_extent, arg_extent;
2399
2400       if (GFC_DESCRIPTOR_RANK (a) == 1)
2401         {
2402           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2403           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2404           if (arg_extent != ret_extent)
2405             runtime_error ("Incorrect extent in return array in"
2406                            " MATMUL intrinsic: is %ld, should be %ld",
2407                            (long int) ret_extent, (long int) arg_extent);
2408         }
2409       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2410         {
2411           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2412           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2413           if (arg_extent != ret_extent)
2414             runtime_error ("Incorrect extent in return array in"
2415                            " MATMUL intrinsic: is %ld, should be %ld",
2416                            (long int) ret_extent, (long int) arg_extent);
2417         }
2418       else
2419         {
2420           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2421           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2422           if (arg_extent != ret_extent)
2423             runtime_error ("Incorrect extent in return array in"
2424                            " MATMUL intrinsic for dimension 1:"
2425                            " is %ld, should be %ld",
2426                            (long int) ret_extent, (long int) arg_extent);
2427
2428           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2429           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2430           if (arg_extent != ret_extent)
2431             runtime_error ("Incorrect extent in return array in"
2432                            " MATMUL intrinsic for dimension 2:"
2433                            " is %ld, should be %ld",
2434                            (long int) ret_extent, (long int) arg_extent);
2435         }
2436     }
2437
2438
2439   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2440     {
2441       /* One-dimensional result may be addressed in the code below
2442          either as a row or a column matrix. We want both cases to
2443          work. */
2444       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2445     }
2446   else
2447     {
2448       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2449       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2450     }
2451
2452
2453   if (GFC_DESCRIPTOR_RANK (a) == 1)
2454     {
2455       /* Treat it as a a row matrix A[1,count]. */
2456       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2457       aystride = 1;
2458
2459       xcount = 1;
2460       count = GFC_DESCRIPTOR_EXTENT(a,0);
2461     }
2462   else
2463     {
2464       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2465       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2466
2467       count = GFC_DESCRIPTOR_EXTENT(a,1);
2468       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2469     }
2470
2471   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2472     {
2473       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2474         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2475     }
2476
2477   if (GFC_DESCRIPTOR_RANK (b) == 1)
2478     {
2479       /* Treat it as a column matrix B[count,1] */
2480       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2481
2482       /* bystride should never be used for 1-dimensional b.
2483          in case it is we want it to cause a segfault, rather than
2484          an incorrect result. */
2485       bystride = 0xDEADBEEF;
2486       ycount = 1;
2487     }
2488   else
2489     {
2490       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2491       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2492       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2493     }
2494
2495   abase = a->base_addr;
2496   bbase = b->base_addr;
2497   dest = retarray->base_addr;
2498
2499   /* Now that everything is set up, we perform the multiplication
2500      itself.  */
2501
2502 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2503 #define min(a,b) ((a) <= (b) ? (a) : (b))
2504 #define max(a,b) ((a) >= (b) ? (a) : (b))
2505
2506   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2507       && (bxstride == 1 || bystride == 1)
2508       && (((float) xcount) * ((float) ycount) * ((float) count)
2509           > POW3(blas_limit)))
2510     {
2511       const int m = xcount, n = ycount, k = count, ldc = rystride;
2512       const GFC_INTEGER_4 one = 1, zero = 0;
2513       const int lda = (axstride == 1) ? aystride : axstride,
2514                 ldb = (bxstride == 1) ? bystride : bxstride;
2515
2516       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2517         {
2518           assert (gemm != NULL);
2519           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2520                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2521                 &ldc, 1, 1);
2522           return;
2523         }
2524     }
2525
2526   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2527     {
2528       /* This block of code implements a tuned matmul, derived from
2529          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2530
2531                Bo Kagstrom and Per Ling
2532                Department of Computing Science
2533                Umea University
2534                S-901 87 Umea, Sweden
2535
2536          from netlib.org, translated to C, and modified for matmul.m4.  */
2537
2538       const GFC_INTEGER_4 *a, *b;
2539       GFC_INTEGER_4 *c;
2540       const index_type m = xcount, n = ycount, k = count;
2541
2542       /* System generated locals */
2543       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2544                  i1, i2, i3, i4, i5, i6;
2545
2546       /* Local variables */
2547       GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
2548                  f13, f14, f23, f24, f33, f34, f43, f44;
2549       index_type i, j, l, ii, jj, ll;
2550       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2551       GFC_INTEGER_4 *t1;
2552
2553       a = abase;
2554       b = bbase;
2555       c = retarray->base_addr;
2556
2557       /* Parameter adjustments */
2558       c_dim1 = rystride;
2559       c_offset = 1 + c_dim1;
2560       c -= c_offset;
2561       a_dim1 = aystride;
2562       a_offset = 1 + a_dim1;
2563       a -= a_offset;
2564       b_dim1 = bystride;
2565       b_offset = 1 + b_dim1;
2566       b -= b_offset;
2567
2568       /* Early exit if possible */
2569       if (m == 0 || n == 0 || k == 0)
2570         return;
2571
2572       /* Adjust size of t1 to what is needed.  */
2573       index_type t1_dim;
2574       t1_dim = (a_dim1-1) * 256 + b_dim1;
2575       if (t1_dim > 65536)
2576         t1_dim = 65536;
2577
2578       t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
2579
2580       /* Empty c first.  */
2581       for (j=1; j<=n; j++)
2582         for (i=1; i<=m; i++)
2583           c[i + j * c_dim1] = (GFC_INTEGER_4)0;
2584
2585       /* Start turning the crank. */
2586       i1 = n;
2587       for (jj = 1; jj <= i1; jj += 512)
2588         {
2589           /* Computing MIN */
2590           i2 = 512;
2591           i3 = n - jj + 1;
2592           jsec = min(i2,i3);
2593           ujsec = jsec - jsec % 4;
2594           i2 = k;
2595           for (ll = 1; ll <= i2; ll += 256)
2596             {
2597               /* Computing MIN */
2598               i3 = 256;
2599               i4 = k - ll + 1;
2600               lsec = min(i3,i4);
2601               ulsec = lsec - lsec % 2;
2602
2603               i3 = m;
2604               for (ii = 1; ii <= i3; ii += 256)
2605                 {
2606                   /* Computing MIN */
2607                   i4 = 256;
2608                   i5 = m - ii + 1;
2609                   isec = min(i4,i5);
2610                   uisec = isec - isec % 2;
2611                   i4 = ll + ulsec - 1;
2612                   for (l = ll; l <= i4; l += 2)
2613                     {
2614                       i5 = ii + uisec - 1;
2615                       for (i = ii; i <= i5; i += 2)
2616                         {
2617                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2618                                         a[i + l * a_dim1];
2619                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2620                                         a[i + (l + 1) * a_dim1];
2621                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2622                                         a[i + 1 + l * a_dim1];
2623                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2624                                         a[i + 1 + (l + 1) * a_dim1];
2625                         }
2626                       if (uisec < isec)
2627                         {
2628                           t1[l - ll + 1 + (isec << 8) - 257] =
2629                                     a[ii + isec - 1 + l * a_dim1];
2630                           t1[l - ll + 2 + (isec << 8) - 257] =
2631                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2632                         }
2633                     }
2634                   if (ulsec < lsec)
2635                     {
2636                       i4 = ii + isec - 1;
2637                       for (i = ii; i<= i4; ++i)
2638                         {
2639                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2640                                     a[i + (ll + lsec - 1) * a_dim1];
2641                         }
2642                     }
2643
2644                   uisec = isec - isec % 4;
2645                   i4 = jj + ujsec - 1;
2646                   for (j = jj; j <= i4; j += 4)
2647                     {
2648                       i5 = ii + uisec - 1;
2649                       for (i = ii; i <= i5; i += 4)
2650                         {
2651                           f11 = c[i + j * c_dim1];
2652                           f21 = c[i + 1 + j * c_dim1];
2653                           f12 = c[i + (j + 1) * c_dim1];
2654                           f22 = c[i + 1 + (j + 1) * c_dim1];
2655                           f13 = c[i + (j + 2) * c_dim1];
2656                           f23 = c[i + 1 + (j + 2) * c_dim1];
2657                           f14 = c[i + (j + 3) * c_dim1];
2658                           f24 = c[i + 1 + (j + 3) * c_dim1];
2659                           f31 = c[i + 2 + j * c_dim1];
2660                           f41 = c[i + 3 + j * c_dim1];
2661                           f32 = c[i + 2 + (j + 1) * c_dim1];
2662                           f42 = c[i + 3 + (j + 1) * c_dim1];
2663                           f33 = c[i + 2 + (j + 2) * c_dim1];
2664                           f43 = c[i + 3 + (j + 2) * c_dim1];
2665                           f34 = c[i + 2 + (j + 3) * c_dim1];
2666                           f44 = c[i + 3 + (j + 3) * c_dim1];
2667                           i6 = ll + lsec - 1;
2668                           for (l = ll; l <= i6; ++l)
2669                             {
2670                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2671                                       * b[l + j * b_dim1];
2672                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2673                                       * b[l + j * b_dim1];
2674                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2675                                       * b[l + (j + 1) * b_dim1];
2676                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2677                                       * b[l + (j + 1) * b_dim1];
2678                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2679                                       * b[l + (j + 2) * b_dim1];
2680                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2681                                       * b[l + (j + 2) * b_dim1];
2682                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2683                                       * b[l + (j + 3) * b_dim1];
2684                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2685                                       * b[l + (j + 3) * b_dim1];
2686                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2687                                       * b[l + j * b_dim1];
2688                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2689                                       * b[l + j * b_dim1];
2690                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2691                                       * b[l + (j + 1) * b_dim1];
2692                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2693                                       * b[l + (j + 1) * b_dim1];
2694                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2695                                       * b[l + (j + 2) * b_dim1];
2696                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2697                                       * b[l + (j + 2) * b_dim1];
2698                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2699                                       * b[l + (j + 3) * b_dim1];
2700                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2701                                       * b[l + (j + 3) * b_dim1];
2702                             }
2703                           c[i + j * c_dim1] = f11;
2704                           c[i + 1 + j * c_dim1] = f21;
2705                           c[i + (j + 1) * c_dim1] = f12;
2706                           c[i + 1 + (j + 1) * c_dim1] = f22;
2707                           c[i + (j + 2) * c_dim1] = f13;
2708                           c[i + 1 + (j + 2) * c_dim1] = f23;
2709                           c[i + (j + 3) * c_dim1] = f14;
2710                           c[i + 1 + (j + 3) * c_dim1] = f24;
2711                           c[i + 2 + j * c_dim1] = f31;
2712                           c[i + 3 + j * c_dim1] = f41;
2713                           c[i + 2 + (j + 1) * c_dim1] = f32;
2714                           c[i + 3 + (j + 1) * c_dim1] = f42;
2715                           c[i + 2 + (j + 2) * c_dim1] = f33;
2716                           c[i + 3 + (j + 2) * c_dim1] = f43;
2717                           c[i + 2 + (j + 3) * c_dim1] = f34;
2718                           c[i + 3 + (j + 3) * c_dim1] = f44;
2719                         }
2720                       if (uisec < isec)
2721                         {
2722                           i5 = ii + isec - 1;
2723                           for (i = ii + uisec; i <= i5; ++i)
2724                             {
2725                               f11 = c[i + j * c_dim1];
2726                               f12 = c[i + (j + 1) * c_dim1];
2727                               f13 = c[i + (j + 2) * c_dim1];
2728                               f14 = c[i + (j + 3) * c_dim1];
2729                               i6 = ll + lsec - 1;
2730                               for (l = ll; l <= i6; ++l)
2731                                 {
2732                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2733                                           257] * b[l + j * b_dim1];
2734                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2735                                           257] * b[l + (j + 1) * b_dim1];
2736                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2737                                           257] * b[l + (j + 2) * b_dim1];
2738                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2739                                           257] * b[l + (j + 3) * b_dim1];
2740                                 }
2741                               c[i + j * c_dim1] = f11;
2742                               c[i + (j + 1) * c_dim1] = f12;
2743                               c[i + (j + 2) * c_dim1] = f13;
2744                               c[i + (j + 3) * c_dim1] = f14;
2745                             }
2746                         }
2747                     }
2748                   if (ujsec < jsec)
2749                     {
2750                       i4 = jj + jsec - 1;
2751                       for (j = jj + ujsec; j <= i4; ++j)
2752                         {
2753                           i5 = ii + uisec - 1;
2754                           for (i = ii; i <= i5; i += 4)
2755                             {
2756                               f11 = c[i + j * c_dim1];
2757                               f21 = c[i + 1 + j * c_dim1];
2758                               f31 = c[i + 2 + j * c_dim1];
2759                               f41 = c[i + 3 + j * c_dim1];
2760                               i6 = ll + lsec - 1;
2761                               for (l = ll; l <= i6; ++l)
2762                                 {
2763                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2764                                           257] * b[l + j * b_dim1];
2765                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2766                                           257] * b[l + j * b_dim1];
2767                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2768                                           257] * b[l + j * b_dim1];
2769                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2770                                           257] * b[l + j * b_dim1];
2771                                 }
2772                               c[i + j * c_dim1] = f11;
2773                               c[i + 1 + j * c_dim1] = f21;
2774                               c[i + 2 + j * c_dim1] = f31;
2775                               c[i + 3 + j * c_dim1] = f41;
2776                             }
2777                           i5 = ii + isec - 1;
2778                           for (i = ii + uisec; i <= i5; ++i)
2779                             {
2780                               f11 = c[i + j * c_dim1];
2781                               i6 = ll + lsec - 1;
2782                               for (l = ll; l <= i6; ++l)
2783                                 {
2784                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2785                                           257] * b[l + j * b_dim1];
2786                                 }
2787                               c[i + j * c_dim1] = f11;
2788                             }
2789                         }
2790                     }
2791                 }
2792             }
2793         }
2794       free(t1);
2795       return;
2796     }
2797   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2798     {
2799       if (GFC_DESCRIPTOR_RANK (a) != 1)
2800         {
2801           const GFC_INTEGER_4 *restrict abase_x;
2802           const GFC_INTEGER_4 *restrict bbase_y;
2803           GFC_INTEGER_4 *restrict dest_y;
2804           GFC_INTEGER_4 s;
2805
2806           for (y = 0; y < ycount; y++)
2807             {
2808               bbase_y = &bbase[y*bystride];
2809               dest_y = &dest[y*rystride];
2810               for (x = 0; x < xcount; x++)
2811                 {
2812                   abase_x = &abase[x*axstride];
2813                   s = (GFC_INTEGER_4) 0;
2814                   for (n = 0; n < count; n++)
2815                     s += abase_x[n] * bbase_y[n];
2816                   dest_y[x] = s;
2817                 }
2818             }
2819         }
2820       else
2821         {
2822           const GFC_INTEGER_4 *restrict bbase_y;
2823           GFC_INTEGER_4 s;
2824
2825           for (y = 0; y < ycount; y++)
2826             {
2827               bbase_y = &bbase[y*bystride];
2828               s = (GFC_INTEGER_4) 0;
2829               for (n = 0; n < count; n++)
2830                 s += abase[n*axstride] * bbase_y[n];
2831               dest[y*rystride] = s;
2832             }
2833         }
2834     }
2835   else if (axstride < aystride)
2836     {
2837       for (y = 0; y < ycount; y++)
2838         for (x = 0; x < xcount; x++)
2839           dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
2840
2841       for (y = 0; y < ycount; y++)
2842         for (n = 0; n < count; n++)
2843           for (x = 0; x < xcount; x++)
2844             /* dest[x,y] += a[x,n] * b[n,y] */
2845             dest[x*rxstride + y*rystride] +=
2846                                         abase[x*axstride + n*aystride] *
2847                                         bbase[n*bxstride + y*bystride];
2848     }
2849   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2850     {
2851       const GFC_INTEGER_4 *restrict bbase_y;
2852       GFC_INTEGER_4 s;
2853
2854       for (y = 0; y < ycount; y++)
2855         {
2856           bbase_y = &bbase[y*bystride];
2857           s = (GFC_INTEGER_4) 0;
2858           for (n = 0; n < count; n++)
2859             s += abase[n*axstride] * bbase_y[n*bxstride];
2860           dest[y*rxstride] = s;
2861         }
2862     }
2863   else
2864     {
2865       const GFC_INTEGER_4 *restrict abase_x;
2866       const GFC_INTEGER_4 *restrict bbase_y;
2867       GFC_INTEGER_4 *restrict dest_y;
2868       GFC_INTEGER_4 s;
2869
2870       for (y = 0; y < ycount; y++)
2871         {
2872           bbase_y = &bbase[y*bystride];
2873           dest_y = &dest[y*rystride];
2874           for (x = 0; x < xcount; x++)
2875             {
2876               abase_x = &abase[x*axstride];
2877               s = (GFC_INTEGER_4) 0;
2878               for (n = 0; n < count; n++)
2879                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2880               dest_y[x*rxstride] = s;
2881             }
2882         }
2883     }
2884 }
2885 #undef POW3
2886 #undef min
2887 #undef max
2888
2889 #endif
2890 #endif
2891