]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmul_i2.c
Daily bump.
[thirdparty/gcc.git] / libgfortran / generated / matmul_i2.c
CommitLineData
567c915b 1/* Implementation of the MATMUL intrinsic
85ec4feb 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
567c915b
TK
3 Contributed by Paul Brook <paul@nowt.org>
4
21d1335b 5This file is part of the GNU Fortran runtime library (libgfortran).
567c915b
TK
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
748086b7 10version 3 of the License, or (at your option) any later version.
567c915b
TK
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
748086b7
JJ
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
567c915b 25
36ae8a61 26#include "libgfortran.h"
567c915b
TK
27#include <string.h>
28#include <assert.h>
36ae8a61 29
567c915b
TK
30
31#if defined (HAVE_GFC_INTEGER_2)
32
33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07 34 passed to us by the front-end, in which case we call it for large
567c915b
TK
35 matrices. */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
39 const int *, const GFC_INTEGER_2 *, const int *,
40 const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
41 int, int);
42
43/* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66*/
67
68/* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
73 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75export_proto(matmul_i2);
76
31cfd832
TK
77/* Put exhaustive list of possible architectures here here, ORed together. */
78
79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81#ifdef HAVE_AVX
82static void
83matmul_i2_avx (gfc_array_i2 * const restrict retarray,
84 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86static void
87matmul_i2_avx (gfc_array_i2 * const restrict retarray,
88 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90{
91 const GFC_INTEGER_2 * restrict abase;
92 const GFC_INTEGER_2 * restrict bbase;
93 GFC_INTEGER_2 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110*/
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
6ce6a84a
TK
225 The value is only used for calculation of the
226 memory by the buffer. */
227 bystride = 256;
31cfd832
TK
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245#define min(a,b) ((a) <= (b) ? (a) : (b))
246#define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_INTEGER_2 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_INTEGER_2 *a, *b;
281 GFC_INTEGER_2 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
fd991039 289 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
31cfd832
TK
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
8e5f30dc 293 GFC_INTEGER_2 *t1;
31cfd832
TK
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
bbf97416
TK
310 /* Empty c first. */
311 for (j=1; j<=n; j++)
312 for (i=1; i<=m; i++)
313 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
314
31cfd832
TK
315 /* Early exit if possible */
316 if (m == 0 || n == 0 || k == 0)
317 return;
318
fd991039 319 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
320 index_type t1_dim, a_sz;
321 if (aystride == 1)
322 a_sz = rystride;
323 else
324 a_sz = a_dim1;
325
326 t1_dim = a_sz * 256 + b_dim1;
fd991039
TK
327 if (t1_dim > 65536)
328 t1_dim = 65536;
329
8e5f30dc 330 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
fd991039 331
31cfd832
TK
332 /* Start turning the crank. */
333 i1 = n;
334 for (jj = 1; jj <= i1; jj += 512)
335 {
336 /* Computing MIN */
337 i2 = 512;
338 i3 = n - jj + 1;
339 jsec = min(i2,i3);
340 ujsec = jsec - jsec % 4;
341 i2 = k;
342 for (ll = 1; ll <= i2; ll += 256)
343 {
344 /* Computing MIN */
345 i3 = 256;
346 i4 = k - ll + 1;
347 lsec = min(i3,i4);
348 ulsec = lsec - lsec % 2;
349
350 i3 = m;
351 for (ii = 1; ii <= i3; ii += 256)
352 {
353 /* Computing MIN */
354 i4 = 256;
355 i5 = m - ii + 1;
356 isec = min(i4,i5);
357 uisec = isec - isec % 2;
358 i4 = ll + ulsec - 1;
359 for (l = ll; l <= i4; l += 2)
360 {
361 i5 = ii + uisec - 1;
362 for (i = ii; i <= i5; i += 2)
363 {
364 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
365 a[i + l * a_dim1];
366 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
367 a[i + (l + 1) * a_dim1];
368 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
369 a[i + 1 + l * a_dim1];
370 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
371 a[i + 1 + (l + 1) * a_dim1];
372 }
373 if (uisec < isec)
374 {
375 t1[l - ll + 1 + (isec << 8) - 257] =
376 a[ii + isec - 1 + l * a_dim1];
377 t1[l - ll + 2 + (isec << 8) - 257] =
378 a[ii + isec - 1 + (l + 1) * a_dim1];
379 }
380 }
381 if (ulsec < lsec)
382 {
383 i4 = ii + isec - 1;
384 for (i = ii; i<= i4; ++i)
385 {
386 t1[lsec + ((i - ii + 1) << 8) - 257] =
387 a[i + (ll + lsec - 1) * a_dim1];
388 }
389 }
390
391 uisec = isec - isec % 4;
392 i4 = jj + ujsec - 1;
393 for (j = jj; j <= i4; j += 4)
394 {
395 i5 = ii + uisec - 1;
396 for (i = ii; i <= i5; i += 4)
397 {
398 f11 = c[i + j * c_dim1];
399 f21 = c[i + 1 + j * c_dim1];
400 f12 = c[i + (j + 1) * c_dim1];
401 f22 = c[i + 1 + (j + 1) * c_dim1];
402 f13 = c[i + (j + 2) * c_dim1];
403 f23 = c[i + 1 + (j + 2) * c_dim1];
404 f14 = c[i + (j + 3) * c_dim1];
405 f24 = c[i + 1 + (j + 3) * c_dim1];
406 f31 = c[i + 2 + j * c_dim1];
407 f41 = c[i + 3 + j * c_dim1];
408 f32 = c[i + 2 + (j + 1) * c_dim1];
409 f42 = c[i + 3 + (j + 1) * c_dim1];
410 f33 = c[i + 2 + (j + 2) * c_dim1];
411 f43 = c[i + 3 + (j + 2) * c_dim1];
412 f34 = c[i + 2 + (j + 3) * c_dim1];
413 f44 = c[i + 3 + (j + 3) * c_dim1];
414 i6 = ll + lsec - 1;
415 for (l = ll; l <= i6; ++l)
416 {
417 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
418 * b[l + j * b_dim1];
419 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
420 * b[l + j * b_dim1];
421 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
422 * b[l + (j + 1) * b_dim1];
423 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
424 * b[l + (j + 1) * b_dim1];
425 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
426 * b[l + (j + 2) * b_dim1];
427 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
428 * b[l + (j + 2) * b_dim1];
429 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 * b[l + (j + 3) * b_dim1];
431 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 * b[l + (j + 3) * b_dim1];
433 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
434 * b[l + j * b_dim1];
435 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
436 * b[l + j * b_dim1];
437 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
438 * b[l + (j + 1) * b_dim1];
439 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
440 * b[l + (j + 1) * b_dim1];
441 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
442 * b[l + (j + 2) * b_dim1];
443 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
444 * b[l + (j + 2) * b_dim1];
445 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 * b[l + (j + 3) * b_dim1];
447 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 * b[l + (j + 3) * b_dim1];
449 }
450 c[i + j * c_dim1] = f11;
451 c[i + 1 + j * c_dim1] = f21;
452 c[i + (j + 1) * c_dim1] = f12;
453 c[i + 1 + (j + 1) * c_dim1] = f22;
454 c[i + (j + 2) * c_dim1] = f13;
455 c[i + 1 + (j + 2) * c_dim1] = f23;
456 c[i + (j + 3) * c_dim1] = f14;
457 c[i + 1 + (j + 3) * c_dim1] = f24;
458 c[i + 2 + j * c_dim1] = f31;
459 c[i + 3 + j * c_dim1] = f41;
460 c[i + 2 + (j + 1) * c_dim1] = f32;
461 c[i + 3 + (j + 1) * c_dim1] = f42;
462 c[i + 2 + (j + 2) * c_dim1] = f33;
463 c[i + 3 + (j + 2) * c_dim1] = f43;
464 c[i + 2 + (j + 3) * c_dim1] = f34;
465 c[i + 3 + (j + 3) * c_dim1] = f44;
466 }
467 if (uisec < isec)
468 {
469 i5 = ii + isec - 1;
470 for (i = ii + uisec; i <= i5; ++i)
471 {
472 f11 = c[i + j * c_dim1];
473 f12 = c[i + (j + 1) * c_dim1];
474 f13 = c[i + (j + 2) * c_dim1];
475 f14 = c[i + (j + 3) * c_dim1];
476 i6 = ll + lsec - 1;
477 for (l = ll; l <= i6; ++l)
478 {
479 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + j * b_dim1];
481 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
482 257] * b[l + (j + 1) * b_dim1];
483 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
484 257] * b[l + (j + 2) * b_dim1];
485 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
486 257] * b[l + (j + 3) * b_dim1];
487 }
488 c[i + j * c_dim1] = f11;
489 c[i + (j + 1) * c_dim1] = f12;
490 c[i + (j + 2) * c_dim1] = f13;
491 c[i + (j + 3) * c_dim1] = f14;
492 }
493 }
494 }
495 if (ujsec < jsec)
496 {
497 i4 = jj + jsec - 1;
498 for (j = jj + ujsec; j <= i4; ++j)
499 {
500 i5 = ii + uisec - 1;
501 for (i = ii; i <= i5; i += 4)
502 {
503 f11 = c[i + j * c_dim1];
504 f21 = c[i + 1 + j * c_dim1];
505 f31 = c[i + 2 + j * c_dim1];
506 f41 = c[i + 3 + j * c_dim1];
507 i6 = ll + lsec - 1;
508 for (l = ll; l <= i6; ++l)
509 {
510 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 257] * b[l + j * b_dim1];
512 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
513 257] * b[l + j * b_dim1];
514 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
515 257] * b[l + j * b_dim1];
516 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
517 257] * b[l + j * b_dim1];
518 }
519 c[i + j * c_dim1] = f11;
520 c[i + 1 + j * c_dim1] = f21;
521 c[i + 2 + j * c_dim1] = f31;
522 c[i + 3 + j * c_dim1] = f41;
523 }
524 i5 = ii + isec - 1;
525 for (i = ii + uisec; i <= i5; ++i)
526 {
527 f11 = c[i + j * c_dim1];
528 i6 = ll + lsec - 1;
529 for (l = ll; l <= i6; ++l)
530 {
531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
532 257] * b[l + j * b_dim1];
533 }
534 c[i + j * c_dim1] = f11;
535 }
536 }
537 }
538 }
539 }
540 }
8e5f30dc 541 free(t1);
31cfd832
TK
542 return;
543 }
544 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
545 {
546 if (GFC_DESCRIPTOR_RANK (a) != 1)
547 {
548 const GFC_INTEGER_2 *restrict abase_x;
549 const GFC_INTEGER_2 *restrict bbase_y;
550 GFC_INTEGER_2 *restrict dest_y;
551 GFC_INTEGER_2 s;
552
553 for (y = 0; y < ycount; y++)
554 {
555 bbase_y = &bbase[y*bystride];
556 dest_y = &dest[y*rystride];
557 for (x = 0; x < xcount; x++)
558 {
559 abase_x = &abase[x*axstride];
560 s = (GFC_INTEGER_2) 0;
561 for (n = 0; n < count; n++)
562 s += abase_x[n] * bbase_y[n];
563 dest_y[x] = s;
564 }
565 }
566 }
567 else
568 {
569 const GFC_INTEGER_2 *restrict bbase_y;
570 GFC_INTEGER_2 s;
571
572 for (y = 0; y < ycount; y++)
573 {
574 bbase_y = &bbase[y*bystride];
575 s = (GFC_INTEGER_2) 0;
576 for (n = 0; n < count; n++)
577 s += abase[n*axstride] * bbase_y[n];
578 dest[y*rystride] = s;
579 }
580 }
581 }
582 else if (axstride < aystride)
583 {
584 for (y = 0; y < ycount; y++)
585 for (x = 0; x < xcount; x++)
586 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
587
588 for (y = 0; y < ycount; y++)
589 for (n = 0; n < count; n++)
590 for (x = 0; x < xcount; x++)
591 /* dest[x,y] += a[x,n] * b[n,y] */
592 dest[x*rxstride + y*rystride] +=
593 abase[x*axstride + n*aystride] *
594 bbase[n*bxstride + y*bystride];
595 }
596 else if (GFC_DESCRIPTOR_RANK (a) == 1)
597 {
598 const GFC_INTEGER_2 *restrict bbase_y;
599 GFC_INTEGER_2 s;
600
601 for (y = 0; y < ycount; y++)
602 {
603 bbase_y = &bbase[y*bystride];
604 s = (GFC_INTEGER_2) 0;
605 for (n = 0; n < count; n++)
606 s += abase[n*axstride] * bbase_y[n*bxstride];
607 dest[y*rxstride] = s;
608 }
609 }
610 else
611 {
612 const GFC_INTEGER_2 *restrict abase_x;
613 const GFC_INTEGER_2 *restrict bbase_y;
614 GFC_INTEGER_2 *restrict dest_y;
615 GFC_INTEGER_2 s;
616
617 for (y = 0; y < ycount; y++)
618 {
619 bbase_y = &bbase[y*bystride];
620 dest_y = &dest[y*rystride];
621 for (x = 0; x < xcount; x++)
622 {
623 abase_x = &abase[x*axstride];
624 s = (GFC_INTEGER_2) 0;
625 for (n = 0; n < count; n++)
626 s += abase_x[n*aystride] * bbase_y[n*bxstride];
627 dest_y[x*rxstride] = s;
628 }
629 }
630 }
631}
632#undef POW3
633#undef min
634#undef max
635
636#endif /* HAVE_AVX */
637
638#ifdef HAVE_AVX2
639static void
640matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
641 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
6d03bdcc 642 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
31cfd832
TK
643static void
644matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
645 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
646 int blas_limit, blas_call gemm)
647{
648 const GFC_INTEGER_2 * restrict abase;
649 const GFC_INTEGER_2 * restrict bbase;
650 GFC_INTEGER_2 * restrict dest;
651
652 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
653 index_type x, y, n, count, xcount, ycount;
654
655 assert (GFC_DESCRIPTOR_RANK (a) == 2
656 || GFC_DESCRIPTOR_RANK (b) == 2);
657
658/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
659
660 Either A or B (but not both) can be rank 1:
661
662 o One-dimensional argument A is implicitly treated as a row matrix
663 dimensioned [1,count], so xcount=1.
664
665 o One-dimensional argument B is implicitly treated as a column matrix
666 dimensioned [count, 1], so ycount=1.
667*/
668
669 if (retarray->base_addr == NULL)
670 {
671 if (GFC_DESCRIPTOR_RANK (a) == 1)
672 {
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
675 }
676 else if (GFC_DESCRIPTOR_RANK (b) == 1)
677 {
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
680 }
681 else
682 {
683 GFC_DIMENSION_SET(retarray->dim[0], 0,
684 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
685
686 GFC_DIMENSION_SET(retarray->dim[1], 0,
687 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
688 GFC_DESCRIPTOR_EXTENT(retarray,0));
689 }
690
691 retarray->base_addr
692 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
693 retarray->offset = 0;
694 }
695 else if (unlikely (compile_options.bounds_check))
696 {
697 index_type ret_extent, arg_extent;
698
699 if (GFC_DESCRIPTOR_RANK (a) == 1)
700 {
701 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
702 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
703 if (arg_extent != ret_extent)
704 runtime_error ("Incorrect extent in return array in"
705 " MATMUL intrinsic: is %ld, should be %ld",
706 (long int) ret_extent, (long int) arg_extent);
707 }
708 else if (GFC_DESCRIPTOR_RANK (b) == 1)
709 {
710 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
711 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
712 if (arg_extent != ret_extent)
713 runtime_error ("Incorrect extent in return array in"
714 " MATMUL intrinsic: is %ld, should be %ld",
715 (long int) ret_extent, (long int) arg_extent);
716 }
717 else
718 {
719 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
720 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
721 if (arg_extent != ret_extent)
722 runtime_error ("Incorrect extent in return array in"
723 " MATMUL intrinsic for dimension 1:"
724 " is %ld, should be %ld",
725 (long int) ret_extent, (long int) arg_extent);
726
727 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
728 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
729 if (arg_extent != ret_extent)
730 runtime_error ("Incorrect extent in return array in"
731 " MATMUL intrinsic for dimension 2:"
732 " is %ld, should be %ld",
733 (long int) ret_extent, (long int) arg_extent);
734 }
735 }
736
737
738 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
739 {
740 /* One-dimensional result may be addressed in the code below
741 either as a row or a column matrix. We want both cases to
742 work. */
743 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
744 }
745 else
746 {
747 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
748 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
749 }
750
751
752 if (GFC_DESCRIPTOR_RANK (a) == 1)
753 {
754 /* Treat it as a a row matrix A[1,count]. */
755 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
756 aystride = 1;
757
758 xcount = 1;
759 count = GFC_DESCRIPTOR_EXTENT(a,0);
760 }
761 else
762 {
763 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
764 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
765
766 count = GFC_DESCRIPTOR_EXTENT(a,1);
767 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
768 }
769
770 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
771 {
772 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
773 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
774 }
775
776 if (GFC_DESCRIPTOR_RANK (b) == 1)
777 {
778 /* Treat it as a column matrix B[count,1] */
779 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
780
781 /* bystride should never be used for 1-dimensional b.
6ce6a84a
TK
782 The value is only used for calculation of the
783 memory by the buffer. */
784 bystride = 256;
31cfd832
TK
785 ycount = 1;
786 }
787 else
788 {
789 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
790 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
791 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
792 }
793
794 abase = a->base_addr;
795 bbase = b->base_addr;
796 dest = retarray->base_addr;
797
798 /* Now that everything is set up, we perform the multiplication
799 itself. */
800
801#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802#define min(a,b) ((a) <= (b) ? (a) : (b))
803#define max(a,b) ((a) >= (b) ? (a) : (b))
804
805 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
806 && (bxstride == 1 || bystride == 1)
807 && (((float) xcount) * ((float) ycount) * ((float) count)
808 > POW3(blas_limit)))
809 {
810 const int m = xcount, n = ycount, k = count, ldc = rystride;
811 const GFC_INTEGER_2 one = 1, zero = 0;
812 const int lda = (axstride == 1) ? aystride : axstride,
813 ldb = (bxstride == 1) ? bystride : bxstride;
814
815 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
816 {
817 assert (gemm != NULL);
818 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
819 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
820 &ldc, 1, 1);
821 return;
822 }
823 }
824
825 if (rxstride == 1 && axstride == 1 && bxstride == 1)
826 {
827 /* This block of code implements a tuned matmul, derived from
828 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
829
830 Bo Kagstrom and Per Ling
831 Department of Computing Science
832 Umea University
833 S-901 87 Umea, Sweden
834
835 from netlib.org, translated to C, and modified for matmul.m4. */
836
837 const GFC_INTEGER_2 *a, *b;
838 GFC_INTEGER_2 *c;
839 const index_type m = xcount, n = ycount, k = count;
840
841 /* System generated locals */
842 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
843 i1, i2, i3, i4, i5, i6;
844
845 /* Local variables */
fd991039 846 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
31cfd832
TK
847 f13, f14, f23, f24, f33, f34, f43, f44;
848 index_type i, j, l, ii, jj, ll;
849 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
8e5f30dc 850 GFC_INTEGER_2 *t1;
31cfd832
TK
851
852 a = abase;
853 b = bbase;
854 c = retarray->base_addr;
855
856 /* Parameter adjustments */
857 c_dim1 = rystride;
858 c_offset = 1 + c_dim1;
859 c -= c_offset;
860 a_dim1 = aystride;
861 a_offset = 1 + a_dim1;
862 a -= a_offset;
863 b_dim1 = bystride;
864 b_offset = 1 + b_dim1;
865 b -= b_offset;
866
bbf97416
TK
867 /* Empty c first. */
868 for (j=1; j<=n; j++)
869 for (i=1; i<=m; i++)
870 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
871
31cfd832
TK
872 /* Early exit if possible */
873 if (m == 0 || n == 0 || k == 0)
874 return;
875
fd991039 876 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
877 index_type t1_dim, a_sz;
878 if (aystride == 1)
879 a_sz = rystride;
880 else
881 a_sz = a_dim1;
882
883 t1_dim = a_sz * 256 + b_dim1;
fd991039
TK
884 if (t1_dim > 65536)
885 t1_dim = 65536;
886
8e5f30dc 887 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
fd991039 888
31cfd832
TK
889 /* Start turning the crank. */
890 i1 = n;
891 for (jj = 1; jj <= i1; jj += 512)
892 {
893 /* Computing MIN */
894 i2 = 512;
895 i3 = n - jj + 1;
896 jsec = min(i2,i3);
897 ujsec = jsec - jsec % 4;
898 i2 = k;
899 for (ll = 1; ll <= i2; ll += 256)
900 {
901 /* Computing MIN */
902 i3 = 256;
903 i4 = k - ll + 1;
904 lsec = min(i3,i4);
905 ulsec = lsec - lsec % 2;
906
907 i3 = m;
908 for (ii = 1; ii <= i3; ii += 256)
909 {
910 /* Computing MIN */
911 i4 = 256;
912 i5 = m - ii + 1;
913 isec = min(i4,i5);
914 uisec = isec - isec % 2;
915 i4 = ll + ulsec - 1;
916 for (l = ll; l <= i4; l += 2)
917 {
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 2)
920 {
921 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
922 a[i + l * a_dim1];
923 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
924 a[i + (l + 1) * a_dim1];
925 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
926 a[i + 1 + l * a_dim1];
927 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
928 a[i + 1 + (l + 1) * a_dim1];
929 }
930 if (uisec < isec)
931 {
932 t1[l - ll + 1 + (isec << 8) - 257] =
933 a[ii + isec - 1 + l * a_dim1];
934 t1[l - ll + 2 + (isec << 8) - 257] =
935 a[ii + isec - 1 + (l + 1) * a_dim1];
936 }
937 }
938 if (ulsec < lsec)
939 {
940 i4 = ii + isec - 1;
941 for (i = ii; i<= i4; ++i)
942 {
943 t1[lsec + ((i - ii + 1) << 8) - 257] =
944 a[i + (ll + lsec - 1) * a_dim1];
945 }
946 }
947
948 uisec = isec - isec % 4;
949 i4 = jj + ujsec - 1;
950 for (j = jj; j <= i4; j += 4)
951 {
952 i5 = ii + uisec - 1;
953 for (i = ii; i <= i5; i += 4)
954 {
955 f11 = c[i + j * c_dim1];
956 f21 = c[i + 1 + j * c_dim1];
957 f12 = c[i + (j + 1) * c_dim1];
958 f22 = c[i + 1 + (j + 1) * c_dim1];
959 f13 = c[i + (j + 2) * c_dim1];
960 f23 = c[i + 1 + (j + 2) * c_dim1];
961 f14 = c[i + (j + 3) * c_dim1];
962 f24 = c[i + 1 + (j + 3) * c_dim1];
963 f31 = c[i + 2 + j * c_dim1];
964 f41 = c[i + 3 + j * c_dim1];
965 f32 = c[i + 2 + (j + 1) * c_dim1];
966 f42 = c[i + 3 + (j + 1) * c_dim1];
967 f33 = c[i + 2 + (j + 2) * c_dim1];
968 f43 = c[i + 3 + (j + 2) * c_dim1];
969 f34 = c[i + 2 + (j + 3) * c_dim1];
970 f44 = c[i + 3 + (j + 3) * c_dim1];
971 i6 = ll + lsec - 1;
972 for (l = ll; l <= i6; ++l)
973 {
974 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + j * b_dim1];
976 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + j * b_dim1];
978 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
979 * b[l + (j + 1) * b_dim1];
980 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
981 * b[l + (j + 1) * b_dim1];
982 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
983 * b[l + (j + 2) * b_dim1];
984 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
985 * b[l + (j + 2) * b_dim1];
986 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
987 * b[l + (j + 3) * b_dim1];
988 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
989 * b[l + (j + 3) * b_dim1];
990 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + j * b_dim1];
992 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + j * b_dim1];
994 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
995 * b[l + (j + 1) * b_dim1];
996 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
997 * b[l + (j + 1) * b_dim1];
998 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
999 * b[l + (j + 2) * b_dim1];
1000 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1001 * b[l + (j + 2) * b_dim1];
1002 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1003 * b[l + (j + 3) * b_dim1];
1004 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1005 * b[l + (j + 3) * b_dim1];
1006 }
1007 c[i + j * c_dim1] = f11;
1008 c[i + 1 + j * c_dim1] = f21;
1009 c[i + (j + 1) * c_dim1] = f12;
1010 c[i + 1 + (j + 1) * c_dim1] = f22;
1011 c[i + (j + 2) * c_dim1] = f13;
1012 c[i + 1 + (j + 2) * c_dim1] = f23;
1013 c[i + (j + 3) * c_dim1] = f14;
1014 c[i + 1 + (j + 3) * c_dim1] = f24;
1015 c[i + 2 + j * c_dim1] = f31;
1016 c[i + 3 + j * c_dim1] = f41;
1017 c[i + 2 + (j + 1) * c_dim1] = f32;
1018 c[i + 3 + (j + 1) * c_dim1] = f42;
1019 c[i + 2 + (j + 2) * c_dim1] = f33;
1020 c[i + 3 + (j + 2) * c_dim1] = f43;
1021 c[i + 2 + (j + 3) * c_dim1] = f34;
1022 c[i + 3 + (j + 3) * c_dim1] = f44;
1023 }
1024 if (uisec < isec)
1025 {
1026 i5 = ii + isec - 1;
1027 for (i = ii + uisec; i <= i5; ++i)
1028 {
1029 f11 = c[i + j * c_dim1];
1030 f12 = c[i + (j + 1) * c_dim1];
1031 f13 = c[i + (j + 2) * c_dim1];
1032 f14 = c[i + (j + 3) * c_dim1];
1033 i6 = ll + lsec - 1;
1034 for (l = ll; l <= i6; ++l)
1035 {
1036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1037 257] * b[l + j * b_dim1];
1038 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1039 257] * b[l + (j + 1) * b_dim1];
1040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + (j + 2) * b_dim1];
1042 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1043 257] * b[l + (j + 3) * b_dim1];
1044 }
1045 c[i + j * c_dim1] = f11;
1046 c[i + (j + 1) * c_dim1] = f12;
1047 c[i + (j + 2) * c_dim1] = f13;
1048 c[i + (j + 3) * c_dim1] = f14;
1049 }
1050 }
1051 }
1052 if (ujsec < jsec)
1053 {
1054 i4 = jj + jsec - 1;
1055 for (j = jj + ujsec; j <= i4; ++j)
1056 {
1057 i5 = ii + uisec - 1;
1058 for (i = ii; i <= i5; i += 4)
1059 {
1060 f11 = c[i + j * c_dim1];
1061 f21 = c[i + 1 + j * c_dim1];
1062 f31 = c[i + 2 + j * c_dim1];
1063 f41 = c[i + 3 + j * c_dim1];
1064 i6 = ll + lsec - 1;
1065 for (l = ll; l <= i6; ++l)
1066 {
1067 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1068 257] * b[l + j * b_dim1];
1069 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1070 257] * b[l + j * b_dim1];
1071 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1072 257] * b[l + j * b_dim1];
1073 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1074 257] * b[l + j * b_dim1];
1075 }
1076 c[i + j * c_dim1] = f11;
1077 c[i + 1 + j * c_dim1] = f21;
1078 c[i + 2 + j * c_dim1] = f31;
1079 c[i + 3 + j * c_dim1] = f41;
1080 }
1081 i5 = ii + isec - 1;
1082 for (i = ii + uisec; i <= i5; ++i)
1083 {
1084 f11 = c[i + j * c_dim1];
1085 i6 = ll + lsec - 1;
1086 for (l = ll; l <= i6; ++l)
1087 {
1088 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1089 257] * b[l + j * b_dim1];
1090 }
1091 c[i + j * c_dim1] = f11;
1092 }
1093 }
1094 }
1095 }
1096 }
1097 }
8e5f30dc 1098 free(t1);
31cfd832
TK
1099 return;
1100 }
1101 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1102 {
1103 if (GFC_DESCRIPTOR_RANK (a) != 1)
1104 {
1105 const GFC_INTEGER_2 *restrict abase_x;
1106 const GFC_INTEGER_2 *restrict bbase_y;
1107 GFC_INTEGER_2 *restrict dest_y;
1108 GFC_INTEGER_2 s;
1109
1110 for (y = 0; y < ycount; y++)
1111 {
1112 bbase_y = &bbase[y*bystride];
1113 dest_y = &dest[y*rystride];
1114 for (x = 0; x < xcount; x++)
1115 {
1116 abase_x = &abase[x*axstride];
1117 s = (GFC_INTEGER_2) 0;
1118 for (n = 0; n < count; n++)
1119 s += abase_x[n] * bbase_y[n];
1120 dest_y[x] = s;
1121 }
1122 }
1123 }
1124 else
1125 {
1126 const GFC_INTEGER_2 *restrict bbase_y;
1127 GFC_INTEGER_2 s;
1128
1129 for (y = 0; y < ycount; y++)
1130 {
1131 bbase_y = &bbase[y*bystride];
1132 s = (GFC_INTEGER_2) 0;
1133 for (n = 0; n < count; n++)
1134 s += abase[n*axstride] * bbase_y[n];
1135 dest[y*rystride] = s;
1136 }
1137 }
1138 }
1139 else if (axstride < aystride)
1140 {
1141 for (y = 0; y < ycount; y++)
1142 for (x = 0; x < xcount; x++)
1143 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1144
1145 for (y = 0; y < ycount; y++)
1146 for (n = 0; n < count; n++)
1147 for (x = 0; x < xcount; x++)
1148 /* dest[x,y] += a[x,n] * b[n,y] */
1149 dest[x*rxstride + y*rystride] +=
1150 abase[x*axstride + n*aystride] *
1151 bbase[n*bxstride + y*bystride];
1152 }
1153 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1154 {
1155 const GFC_INTEGER_2 *restrict bbase_y;
1156 GFC_INTEGER_2 s;
1157
1158 for (y = 0; y < ycount; y++)
1159 {
1160 bbase_y = &bbase[y*bystride];
1161 s = (GFC_INTEGER_2) 0;
1162 for (n = 0; n < count; n++)
1163 s += abase[n*axstride] * bbase_y[n*bxstride];
1164 dest[y*rxstride] = s;
1165 }
1166 }
1167 else
1168 {
1169 const GFC_INTEGER_2 *restrict abase_x;
1170 const GFC_INTEGER_2 *restrict bbase_y;
1171 GFC_INTEGER_2 *restrict dest_y;
1172 GFC_INTEGER_2 s;
1173
1174 for (y = 0; y < ycount; y++)
1175 {
1176 bbase_y = &bbase[y*bystride];
1177 dest_y = &dest[y*rystride];
1178 for (x = 0; x < xcount; x++)
1179 {
1180 abase_x = &abase[x*axstride];
1181 s = (GFC_INTEGER_2) 0;
1182 for (n = 0; n < count; n++)
1183 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1184 dest_y[x*rxstride] = s;
1185 }
1186 }
1187 }
1188}
1189#undef POW3
1190#undef min
1191#undef max
1192
1193#endif /* HAVE_AVX2 */
1194
1195#ifdef HAVE_AVX512F
1196static void
1197matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1198 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1199 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1200static void
1201matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1202 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1203 int blas_limit, blas_call gemm)
1204{
1205 const GFC_INTEGER_2 * restrict abase;
1206 const GFC_INTEGER_2 * restrict bbase;
1207 GFC_INTEGER_2 * restrict dest;
1208
1209 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1210 index_type x, y, n, count, xcount, ycount;
1211
1212 assert (GFC_DESCRIPTOR_RANK (a) == 2
1213 || GFC_DESCRIPTOR_RANK (b) == 2);
1214
1215/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1216
1217 Either A or B (but not both) can be rank 1:
1218
1219 o One-dimensional argument A is implicitly treated as a row matrix
1220 dimensioned [1,count], so xcount=1.
1221
1222 o One-dimensional argument B is implicitly treated as a column matrix
1223 dimensioned [count, 1], so ycount=1.
1224*/
1225
1226 if (retarray->base_addr == NULL)
1227 {
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1229 {
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1232 }
1233 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1234 {
1235 GFC_DIMENSION_SET(retarray->dim[0], 0,
1236 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1237 }
1238 else
1239 {
1240 GFC_DIMENSION_SET(retarray->dim[0], 0,
1241 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1242
1243 GFC_DIMENSION_SET(retarray->dim[1], 0,
1244 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1245 GFC_DESCRIPTOR_EXTENT(retarray,0));
1246 }
1247
1248 retarray->base_addr
1249 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1250 retarray->offset = 0;
1251 }
1252 else if (unlikely (compile_options.bounds_check))
1253 {
1254 index_type ret_extent, arg_extent;
1255
1256 if (GFC_DESCRIPTOR_RANK (a) == 1)
1257 {
1258 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1259 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1260 if (arg_extent != ret_extent)
1261 runtime_error ("Incorrect extent in return array in"
1262 " MATMUL intrinsic: is %ld, should be %ld",
1263 (long int) ret_extent, (long int) arg_extent);
1264 }
1265 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1266 {
1267 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1268 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1269 if (arg_extent != ret_extent)
1270 runtime_error ("Incorrect extent in return array in"
1271 " MATMUL intrinsic: is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1273 }
1274 else
1275 {
1276 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1277 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1278 if (arg_extent != ret_extent)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 1:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent, (long int) arg_extent);
1283
1284 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1285 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1286 if (arg_extent != ret_extent)
1287 runtime_error ("Incorrect extent in return array in"
1288 " MATMUL intrinsic for dimension 2:"
1289 " is %ld, should be %ld",
1290 (long int) ret_extent, (long int) arg_extent);
1291 }
1292 }
1293
1294
1295 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1296 {
1297 /* One-dimensional result may be addressed in the code below
1298 either as a row or a column matrix. We want both cases to
1299 work. */
1300 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1301 }
1302 else
1303 {
1304 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1305 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1306 }
1307
1308
1309 if (GFC_DESCRIPTOR_RANK (a) == 1)
1310 {
1311 /* Treat it as a a row matrix A[1,count]. */
1312 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313 aystride = 1;
1314
1315 xcount = 1;
1316 count = GFC_DESCRIPTOR_EXTENT(a,0);
1317 }
1318 else
1319 {
1320 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1321 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1322
1323 count = GFC_DESCRIPTOR_EXTENT(a,1);
1324 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1325 }
1326
1327 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1328 {
1329 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1330 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1331 }
1332
1333 if (GFC_DESCRIPTOR_RANK (b) == 1)
1334 {
1335 /* Treat it as a column matrix B[count,1] */
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337
1338 /* bystride should never be used for 1-dimensional b.
6ce6a84a
TK
1339 The value is only used for calculation of the
1340 memory by the buffer. */
1341 bystride = 256;
31cfd832
TK
1342 ycount = 1;
1343 }
1344 else
1345 {
1346 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1347 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1348 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1349 }
1350
1351 abase = a->base_addr;
1352 bbase = b->base_addr;
1353 dest = retarray->base_addr;
1354
1355 /* Now that everything is set up, we perform the multiplication
1356 itself. */
1357
1358#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359#define min(a,b) ((a) <= (b) ? (a) : (b))
1360#define max(a,b) ((a) >= (b) ? (a) : (b))
1361
1362 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1363 && (bxstride == 1 || bystride == 1)
1364 && (((float) xcount) * ((float) ycount) * ((float) count)
1365 > POW3(blas_limit)))
1366 {
1367 const int m = xcount, n = ycount, k = count, ldc = rystride;
1368 const GFC_INTEGER_2 one = 1, zero = 0;
1369 const int lda = (axstride == 1) ? aystride : axstride,
1370 ldb = (bxstride == 1) ? bystride : bxstride;
1371
1372 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1373 {
1374 assert (gemm != NULL);
1375 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1376 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1377 &ldc, 1, 1);
1378 return;
1379 }
1380 }
1381
1382 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1383 {
1384 /* This block of code implements a tuned matmul, derived from
1385 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1386
1387 Bo Kagstrom and Per Ling
1388 Department of Computing Science
1389 Umea University
1390 S-901 87 Umea, Sweden
1391
1392 from netlib.org, translated to C, and modified for matmul.m4. */
1393
1394 const GFC_INTEGER_2 *a, *b;
1395 GFC_INTEGER_2 *c;
1396 const index_type m = xcount, n = ycount, k = count;
1397
1398 /* System generated locals */
1399 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1400 i1, i2, i3, i4, i5, i6;
1401
1402 /* Local variables */
fd991039 1403 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
31cfd832
TK
1404 f13, f14, f23, f24, f33, f34, f43, f44;
1405 index_type i, j, l, ii, jj, ll;
1406 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
8e5f30dc 1407 GFC_INTEGER_2 *t1;
31cfd832
TK
1408
1409 a = abase;
1410 b = bbase;
1411 c = retarray->base_addr;
1412
1413 /* Parameter adjustments */
1414 c_dim1 = rystride;
1415 c_offset = 1 + c_dim1;
1416 c -= c_offset;
1417 a_dim1 = aystride;
1418 a_offset = 1 + a_dim1;
1419 a -= a_offset;
1420 b_dim1 = bystride;
1421 b_offset = 1 + b_dim1;
1422 b -= b_offset;
1423
bbf97416
TK
1424 /* Empty c first. */
1425 for (j=1; j<=n; j++)
1426 for (i=1; i<=m; i++)
1427 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
1428
31cfd832
TK
1429 /* Early exit if possible */
1430 if (m == 0 || n == 0 || k == 0)
1431 return;
1432
fd991039 1433 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
1434 index_type t1_dim, a_sz;
1435 if (aystride == 1)
1436 a_sz = rystride;
1437 else
1438 a_sz = a_dim1;
1439
1440 t1_dim = a_sz * 256 + b_dim1;
fd991039
TK
1441 if (t1_dim > 65536)
1442 t1_dim = 65536;
1443
8e5f30dc 1444 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
fd991039 1445
31cfd832
TK
1446 /* Start turning the crank. */
1447 i1 = n;
1448 for (jj = 1; jj <= i1; jj += 512)
1449 {
1450 /* Computing MIN */
1451 i2 = 512;
1452 i3 = n - jj + 1;
1453 jsec = min(i2,i3);
1454 ujsec = jsec - jsec % 4;
1455 i2 = k;
1456 for (ll = 1; ll <= i2; ll += 256)
1457 {
1458 /* Computing MIN */
1459 i3 = 256;
1460 i4 = k - ll + 1;
1461 lsec = min(i3,i4);
1462 ulsec = lsec - lsec % 2;
1463
1464 i3 = m;
1465 for (ii = 1; ii <= i3; ii += 256)
1466 {
1467 /* Computing MIN */
1468 i4 = 256;
1469 i5 = m - ii + 1;
1470 isec = min(i4,i5);
1471 uisec = isec - isec % 2;
1472 i4 = ll + ulsec - 1;
1473 for (l = ll; l <= i4; l += 2)
1474 {
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 2)
1477 {
1478 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1479 a[i + l * a_dim1];
1480 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1481 a[i + (l + 1) * a_dim1];
1482 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1483 a[i + 1 + l * a_dim1];
1484 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1485 a[i + 1 + (l + 1) * a_dim1];
1486 }
1487 if (uisec < isec)
1488 {
1489 t1[l - ll + 1 + (isec << 8) - 257] =
1490 a[ii + isec - 1 + l * a_dim1];
1491 t1[l - ll + 2 + (isec << 8) - 257] =
1492 a[ii + isec - 1 + (l + 1) * a_dim1];
1493 }
1494 }
1495 if (ulsec < lsec)
1496 {
1497 i4 = ii + isec - 1;
1498 for (i = ii; i<= i4; ++i)
1499 {
1500 t1[lsec + ((i - ii + 1) << 8) - 257] =
1501 a[i + (ll + lsec - 1) * a_dim1];
1502 }
1503 }
1504
1505 uisec = isec - isec % 4;
1506 i4 = jj + ujsec - 1;
1507 for (j = jj; j <= i4; j += 4)
1508 {
1509 i5 = ii + uisec - 1;
1510 for (i = ii; i <= i5; i += 4)
1511 {
1512 f11 = c[i + j * c_dim1];
1513 f21 = c[i + 1 + j * c_dim1];
1514 f12 = c[i + (j + 1) * c_dim1];
1515 f22 = c[i + 1 + (j + 1) * c_dim1];
1516 f13 = c[i + (j + 2) * c_dim1];
1517 f23 = c[i + 1 + (j + 2) * c_dim1];
1518 f14 = c[i + (j + 3) * c_dim1];
1519 f24 = c[i + 1 + (j + 3) * c_dim1];
1520 f31 = c[i + 2 + j * c_dim1];
1521 f41 = c[i + 3 + j * c_dim1];
1522 f32 = c[i + 2 + (j + 1) * c_dim1];
1523 f42 = c[i + 3 + (j + 1) * c_dim1];
1524 f33 = c[i + 2 + (j + 2) * c_dim1];
1525 f43 = c[i + 3 + (j + 2) * c_dim1];
1526 f34 = c[i + 2 + (j + 3) * c_dim1];
1527 f44 = c[i + 3 + (j + 3) * c_dim1];
1528 i6 = ll + lsec - 1;
1529 for (l = ll; l <= i6; ++l)
1530 {
1531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1532 * b[l + j * b_dim1];
1533 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1534 * b[l + j * b_dim1];
1535 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1536 * b[l + (j + 1) * b_dim1];
1537 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1538 * b[l + (j + 1) * b_dim1];
1539 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1540 * b[l + (j + 2) * b_dim1];
1541 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1542 * b[l + (j + 2) * b_dim1];
1543 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1544 * b[l + (j + 3) * b_dim1];
1545 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1546 * b[l + (j + 3) * b_dim1];
1547 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1548 * b[l + j * b_dim1];
1549 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1550 * b[l + j * b_dim1];
1551 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1552 * b[l + (j + 1) * b_dim1];
1553 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1554 * b[l + (j + 1) * b_dim1];
1555 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1556 * b[l + (j + 2) * b_dim1];
1557 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1558 * b[l + (j + 2) * b_dim1];
1559 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1560 * b[l + (j + 3) * b_dim1];
1561 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1562 * b[l + (j + 3) * b_dim1];
1563 }
1564 c[i + j * c_dim1] = f11;
1565 c[i + 1 + j * c_dim1] = f21;
1566 c[i + (j + 1) * c_dim1] = f12;
1567 c[i + 1 + (j + 1) * c_dim1] = f22;
1568 c[i + (j + 2) * c_dim1] = f13;
1569 c[i + 1 + (j + 2) * c_dim1] = f23;
1570 c[i + (j + 3) * c_dim1] = f14;
1571 c[i + 1 + (j + 3) * c_dim1] = f24;
1572 c[i + 2 + j * c_dim1] = f31;
1573 c[i + 3 + j * c_dim1] = f41;
1574 c[i + 2 + (j + 1) * c_dim1] = f32;
1575 c[i + 3 + (j + 1) * c_dim1] = f42;
1576 c[i + 2 + (j + 2) * c_dim1] = f33;
1577 c[i + 3 + (j + 2) * c_dim1] = f43;
1578 c[i + 2 + (j + 3) * c_dim1] = f34;
1579 c[i + 3 + (j + 3) * c_dim1] = f44;
1580 }
1581 if (uisec < isec)
1582 {
1583 i5 = ii + isec - 1;
1584 for (i = ii + uisec; i <= i5; ++i)
1585 {
1586 f11 = c[i + j * c_dim1];
1587 f12 = c[i + (j + 1) * c_dim1];
1588 f13 = c[i + (j + 2) * c_dim1];
1589 f14 = c[i + (j + 3) * c_dim1];
1590 i6 = ll + lsec - 1;
1591 for (l = ll; l <= i6; ++l)
1592 {
1593 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1594 257] * b[l + j * b_dim1];
1595 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1596 257] * b[l + (j + 1) * b_dim1];
1597 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1598 257] * b[l + (j + 2) * b_dim1];
1599 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1600 257] * b[l + (j + 3) * b_dim1];
1601 }
1602 c[i + j * c_dim1] = f11;
1603 c[i + (j + 1) * c_dim1] = f12;
1604 c[i + (j + 2) * c_dim1] = f13;
1605 c[i + (j + 3) * c_dim1] = f14;
1606 }
1607 }
1608 }
1609 if (ujsec < jsec)
1610 {
1611 i4 = jj + jsec - 1;
1612 for (j = jj + ujsec; j <= i4; ++j)
1613 {
1614 i5 = ii + uisec - 1;
1615 for (i = ii; i <= i5; i += 4)
1616 {
1617 f11 = c[i + j * c_dim1];
1618 f21 = c[i + 1 + j * c_dim1];
1619 f31 = c[i + 2 + j * c_dim1];
1620 f41 = c[i + 3 + j * c_dim1];
1621 i6 = ll + lsec - 1;
1622 for (l = ll; l <= i6; ++l)
1623 {
1624 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1625 257] * b[l + j * b_dim1];
1626 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1627 257] * b[l + j * b_dim1];
1628 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1629 257] * b[l + j * b_dim1];
1630 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1631 257] * b[l + j * b_dim1];
1632 }
1633 c[i + j * c_dim1] = f11;
1634 c[i + 1 + j * c_dim1] = f21;
1635 c[i + 2 + j * c_dim1] = f31;
1636 c[i + 3 + j * c_dim1] = f41;
1637 }
1638 i5 = ii + isec - 1;
1639 for (i = ii + uisec; i <= i5; ++i)
1640 {
1641 f11 = c[i + j * c_dim1];
1642 i6 = ll + lsec - 1;
1643 for (l = ll; l <= i6; ++l)
1644 {
1645 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1646 257] * b[l + j * b_dim1];
1647 }
1648 c[i + j * c_dim1] = f11;
1649 }
1650 }
1651 }
1652 }
1653 }
1654 }
8e5f30dc 1655 free(t1);
31cfd832
TK
1656 return;
1657 }
1658 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1659 {
1660 if (GFC_DESCRIPTOR_RANK (a) != 1)
1661 {
1662 const GFC_INTEGER_2 *restrict abase_x;
1663 const GFC_INTEGER_2 *restrict bbase_y;
1664 GFC_INTEGER_2 *restrict dest_y;
1665 GFC_INTEGER_2 s;
1666
1667 for (y = 0; y < ycount; y++)
1668 {
1669 bbase_y = &bbase[y*bystride];
1670 dest_y = &dest[y*rystride];
1671 for (x = 0; x < xcount; x++)
1672 {
1673 abase_x = &abase[x*axstride];
1674 s = (GFC_INTEGER_2) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase_x[n] * bbase_y[n];
1677 dest_y[x] = s;
1678 }
1679 }
1680 }
1681 else
1682 {
1683 const GFC_INTEGER_2 *restrict bbase_y;
1684 GFC_INTEGER_2 s;
1685
1686 for (y = 0; y < ycount; y++)
1687 {
1688 bbase_y = &bbase[y*bystride];
1689 s = (GFC_INTEGER_2) 0;
1690 for (n = 0; n < count; n++)
1691 s += abase[n*axstride] * bbase_y[n];
1692 dest[y*rystride] = s;
1693 }
1694 }
1695 }
1696 else if (axstride < aystride)
1697 {
1698 for (y = 0; y < ycount; y++)
1699 for (x = 0; x < xcount; x++)
1700 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1701
1702 for (y = 0; y < ycount; y++)
1703 for (n = 0; n < count; n++)
1704 for (x = 0; x < xcount; x++)
1705 /* dest[x,y] += a[x,n] * b[n,y] */
1706 dest[x*rxstride + y*rystride] +=
1707 abase[x*axstride + n*aystride] *
1708 bbase[n*bxstride + y*bystride];
1709 }
1710 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1711 {
1712 const GFC_INTEGER_2 *restrict bbase_y;
1713 GFC_INTEGER_2 s;
1714
1715 for (y = 0; y < ycount; y++)
1716 {
1717 bbase_y = &bbase[y*bystride];
1718 s = (GFC_INTEGER_2) 0;
1719 for (n = 0; n < count; n++)
1720 s += abase[n*axstride] * bbase_y[n*bxstride];
1721 dest[y*rxstride] = s;
1722 }
1723 }
1724 else
1725 {
1726 const GFC_INTEGER_2 *restrict abase_x;
1727 const GFC_INTEGER_2 *restrict bbase_y;
1728 GFC_INTEGER_2 *restrict dest_y;
1729 GFC_INTEGER_2 s;
1730
1731 for (y = 0; y < ycount; y++)
1732 {
1733 bbase_y = &bbase[y*bystride];
1734 dest_y = &dest[y*rystride];
1735 for (x = 0; x < xcount; x++)
1736 {
1737 abase_x = &abase[x*axstride];
1738 s = (GFC_INTEGER_2) 0;
1739 for (n = 0; n < count; n++)
1740 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1741 dest_y[x*rxstride] = s;
1742 }
1743 }
1744 }
1745}
1746#undef POW3
1747#undef min
1748#undef max
1749
1750#endif /* HAVE_AVX512F */
1751
1d5cf7fc
TK
1752/* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1753
1754#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1755void
1756matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
1757 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1758 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1759internal_proto(matmul_i2_avx128_fma3);
1760#endif
1761
1762#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1763void
1764matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
1765 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1766 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1767internal_proto(matmul_i2_avx128_fma4);
1768#endif
1769
31cfd832
TK
1770/* Function to fall back to if there is no special processor-specific version. */
1771static void
1772matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
1773 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1774 int blas_limit, blas_call gemm)
1775{
1776 const GFC_INTEGER_2 * restrict abase;
1777 const GFC_INTEGER_2 * restrict bbase;
1778 GFC_INTEGER_2 * restrict dest;
1779
1780 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1781 index_type x, y, n, count, xcount, ycount;
1782
1783 assert (GFC_DESCRIPTOR_RANK (a) == 2
1784 || GFC_DESCRIPTOR_RANK (b) == 2);
1785
1786/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1787
1788 Either A or B (but not both) can be rank 1:
1789
1790 o One-dimensional argument A is implicitly treated as a row matrix
1791 dimensioned [1,count], so xcount=1.
1792
1793 o One-dimensional argument B is implicitly treated as a column matrix
1794 dimensioned [count, 1], so ycount=1.
1795*/
1796
1797 if (retarray->base_addr == NULL)
1798 {
1799 if (GFC_DESCRIPTOR_RANK (a) == 1)
1800 {
1801 GFC_DIMENSION_SET(retarray->dim[0], 0,
1802 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1803 }
1804 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1805 {
1806 GFC_DIMENSION_SET(retarray->dim[0], 0,
1807 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1808 }
1809 else
1810 {
1811 GFC_DIMENSION_SET(retarray->dim[0], 0,
1812 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1813
1814 GFC_DIMENSION_SET(retarray->dim[1], 0,
1815 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1816 GFC_DESCRIPTOR_EXTENT(retarray,0));
1817 }
1818
1819 retarray->base_addr
1820 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1821 retarray->offset = 0;
1822 }
1823 else if (unlikely (compile_options.bounds_check))
1824 {
1825 index_type ret_extent, arg_extent;
1826
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1828 {
1829 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1830 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1831 if (arg_extent != ret_extent)
1832 runtime_error ("Incorrect extent in return array in"
1833 " MATMUL intrinsic: is %ld, should be %ld",
1834 (long int) ret_extent, (long int) arg_extent);
1835 }
1836 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1837 {
1838 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1839 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1840 if (arg_extent != ret_extent)
1841 runtime_error ("Incorrect extent in return array in"
1842 " MATMUL intrinsic: is %ld, should be %ld",
1843 (long int) ret_extent, (long int) arg_extent);
1844 }
1845 else
1846 {
1847 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1848 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1849 if (arg_extent != ret_extent)
1850 runtime_error ("Incorrect extent in return array in"
1851 " MATMUL intrinsic for dimension 1:"
1852 " is %ld, should be %ld",
1853 (long int) ret_extent, (long int) arg_extent);
1854
1855 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1856 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1857 if (arg_extent != ret_extent)
1858 runtime_error ("Incorrect extent in return array in"
1859 " MATMUL intrinsic for dimension 2:"
1860 " is %ld, should be %ld",
1861 (long int) ret_extent, (long int) arg_extent);
1862 }
1863 }
1864
1865
1866 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1867 {
1868 /* One-dimensional result may be addressed in the code below
1869 either as a row or a column matrix. We want both cases to
1870 work. */
1871 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1872 }
1873 else
1874 {
1875 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1876 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1877 }
1878
1879
1880 if (GFC_DESCRIPTOR_RANK (a) == 1)
1881 {
1882 /* Treat it as a a row matrix A[1,count]. */
1883 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1884 aystride = 1;
1885
1886 xcount = 1;
1887 count = GFC_DESCRIPTOR_EXTENT(a,0);
1888 }
1889 else
1890 {
1891 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1892 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1893
1894 count = GFC_DESCRIPTOR_EXTENT(a,1);
1895 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1896 }
1897
1898 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1899 {
1900 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1901 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1902 }
1903
1904 if (GFC_DESCRIPTOR_RANK (b) == 1)
1905 {
1906 /* Treat it as a column matrix B[count,1] */
1907 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1908
1909 /* bystride should never be used for 1-dimensional b.
6ce6a84a
TK
1910 The value is only used for calculation of the
1911 memory by the buffer. */
1912 bystride = 256;
31cfd832
TK
1913 ycount = 1;
1914 }
1915 else
1916 {
1917 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1918 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1919 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1920 }
1921
1922 abase = a->base_addr;
1923 bbase = b->base_addr;
1924 dest = retarray->base_addr;
1925
1926 /* Now that everything is set up, we perform the multiplication
1927 itself. */
1928
1929#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930#define min(a,b) ((a) <= (b) ? (a) : (b))
1931#define max(a,b) ((a) >= (b) ? (a) : (b))
1932
1933 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1934 && (bxstride == 1 || bystride == 1)
1935 && (((float) xcount) * ((float) ycount) * ((float) count)
1936 > POW3(blas_limit)))
1937 {
1938 const int m = xcount, n = ycount, k = count, ldc = rystride;
1939 const GFC_INTEGER_2 one = 1, zero = 0;
1940 const int lda = (axstride == 1) ? aystride : axstride,
1941 ldb = (bxstride == 1) ? bystride : bxstride;
1942
1943 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1944 {
1945 assert (gemm != NULL);
1946 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1947 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1948 &ldc, 1, 1);
1949 return;
1950 }
1951 }
1952
1953 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1954 {
1955 /* This block of code implements a tuned matmul, derived from
1956 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1957
1958 Bo Kagstrom and Per Ling
1959 Department of Computing Science
1960 Umea University
1961 S-901 87 Umea, Sweden
1962
1963 from netlib.org, translated to C, and modified for matmul.m4. */
1964
1965 const GFC_INTEGER_2 *a, *b;
1966 GFC_INTEGER_2 *c;
1967 const index_type m = xcount, n = ycount, k = count;
1968
1969 /* System generated locals */
1970 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1971 i1, i2, i3, i4, i5, i6;
1972
1973 /* Local variables */
fd991039 1974 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
31cfd832
TK
1975 f13, f14, f23, f24, f33, f34, f43, f44;
1976 index_type i, j, l, ii, jj, ll;
1977 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
8e5f30dc 1978 GFC_INTEGER_2 *t1;
31cfd832
TK
1979
1980 a = abase;
1981 b = bbase;
1982 c = retarray->base_addr;
1983
1984 /* Parameter adjustments */
1985 c_dim1 = rystride;
1986 c_offset = 1 + c_dim1;
1987 c -= c_offset;
1988 a_dim1 = aystride;
1989 a_offset = 1 + a_dim1;
1990 a -= a_offset;
1991 b_dim1 = bystride;
1992 b_offset = 1 + b_dim1;
1993 b -= b_offset;
1994
bbf97416
TK
1995 /* Empty c first. */
1996 for (j=1; j<=n; j++)
1997 for (i=1; i<=m; i++)
1998 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
1999
31cfd832
TK
2000 /* Early exit if possible */
2001 if (m == 0 || n == 0 || k == 0)
2002 return;
2003
fd991039 2004 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
2005 index_type t1_dim, a_sz;
2006 if (aystride == 1)
2007 a_sz = rystride;
2008 else
2009 a_sz = a_dim1;
2010
2011 t1_dim = a_sz * 256 + b_dim1;
fd991039
TK
2012 if (t1_dim > 65536)
2013 t1_dim = 65536;
2014
8e5f30dc 2015 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
fd991039 2016
31cfd832
TK
2017 /* Start turning the crank. */
2018 i1 = n;
2019 for (jj = 1; jj <= i1; jj += 512)
2020 {
2021 /* Computing MIN */
2022 i2 = 512;
2023 i3 = n - jj + 1;
2024 jsec = min(i2,i3);
2025 ujsec = jsec - jsec % 4;
2026 i2 = k;
2027 for (ll = 1; ll <= i2; ll += 256)
2028 {
2029 /* Computing MIN */
2030 i3 = 256;
2031 i4 = k - ll + 1;
2032 lsec = min(i3,i4);
2033 ulsec = lsec - lsec % 2;
2034
2035 i3 = m;
2036 for (ii = 1; ii <= i3; ii += 256)
2037 {
2038 /* Computing MIN */
2039 i4 = 256;
2040 i5 = m - ii + 1;
2041 isec = min(i4,i5);
2042 uisec = isec - isec % 2;
2043 i4 = ll + ulsec - 1;
2044 for (l = ll; l <= i4; l += 2)
2045 {
2046 i5 = ii + uisec - 1;
2047 for (i = ii; i <= i5; i += 2)
2048 {
2049 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2050 a[i + l * a_dim1];
2051 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2052 a[i + (l + 1) * a_dim1];
2053 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2054 a[i + 1 + l * a_dim1];
2055 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2056 a[i + 1 + (l + 1) * a_dim1];
2057 }
2058 if (uisec < isec)
2059 {
2060 t1[l - ll + 1 + (isec << 8) - 257] =
2061 a[ii + isec - 1 + l * a_dim1];
2062 t1[l - ll + 2 + (isec << 8) - 257] =
2063 a[ii + isec - 1 + (l + 1) * a_dim1];
2064 }
2065 }
2066 if (ulsec < lsec)
2067 {
2068 i4 = ii + isec - 1;
2069 for (i = ii; i<= i4; ++i)
2070 {
2071 t1[lsec + ((i - ii + 1) << 8) - 257] =
2072 a[i + (ll + lsec - 1) * a_dim1];
2073 }
2074 }
2075
2076 uisec = isec - isec % 4;
2077 i4 = jj + ujsec - 1;
2078 for (j = jj; j <= i4; j += 4)
2079 {
2080 i5 = ii + uisec - 1;
2081 for (i = ii; i <= i5; i += 4)
2082 {
2083 f11 = c[i + j * c_dim1];
2084 f21 = c[i + 1 + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f22 = c[i + 1 + (j + 1) * c_dim1];
2087 f13 = c[i + (j + 2) * c_dim1];
2088 f23 = c[i + 1 + (j + 2) * c_dim1];
2089 f14 = c[i + (j + 3) * c_dim1];
2090 f24 = c[i + 1 + (j + 3) * c_dim1];
2091 f31 = c[i + 2 + j * c_dim1];
2092 f41 = c[i + 3 + j * c_dim1];
2093 f32 = c[i + 2 + (j + 1) * c_dim1];
2094 f42 = c[i + 3 + (j + 1) * c_dim1];
2095 f33 = c[i + 2 + (j + 2) * c_dim1];
2096 f43 = c[i + 3 + (j + 2) * c_dim1];
2097 f34 = c[i + 2 + (j + 3) * c_dim1];
2098 f44 = c[i + 3 + (j + 3) * c_dim1];
2099 i6 = ll + lsec - 1;
2100 for (l = ll; l <= i6; ++l)
2101 {
2102 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2103 * b[l + j * b_dim1];
2104 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2105 * b[l + j * b_dim1];
2106 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2107 * b[l + (j + 1) * b_dim1];
2108 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2109 * b[l + (j + 1) * b_dim1];
2110 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2111 * b[l + (j + 2) * b_dim1];
2112 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2113 * b[l + (j + 2) * b_dim1];
2114 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2115 * b[l + (j + 3) * b_dim1];
2116 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2117 * b[l + (j + 3) * b_dim1];
2118 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2119 * b[l + j * b_dim1];
2120 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2121 * b[l + j * b_dim1];
2122 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2123 * b[l + (j + 1) * b_dim1];
2124 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2125 * b[l + (j + 1) * b_dim1];
2126 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2127 * b[l + (j + 2) * b_dim1];
2128 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2129 * b[l + (j + 2) * b_dim1];
2130 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2131 * b[l + (j + 3) * b_dim1];
2132 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2133 * b[l + (j + 3) * b_dim1];
2134 }
2135 c[i + j * c_dim1] = f11;
2136 c[i + 1 + j * c_dim1] = f21;
2137 c[i + (j + 1) * c_dim1] = f12;
2138 c[i + 1 + (j + 1) * c_dim1] = f22;
2139 c[i + (j + 2) * c_dim1] = f13;
2140 c[i + 1 + (j + 2) * c_dim1] = f23;
2141 c[i + (j + 3) * c_dim1] = f14;
2142 c[i + 1 + (j + 3) * c_dim1] = f24;
2143 c[i + 2 + j * c_dim1] = f31;
2144 c[i + 3 + j * c_dim1] = f41;
2145 c[i + 2 + (j + 1) * c_dim1] = f32;
2146 c[i + 3 + (j + 1) * c_dim1] = f42;
2147 c[i + 2 + (j + 2) * c_dim1] = f33;
2148 c[i + 3 + (j + 2) * c_dim1] = f43;
2149 c[i + 2 + (j + 3) * c_dim1] = f34;
2150 c[i + 3 + (j + 3) * c_dim1] = f44;
2151 }
2152 if (uisec < isec)
2153 {
2154 i5 = ii + isec - 1;
2155 for (i = ii + uisec; i <= i5; ++i)
2156 {
2157 f11 = c[i + j * c_dim1];
2158 f12 = c[i + (j + 1) * c_dim1];
2159 f13 = c[i + (j + 2) * c_dim1];
2160 f14 = c[i + (j + 3) * c_dim1];
2161 i6 = ll + lsec - 1;
2162 for (l = ll; l <= i6; ++l)
2163 {
2164 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2165 257] * b[l + j * b_dim1];
2166 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2167 257] * b[l + (j + 1) * b_dim1];
2168 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2169 257] * b[l + (j + 2) * b_dim1];
2170 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2171 257] * b[l + (j + 3) * b_dim1];
2172 }
2173 c[i + j * c_dim1] = f11;
2174 c[i + (j + 1) * c_dim1] = f12;
2175 c[i + (j + 2) * c_dim1] = f13;
2176 c[i + (j + 3) * c_dim1] = f14;
2177 }
2178 }
2179 }
2180 if (ujsec < jsec)
2181 {
2182 i4 = jj + jsec - 1;
2183 for (j = jj + ujsec; j <= i4; ++j)
2184 {
2185 i5 = ii + uisec - 1;
2186 for (i = ii; i <= i5; i += 4)
2187 {
2188 f11 = c[i + j * c_dim1];
2189 f21 = c[i + 1 + j * c_dim1];
2190 f31 = c[i + 2 + j * c_dim1];
2191 f41 = c[i + 3 + j * c_dim1];
2192 i6 = ll + lsec - 1;
2193 for (l = ll; l <= i6; ++l)
2194 {
2195 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2196 257] * b[l + j * b_dim1];
2197 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2198 257] * b[l + j * b_dim1];
2199 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2200 257] * b[l + j * b_dim1];
2201 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2202 257] * b[l + j * b_dim1];
2203 }
2204 c[i + j * c_dim1] = f11;
2205 c[i + 1 + j * c_dim1] = f21;
2206 c[i + 2 + j * c_dim1] = f31;
2207 c[i + 3 + j * c_dim1] = f41;
2208 }
2209 i5 = ii + isec - 1;
2210 for (i = ii + uisec; i <= i5; ++i)
2211 {
2212 f11 = c[i + j * c_dim1];
2213 i6 = ll + lsec - 1;
2214 for (l = ll; l <= i6; ++l)
2215 {
2216 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 257] * b[l + j * b_dim1];
2218 }
2219 c[i + j * c_dim1] = f11;
2220 }
2221 }
2222 }
2223 }
2224 }
2225 }
8e5f30dc 2226 free(t1);
31cfd832
TK
2227 return;
2228 }
2229 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2230 {
2231 if (GFC_DESCRIPTOR_RANK (a) != 1)
2232 {
2233 const GFC_INTEGER_2 *restrict abase_x;
2234 const GFC_INTEGER_2 *restrict bbase_y;
2235 GFC_INTEGER_2 *restrict dest_y;
2236 GFC_INTEGER_2 s;
2237
2238 for (y = 0; y < ycount; y++)
2239 {
2240 bbase_y = &bbase[y*bystride];
2241 dest_y = &dest[y*rystride];
2242 for (x = 0; x < xcount; x++)
2243 {
2244 abase_x = &abase[x*axstride];
2245 s = (GFC_INTEGER_2) 0;
2246 for (n = 0; n < count; n++)
2247 s += abase_x[n] * bbase_y[n];
2248 dest_y[x] = s;
2249 }
2250 }
2251 }
2252 else
2253 {
2254 const GFC_INTEGER_2 *restrict bbase_y;
2255 GFC_INTEGER_2 s;
2256
2257 for (y = 0; y < ycount; y++)
2258 {
2259 bbase_y = &bbase[y*bystride];
2260 s = (GFC_INTEGER_2) 0;
2261 for (n = 0; n < count; n++)
2262 s += abase[n*axstride] * bbase_y[n];
2263 dest[y*rystride] = s;
2264 }
2265 }
2266 }
2267 else if (axstride < aystride)
2268 {
2269 for (y = 0; y < ycount; y++)
2270 for (x = 0; x < xcount; x++)
2271 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2272
2273 for (y = 0; y < ycount; y++)
2274 for (n = 0; n < count; n++)
2275 for (x = 0; x < xcount; x++)
2276 /* dest[x,y] += a[x,n] * b[n,y] */
2277 dest[x*rxstride + y*rystride] +=
2278 abase[x*axstride + n*aystride] *
2279 bbase[n*bxstride + y*bystride];
2280 }
2281 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2282 {
2283 const GFC_INTEGER_2 *restrict bbase_y;
2284 GFC_INTEGER_2 s;
2285
2286 for (y = 0; y < ycount; y++)
2287 {
2288 bbase_y = &bbase[y*bystride];
2289 s = (GFC_INTEGER_2) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase[n*axstride] * bbase_y[n*bxstride];
2292 dest[y*rxstride] = s;
2293 }
2294 }
2295 else
2296 {
2297 const GFC_INTEGER_2 *restrict abase_x;
2298 const GFC_INTEGER_2 *restrict bbase_y;
2299 GFC_INTEGER_2 *restrict dest_y;
2300 GFC_INTEGER_2 s;
2301
2302 for (y = 0; y < ycount; y++)
2303 {
2304 bbase_y = &bbase[y*bystride];
2305 dest_y = &dest[y*rystride];
2306 for (x = 0; x < xcount; x++)
2307 {
2308 abase_x = &abase[x*axstride];
2309 s = (GFC_INTEGER_2) 0;
2310 for (n = 0; n < count; n++)
2311 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2312 dest_y[x*rxstride] = s;
2313 }
2314 }
2315 }
2316}
2317#undef POW3
2318#undef min
2319#undef max
2320
2321
2322/* Compiling main function, with selection code for the processor. */
2323
2324/* Currently, this is i386 only. Adjust for other architectures. */
2325
2326#include <config/i386/cpuinfo.h>
2327void matmul_i2 (gfc_array_i2 * const restrict retarray,
2328 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2329 int blas_limit, blas_call gemm)
2330{
2331 static void (*matmul_p) (gfc_array_i2 * const restrict retarray,
2332 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
f03e9217
TK
2333 int blas_limit, blas_call gemm);
2334
2335 void (*matmul_fn) (gfc_array_i2 * const restrict retarray,
2336 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2337 int blas_limit, blas_call gemm);
31cfd832 2338
f03e9217
TK
2339 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2340 if (matmul_fn == NULL)
31cfd832 2341 {
f03e9217 2342 matmul_fn = matmul_i2_vanilla;
31cfd832
TK
2343 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2344 {
2345 /* Run down the available processors in order of preference. */
2346#ifdef HAVE_AVX512F
2347 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2348 {
f03e9217
TK
2349 matmul_fn = matmul_i2_avx512f;
2350 goto store;
31cfd832
TK
2351 }
2352
2353#endif /* HAVE_AVX512F */
2354
2355#ifdef HAVE_AVX2
6d03bdcc
TK
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
31cfd832 2358 {
f03e9217
TK
2359 matmul_fn = matmul_i2_avx2;
2360 goto store;
31cfd832
TK
2361 }
2362
2363#endif
2364
2365#ifdef HAVE_AVX
2366 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2367 {
f03e9217
TK
2368 matmul_fn = matmul_i2_avx;
2369 goto store;
31cfd832
TK
2370 }
2371#endif /* HAVE_AVX */
2372 }
1d5cf7fc
TK
2373 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2374 {
2375#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2377 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2378 {
2379 matmul_fn = matmul_i2_avx128_fma3;
2380 goto store;
2381 }
2382#endif
2383#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2385 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2386 {
2387 matmul_fn = matmul_i2_avx128_fma4;
2388 goto store;
2389 }
2390#endif
2391
2392 }
f03e9217
TK
2393 store:
2394 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
31cfd832
TK
2395 }
2396
f03e9217 2397 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
31cfd832
TK
2398}
2399
2400#else /* Just the vanilla function. */
2401
567c915b
TK
2402void
2403matmul_i2 (gfc_array_i2 * const restrict retarray,
2404 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2405 int blas_limit, blas_call gemm)
2406{
2407 const GFC_INTEGER_2 * restrict abase;
2408 const GFC_INTEGER_2 * restrict bbase;
2409 GFC_INTEGER_2 * restrict dest;
2410
2411 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2412 index_type x, y, n, count, xcount, ycount;
2413
2414 assert (GFC_DESCRIPTOR_RANK (a) == 2
2415 || GFC_DESCRIPTOR_RANK (b) == 2);
2416
2417/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2418
2419 Either A or B (but not both) can be rank 1:
2420
2421 o One-dimensional argument A is implicitly treated as a row matrix
2422 dimensioned [1,count], so xcount=1.
2423
2424 o One-dimensional argument B is implicitly treated as a column matrix
2425 dimensioned [count, 1], so ycount=1.
5d70ab07 2426*/
567c915b 2427
21d1335b 2428 if (retarray->base_addr == NULL)
567c915b
TK
2429 {
2430 if (GFC_DESCRIPTOR_RANK (a) == 1)
2431 {
dfb55fdc
TK
2432 GFC_DIMENSION_SET(retarray->dim[0], 0,
2433 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
567c915b
TK
2434 }
2435 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2436 {
dfb55fdc
TK
2437 GFC_DIMENSION_SET(retarray->dim[0], 0,
2438 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b
TK
2439 }
2440 else
2441 {
dfb55fdc
TK
2442 GFC_DIMENSION_SET(retarray->dim[0], 0,
2443 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b 2444
dfb55fdc
TK
2445 GFC_DIMENSION_SET(retarray->dim[1], 0,
2446 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2447 GFC_DESCRIPTOR_EXTENT(retarray,0));
567c915b
TK
2448 }
2449
21d1335b 2450 retarray->base_addr
92e6f3a4 2451 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
567c915b
TK
2452 retarray->offset = 0;
2453 }
5d70ab07
JD
2454 else if (unlikely (compile_options.bounds_check))
2455 {
2456 index_type ret_extent, arg_extent;
2457
2458 if (GFC_DESCRIPTOR_RANK (a) == 1)
2459 {
2460 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2461 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2462 if (arg_extent != ret_extent)
2463 runtime_error ("Incorrect extent in return array in"
2464 " MATMUL intrinsic: is %ld, should be %ld",
2465 (long int) ret_extent, (long int) arg_extent);
2466 }
2467 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2468 {
2469 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2470 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2471 if (arg_extent != ret_extent)
2472 runtime_error ("Incorrect extent in return array in"
2473 " MATMUL intrinsic: is %ld, should be %ld",
2474 (long int) ret_extent, (long int) arg_extent);
2475 }
2476 else
2477 {
2478 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2479 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2480 if (arg_extent != ret_extent)
2481 runtime_error ("Incorrect extent in return array in"
2482 " MATMUL intrinsic for dimension 1:"
2483 " is %ld, should be %ld",
2484 (long int) ret_extent, (long int) arg_extent);
2485
2486 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2487 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2488 if (arg_extent != ret_extent)
2489 runtime_error ("Incorrect extent in return array in"
2490 " MATMUL intrinsic for dimension 2:"
2491 " is %ld, should be %ld",
2492 (long int) ret_extent, (long int) arg_extent);
2493 }
2494 }
567c915b
TK
2495
2496
2497 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2498 {
2499 /* One-dimensional result may be addressed in the code below
2500 either as a row or a column matrix. We want both cases to
2501 work. */
dfb55fdc 2502 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
567c915b
TK
2503 }
2504 else
2505 {
dfb55fdc
TK
2506 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2507 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
567c915b
TK
2508 }
2509
2510
2511 if (GFC_DESCRIPTOR_RANK (a) == 1)
2512 {
2513 /* Treat it as a a row matrix A[1,count]. */
dfb55fdc 2514 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
567c915b
TK
2515 aystride = 1;
2516
2517 xcount = 1;
dfb55fdc 2518 count = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b
TK
2519 }
2520 else
2521 {
dfb55fdc
TK
2522 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2523 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
567c915b 2524
dfb55fdc
TK
2525 count = GFC_DESCRIPTOR_EXTENT(a,1);
2526 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b
TK
2527 }
2528
dfb55fdc 2529 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4 2530 {
dfb55fdc 2531 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4
TK
2532 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2533 }
567c915b
TK
2534
2535 if (GFC_DESCRIPTOR_RANK (b) == 1)
2536 {
2537 /* Treat it as a column matrix B[count,1] */
dfb55fdc 2538 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
567c915b
TK
2539
2540 /* bystride should never be used for 1-dimensional b.
6ce6a84a
TK
2541 The value is only used for calculation of the
2542 memory by the buffer. */
2543 bystride = 256;
567c915b
TK
2544 ycount = 1;
2545 }
2546 else
2547 {
dfb55fdc
TK
2548 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2549 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2550 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
567c915b
TK
2551 }
2552
21d1335b
TB
2553 abase = a->base_addr;
2554 bbase = b->base_addr;
2555 dest = retarray->base_addr;
567c915b 2556
5d70ab07 2557 /* Now that everything is set up, we perform the multiplication
567c915b
TK
2558 itself. */
2559
2560#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07
JD
2561#define min(a,b) ((a) <= (b) ? (a) : (b))
2562#define max(a,b) ((a) >= (b) ? (a) : (b))
567c915b
TK
2563
2564 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2565 && (bxstride == 1 || bystride == 1)
2566 && (((float) xcount) * ((float) ycount) * ((float) count)
2567 > POW3(blas_limit)))
567c915b 2568 {
5d70ab07
JD
2569 const int m = xcount, n = ycount, k = count, ldc = rystride;
2570 const GFC_INTEGER_2 one = 1, zero = 0;
2571 const int lda = (axstride == 1) ? aystride : axstride,
2572 ldb = (bxstride == 1) ? bystride : bxstride;
567c915b 2573
5d70ab07 2574 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
567c915b 2575 {
5d70ab07
JD
2576 assert (gemm != NULL);
2577 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2578 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2579 &ldc, 1, 1);
2580 return;
567c915b 2581 }
5d70ab07 2582 }
567c915b 2583
5d70ab07
JD
2584 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2585 {
2586 /* This block of code implements a tuned matmul, derived from
2587 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2588
2589 Bo Kagstrom and Per Ling
2590 Department of Computing Science
2591 Umea University
2592 S-901 87 Umea, Sweden
2593
2594 from netlib.org, translated to C, and modified for matmul.m4. */
2595
2596 const GFC_INTEGER_2 *a, *b;
2597 GFC_INTEGER_2 *c;
2598 const index_type m = xcount, n = ycount, k = count;
2599
2600 /* System generated locals */
2601 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2602 i1, i2, i3, i4, i5, i6;
2603
2604 /* Local variables */
fd991039 2605 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
5d70ab07
JD
2606 f13, f14, f23, f24, f33, f34, f43, f44;
2607 index_type i, j, l, ii, jj, ll;
2608 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
8e5f30dc 2609 GFC_INTEGER_2 *t1;
5d70ab07
JD
2610
2611 a = abase;
2612 b = bbase;
2613 c = retarray->base_addr;
2614
2615 /* Parameter adjustments */
2616 c_dim1 = rystride;
2617 c_offset = 1 + c_dim1;
2618 c -= c_offset;
2619 a_dim1 = aystride;
2620 a_offset = 1 + a_dim1;
2621 a -= a_offset;
2622 b_dim1 = bystride;
2623 b_offset = 1 + b_dim1;
2624 b -= b_offset;
2625
bbf97416
TK
2626 /* Empty c first. */
2627 for (j=1; j<=n; j++)
2628 for (i=1; i<=m; i++)
2629 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
2630
5d70ab07
JD
2631 /* Early exit if possible */
2632 if (m == 0 || n == 0 || k == 0)
2633 return;
2634
fd991039 2635 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
2636 index_type t1_dim, a_sz;
2637 if (aystride == 1)
2638 a_sz = rystride;
2639 else
2640 a_sz = a_dim1;
2641
2642 t1_dim = a_sz * 256 + b_dim1;
fd991039
TK
2643 if (t1_dim > 65536)
2644 t1_dim = 65536;
2645
8e5f30dc 2646 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
fd991039 2647
5d70ab07
JD
2648 /* Start turning the crank. */
2649 i1 = n;
2650 for (jj = 1; jj <= i1; jj += 512)
567c915b 2651 {
5d70ab07
JD
2652 /* Computing MIN */
2653 i2 = 512;
2654 i3 = n - jj + 1;
2655 jsec = min(i2,i3);
2656 ujsec = jsec - jsec % 4;
2657 i2 = k;
2658 for (ll = 1; ll <= i2; ll += 256)
567c915b 2659 {
5d70ab07
JD
2660 /* Computing MIN */
2661 i3 = 256;
2662 i4 = k - ll + 1;
2663 lsec = min(i3,i4);
2664 ulsec = lsec - lsec % 2;
2665
2666 i3 = m;
2667 for (ii = 1; ii <= i3; ii += 256)
567c915b 2668 {
5d70ab07
JD
2669 /* Computing MIN */
2670 i4 = 256;
2671 i5 = m - ii + 1;
2672 isec = min(i4,i5);
2673 uisec = isec - isec % 2;
2674 i4 = ll + ulsec - 1;
2675 for (l = ll; l <= i4; l += 2)
2676 {
2677 i5 = ii + uisec - 1;
2678 for (i = ii; i <= i5; i += 2)
2679 {
2680 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2681 a[i + l * a_dim1];
2682 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2683 a[i + (l + 1) * a_dim1];
2684 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2685 a[i + 1 + l * a_dim1];
2686 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2687 a[i + 1 + (l + 1) * a_dim1];
2688 }
2689 if (uisec < isec)
2690 {
2691 t1[l - ll + 1 + (isec << 8) - 257] =
2692 a[ii + isec - 1 + l * a_dim1];
2693 t1[l - ll + 2 + (isec << 8) - 257] =
2694 a[ii + isec - 1 + (l + 1) * a_dim1];
2695 }
2696 }
2697 if (ulsec < lsec)
2698 {
2699 i4 = ii + isec - 1;
2700 for (i = ii; i<= i4; ++i)
2701 {
2702 t1[lsec + ((i - ii + 1) << 8) - 257] =
2703 a[i + (ll + lsec - 1) * a_dim1];
2704 }
2705 }
2706
2707 uisec = isec - isec % 4;
2708 i4 = jj + ujsec - 1;
2709 for (j = jj; j <= i4; j += 4)
2710 {
2711 i5 = ii + uisec - 1;
2712 for (i = ii; i <= i5; i += 4)
2713 {
2714 f11 = c[i + j * c_dim1];
2715 f21 = c[i + 1 + j * c_dim1];
2716 f12 = c[i + (j + 1) * c_dim1];
2717 f22 = c[i + 1 + (j + 1) * c_dim1];
2718 f13 = c[i + (j + 2) * c_dim1];
2719 f23 = c[i + 1 + (j + 2) * c_dim1];
2720 f14 = c[i + (j + 3) * c_dim1];
2721 f24 = c[i + 1 + (j + 3) * c_dim1];
2722 f31 = c[i + 2 + j * c_dim1];
2723 f41 = c[i + 3 + j * c_dim1];
2724 f32 = c[i + 2 + (j + 1) * c_dim1];
2725 f42 = c[i + 3 + (j + 1) * c_dim1];
2726 f33 = c[i + 2 + (j + 2) * c_dim1];
2727 f43 = c[i + 3 + (j + 2) * c_dim1];
2728 f34 = c[i + 2 + (j + 3) * c_dim1];
2729 f44 = c[i + 3 + (j + 3) * c_dim1];
2730 i6 = ll + lsec - 1;
2731 for (l = ll; l <= i6; ++l)
2732 {
2733 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2734 * b[l + j * b_dim1];
2735 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2736 * b[l + j * b_dim1];
2737 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2738 * b[l + (j + 1) * b_dim1];
2739 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2740 * b[l + (j + 1) * b_dim1];
2741 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2742 * b[l + (j + 2) * b_dim1];
2743 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2744 * b[l + (j + 2) * b_dim1];
2745 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2746 * b[l + (j + 3) * b_dim1];
2747 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2748 * b[l + (j + 3) * b_dim1];
2749 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2750 * b[l + j * b_dim1];
2751 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2752 * b[l + j * b_dim1];
2753 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2754 * b[l + (j + 1) * b_dim1];
2755 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2756 * b[l + (j + 1) * b_dim1];
2757 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2758 * b[l + (j + 2) * b_dim1];
2759 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2760 * b[l + (j + 2) * b_dim1];
2761 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2762 * b[l + (j + 3) * b_dim1];
2763 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2764 * b[l + (j + 3) * b_dim1];
2765 }
2766 c[i + j * c_dim1] = f11;
2767 c[i + 1 + j * c_dim1] = f21;
2768 c[i + (j + 1) * c_dim1] = f12;
2769 c[i + 1 + (j + 1) * c_dim1] = f22;
2770 c[i + (j + 2) * c_dim1] = f13;
2771 c[i + 1 + (j + 2) * c_dim1] = f23;
2772 c[i + (j + 3) * c_dim1] = f14;
2773 c[i + 1 + (j + 3) * c_dim1] = f24;
2774 c[i + 2 + j * c_dim1] = f31;
2775 c[i + 3 + j * c_dim1] = f41;
2776 c[i + 2 + (j + 1) * c_dim1] = f32;
2777 c[i + 3 + (j + 1) * c_dim1] = f42;
2778 c[i + 2 + (j + 2) * c_dim1] = f33;
2779 c[i + 3 + (j + 2) * c_dim1] = f43;
2780 c[i + 2 + (j + 3) * c_dim1] = f34;
2781 c[i + 3 + (j + 3) * c_dim1] = f44;
2782 }
2783 if (uisec < isec)
2784 {
2785 i5 = ii + isec - 1;
2786 for (i = ii + uisec; i <= i5; ++i)
2787 {
2788 f11 = c[i + j * c_dim1];
2789 f12 = c[i + (j + 1) * c_dim1];
2790 f13 = c[i + (j + 2) * c_dim1];
2791 f14 = c[i + (j + 3) * c_dim1];
2792 i6 = ll + lsec - 1;
2793 for (l = ll; l <= i6; ++l)
2794 {
2795 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2796 257] * b[l + j * b_dim1];
2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2798 257] * b[l + (j + 1) * b_dim1];
2799 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2800 257] * b[l + (j + 2) * b_dim1];
2801 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 257] * b[l + (j + 3) * b_dim1];
2803 }
2804 c[i + j * c_dim1] = f11;
2805 c[i + (j + 1) * c_dim1] = f12;
2806 c[i + (j + 2) * c_dim1] = f13;
2807 c[i + (j + 3) * c_dim1] = f14;
2808 }
2809 }
2810 }
2811 if (ujsec < jsec)
2812 {
2813 i4 = jj + jsec - 1;
2814 for (j = jj + ujsec; j <= i4; ++j)
2815 {
2816 i5 = ii + uisec - 1;
2817 for (i = ii; i <= i5; i += 4)
2818 {
2819 f11 = c[i + j * c_dim1];
2820 f21 = c[i + 1 + j * c_dim1];
2821 f31 = c[i + 2 + j * c_dim1];
2822 f41 = c[i + 3 + j * c_dim1];
2823 i6 = ll + lsec - 1;
2824 for (l = ll; l <= i6; ++l)
2825 {
2826 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2827 257] * b[l + j * b_dim1];
2828 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2829 257] * b[l + j * b_dim1];
2830 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2831 257] * b[l + j * b_dim1];
2832 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2833 257] * b[l + j * b_dim1];
2834 }
2835 c[i + j * c_dim1] = f11;
2836 c[i + 1 + j * c_dim1] = f21;
2837 c[i + 2 + j * c_dim1] = f31;
2838 c[i + 3 + j * c_dim1] = f41;
2839 }
2840 i5 = ii + isec - 1;
2841 for (i = ii + uisec; i <= i5; ++i)
2842 {
2843 f11 = c[i + j * c_dim1];
2844 i6 = ll + lsec - 1;
2845 for (l = ll; l <= i6; ++l)
2846 {
2847 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2848 257] * b[l + j * b_dim1];
2849 }
2850 c[i + j * c_dim1] = f11;
2851 }
2852 }
2853 }
567c915b
TK
2854 }
2855 }
2856 }
8e5f30dc 2857 free(t1);
5d70ab07 2858 return;
567c915b
TK
2859 }
2860 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2861 {
2862 if (GFC_DESCRIPTOR_RANK (a) != 1)
2863 {
2864 const GFC_INTEGER_2 *restrict abase_x;
2865 const GFC_INTEGER_2 *restrict bbase_y;
2866 GFC_INTEGER_2 *restrict dest_y;
2867 GFC_INTEGER_2 s;
2868
2869 for (y = 0; y < ycount; y++)
2870 {
2871 bbase_y = &bbase[y*bystride];
2872 dest_y = &dest[y*rystride];
2873 for (x = 0; x < xcount; x++)
2874 {
2875 abase_x = &abase[x*axstride];
2876 s = (GFC_INTEGER_2) 0;
2877 for (n = 0; n < count; n++)
2878 s += abase_x[n] * bbase_y[n];
2879 dest_y[x] = s;
2880 }
2881 }
2882 }
2883 else
2884 {
2885 const GFC_INTEGER_2 *restrict bbase_y;
2886 GFC_INTEGER_2 s;
2887
2888 for (y = 0; y < ycount; y++)
2889 {
2890 bbase_y = &bbase[y*bystride];
2891 s = (GFC_INTEGER_2) 0;
2892 for (n = 0; n < count; n++)
2893 s += abase[n*axstride] * bbase_y[n];
2894 dest[y*rystride] = s;
2895 }
2896 }
2897 }
2898 else if (axstride < aystride)
2899 {
2900 for (y = 0; y < ycount; y++)
2901 for (x = 0; x < xcount; x++)
2902 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2903
2904 for (y = 0; y < ycount; y++)
2905 for (n = 0; n < count; n++)
2906 for (x = 0; x < xcount; x++)
2907 /* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07
JD
2908 dest[x*rxstride + y*rystride] +=
2909 abase[x*axstride + n*aystride] *
2910 bbase[n*bxstride + y*bystride];
567c915b
TK
2911 }
2912 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2913 {
2914 const GFC_INTEGER_2 *restrict bbase_y;
2915 GFC_INTEGER_2 s;
2916
2917 for (y = 0; y < ycount; y++)
2918 {
2919 bbase_y = &bbase[y*bystride];
2920 s = (GFC_INTEGER_2) 0;
2921 for (n = 0; n < count; n++)
2922 s += abase[n*axstride] * bbase_y[n*bxstride];
2923 dest[y*rxstride] = s;
2924 }
2925 }
2926 else
2927 {
2928 const GFC_INTEGER_2 *restrict abase_x;
2929 const GFC_INTEGER_2 *restrict bbase_y;
2930 GFC_INTEGER_2 *restrict dest_y;
2931 GFC_INTEGER_2 s;
2932
2933 for (y = 0; y < ycount; y++)
2934 {
2935 bbase_y = &bbase[y*bystride];
2936 dest_y = &dest[y*rystride];
2937 for (x = 0; x < xcount; x++)
2938 {
2939 abase_x = &abase[x*axstride];
2940 s = (GFC_INTEGER_2) 0;
2941 for (n = 0; n < count; n++)
2942 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2943 dest_y[x*rxstride] = s;
2944 }
2945 }
2946 }
2947}
31cfd832
TK
2948#undef POW3
2949#undef min
2950#undef max
2951
567c915b 2952#endif
31cfd832
TK
2953#endif
2954