]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_r8.c
Update copyright years.
[thirdparty/gcc.git] / libgfortran / generated / matmul_r8.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_REAL_8)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
39 const int *, const GFC_REAL_8 *, const int *,
40 const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
73 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r8);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
84 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
88 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_REAL_8 * restrict abase;
92 const GFC_REAL_8 * restrict bbase;
93 GFC_REAL_8 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Array bound mismatch for dimension 1 of "
148 "array (%ld/%ld) ",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Array bound mismatch for dimension 1 of "
157 "array (%ld/%ld) ",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Array bound mismatch for dimension 1 of "
166 "array (%ld/%ld) ",
167 (long int) ret_extent, (long int) arg_extent);
168
169 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
171 if (arg_extent != ret_extent)
172 runtime_error ("Array bound mismatch for dimension 2 of "
173 "array (%ld/%ld) ",
174 (long int) ret_extent, (long int) arg_extent);
175 }
176 }
177
178
179 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
180 {
181 /* One-dimensional result may be addressed in the code below
182 either as a row or a column matrix. We want both cases to
183 work. */
184 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185 }
186 else
187 {
188 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
189 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
190 }
191
192
193 if (GFC_DESCRIPTOR_RANK (a) == 1)
194 {
195 /* Treat it as a a row matrix A[1,count]. */
196 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
197 aystride = 1;
198
199 xcount = 1;
200 count = GFC_DESCRIPTOR_EXTENT(a,0);
201 }
202 else
203 {
204 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
205 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
206
207 count = GFC_DESCRIPTOR_EXTENT(a,1);
208 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
209 }
210
211 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
212 {
213 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
214 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
215 "in dimension 1: is %ld, should be %ld",
216 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory by the buffer. */
227 bystride = 256;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_8 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 const char *transa, *transb;
262 if (try_blas & 2)
263 transa = "C";
264 else
265 transa = axstride == 1 ? "N" : "T";
266
267 if (try_blas & 4)
268 transb = "C";
269 else
270 transb = bxstride == 1 ? "N" : "T";
271
272 gemm (transa, transb , &m,
273 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
274 &ldc, 1, 1);
275 return;
276 }
277 }
278
279 if (rxstride == 1 && axstride == 1 && bxstride == 1)
280 {
281 /* This block of code implements a tuned matmul, derived from
282 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
283
284 Bo Kagstrom and Per Ling
285 Department of Computing Science
286 Umea University
287 S-901 87 Umea, Sweden
288
289 from netlib.org, translated to C, and modified for matmul.m4. */
290
291 const GFC_REAL_8 *a, *b;
292 GFC_REAL_8 *c;
293 const index_type m = xcount, n = ycount, k = count;
294
295 /* System generated locals */
296 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
297 i1, i2, i3, i4, i5, i6;
298
299 /* Local variables */
300 GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
301 f13, f14, f23, f24, f33, f34, f43, f44;
302 index_type i, j, l, ii, jj, ll;
303 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
304 GFC_REAL_8 *t1;
305
306 a = abase;
307 b = bbase;
308 c = retarray->base_addr;
309
310 /* Parameter adjustments */
311 c_dim1 = rystride;
312 c_offset = 1 + c_dim1;
313 c -= c_offset;
314 a_dim1 = aystride;
315 a_offset = 1 + a_dim1;
316 a -= a_offset;
317 b_dim1 = bystride;
318 b_offset = 1 + b_dim1;
319 b -= b_offset;
320
321 /* Empty c first. */
322 for (j=1; j<=n; j++)
323 for (i=1; i<=m; i++)
324 c[i + j * c_dim1] = (GFC_REAL_8)0;
325
326 /* Early exit if possible */
327 if (m == 0 || n == 0 || k == 0)
328 return;
329
330 /* Adjust size of t1 to what is needed. */
331 index_type t1_dim, a_sz;
332 if (aystride == 1)
333 a_sz = rystride;
334 else
335 a_sz = a_dim1;
336
337 t1_dim = a_sz * 256 + b_dim1;
338 if (t1_dim > 65536)
339 t1_dim = 65536;
340
341 t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
342
343 /* Start turning the crank. */
344 i1 = n;
345 for (jj = 1; jj <= i1; jj += 512)
346 {
347 /* Computing MIN */
348 i2 = 512;
349 i3 = n - jj + 1;
350 jsec = min(i2,i3);
351 ujsec = jsec - jsec % 4;
352 i2 = k;
353 for (ll = 1; ll <= i2; ll += 256)
354 {
355 /* Computing MIN */
356 i3 = 256;
357 i4 = k - ll + 1;
358 lsec = min(i3,i4);
359 ulsec = lsec - lsec % 2;
360
361 i3 = m;
362 for (ii = 1; ii <= i3; ii += 256)
363 {
364 /* Computing MIN */
365 i4 = 256;
366 i5 = m - ii + 1;
367 isec = min(i4,i5);
368 uisec = isec - isec % 2;
369 i4 = ll + ulsec - 1;
370 for (l = ll; l <= i4; l += 2)
371 {
372 i5 = ii + uisec - 1;
373 for (i = ii; i <= i5; i += 2)
374 {
375 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
376 a[i + l * a_dim1];
377 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
378 a[i + (l + 1) * a_dim1];
379 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
380 a[i + 1 + l * a_dim1];
381 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
382 a[i + 1 + (l + 1) * a_dim1];
383 }
384 if (uisec < isec)
385 {
386 t1[l - ll + 1 + (isec << 8) - 257] =
387 a[ii + isec - 1 + l * a_dim1];
388 t1[l - ll + 2 + (isec << 8) - 257] =
389 a[ii + isec - 1 + (l + 1) * a_dim1];
390 }
391 }
392 if (ulsec < lsec)
393 {
394 i4 = ii + isec - 1;
395 for (i = ii; i<= i4; ++i)
396 {
397 t1[lsec + ((i - ii + 1) << 8) - 257] =
398 a[i + (ll + lsec - 1) * a_dim1];
399 }
400 }
401
402 uisec = isec - isec % 4;
403 i4 = jj + ujsec - 1;
404 for (j = jj; j <= i4; j += 4)
405 {
406 i5 = ii + uisec - 1;
407 for (i = ii; i <= i5; i += 4)
408 {
409 f11 = c[i + j * c_dim1];
410 f21 = c[i + 1 + j * c_dim1];
411 f12 = c[i + (j + 1) * c_dim1];
412 f22 = c[i + 1 + (j + 1) * c_dim1];
413 f13 = c[i + (j + 2) * c_dim1];
414 f23 = c[i + 1 + (j + 2) * c_dim1];
415 f14 = c[i + (j + 3) * c_dim1];
416 f24 = c[i + 1 + (j + 3) * c_dim1];
417 f31 = c[i + 2 + j * c_dim1];
418 f41 = c[i + 3 + j * c_dim1];
419 f32 = c[i + 2 + (j + 1) * c_dim1];
420 f42 = c[i + 3 + (j + 1) * c_dim1];
421 f33 = c[i + 2 + (j + 2) * c_dim1];
422 f43 = c[i + 3 + (j + 2) * c_dim1];
423 f34 = c[i + 2 + (j + 3) * c_dim1];
424 f44 = c[i + 3 + (j + 3) * c_dim1];
425 i6 = ll + lsec - 1;
426 for (l = ll; l <= i6; ++l)
427 {
428 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
429 * b[l + j * b_dim1];
430 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
431 * b[l + j * b_dim1];
432 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
433 * b[l + (j + 1) * b_dim1];
434 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
435 * b[l + (j + 1) * b_dim1];
436 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
437 * b[l + (j + 2) * b_dim1];
438 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
439 * b[l + (j + 2) * b_dim1];
440 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
441 * b[l + (j + 3) * b_dim1];
442 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
443 * b[l + (j + 3) * b_dim1];
444 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
445 * b[l + j * b_dim1];
446 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
447 * b[l + j * b_dim1];
448 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
449 * b[l + (j + 1) * b_dim1];
450 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
451 * b[l + (j + 1) * b_dim1];
452 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
453 * b[l + (j + 2) * b_dim1];
454 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
455 * b[l + (j + 2) * b_dim1];
456 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
457 * b[l + (j + 3) * b_dim1];
458 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
459 * b[l + (j + 3) * b_dim1];
460 }
461 c[i + j * c_dim1] = f11;
462 c[i + 1 + j * c_dim1] = f21;
463 c[i + (j + 1) * c_dim1] = f12;
464 c[i + 1 + (j + 1) * c_dim1] = f22;
465 c[i + (j + 2) * c_dim1] = f13;
466 c[i + 1 + (j + 2) * c_dim1] = f23;
467 c[i + (j + 3) * c_dim1] = f14;
468 c[i + 1 + (j + 3) * c_dim1] = f24;
469 c[i + 2 + j * c_dim1] = f31;
470 c[i + 3 + j * c_dim1] = f41;
471 c[i + 2 + (j + 1) * c_dim1] = f32;
472 c[i + 3 + (j + 1) * c_dim1] = f42;
473 c[i + 2 + (j + 2) * c_dim1] = f33;
474 c[i + 3 + (j + 2) * c_dim1] = f43;
475 c[i + 2 + (j + 3) * c_dim1] = f34;
476 c[i + 3 + (j + 3) * c_dim1] = f44;
477 }
478 if (uisec < isec)
479 {
480 i5 = ii + isec - 1;
481 for (i = ii + uisec; i <= i5; ++i)
482 {
483 f11 = c[i + j * c_dim1];
484 f12 = c[i + (j + 1) * c_dim1];
485 f13 = c[i + (j + 2) * c_dim1];
486 f14 = c[i + (j + 3) * c_dim1];
487 i6 = ll + lsec - 1;
488 for (l = ll; l <= i6; ++l)
489 {
490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
491 257] * b[l + j * b_dim1];
492 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
493 257] * b[l + (j + 1) * b_dim1];
494 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
495 257] * b[l + (j + 2) * b_dim1];
496 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
497 257] * b[l + (j + 3) * b_dim1];
498 }
499 c[i + j * c_dim1] = f11;
500 c[i + (j + 1) * c_dim1] = f12;
501 c[i + (j + 2) * c_dim1] = f13;
502 c[i + (j + 3) * c_dim1] = f14;
503 }
504 }
505 }
506 if (ujsec < jsec)
507 {
508 i4 = jj + jsec - 1;
509 for (j = jj + ujsec; j <= i4; ++j)
510 {
511 i5 = ii + uisec - 1;
512 for (i = ii; i <= i5; i += 4)
513 {
514 f11 = c[i + j * c_dim1];
515 f21 = c[i + 1 + j * c_dim1];
516 f31 = c[i + 2 + j * c_dim1];
517 f41 = c[i + 3 + j * c_dim1];
518 i6 = ll + lsec - 1;
519 for (l = ll; l <= i6; ++l)
520 {
521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
522 257] * b[l + j * b_dim1];
523 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
524 257] * b[l + j * b_dim1];
525 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
526 257] * b[l + j * b_dim1];
527 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
528 257] * b[l + j * b_dim1];
529 }
530 c[i + j * c_dim1] = f11;
531 c[i + 1 + j * c_dim1] = f21;
532 c[i + 2 + j * c_dim1] = f31;
533 c[i + 3 + j * c_dim1] = f41;
534 }
535 i5 = ii + isec - 1;
536 for (i = ii + uisec; i <= i5; ++i)
537 {
538 f11 = c[i + j * c_dim1];
539 i6 = ll + lsec - 1;
540 for (l = ll; l <= i6; ++l)
541 {
542 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
543 257] * b[l + j * b_dim1];
544 }
545 c[i + j * c_dim1] = f11;
546 }
547 }
548 }
549 }
550 }
551 }
552 free(t1);
553 return;
554 }
555 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
556 {
557 if (GFC_DESCRIPTOR_RANK (a) != 1)
558 {
559 const GFC_REAL_8 *restrict abase_x;
560 const GFC_REAL_8 *restrict bbase_y;
561 GFC_REAL_8 *restrict dest_y;
562 GFC_REAL_8 s;
563
564 for (y = 0; y < ycount; y++)
565 {
566 bbase_y = &bbase[y*bystride];
567 dest_y = &dest[y*rystride];
568 for (x = 0; x < xcount; x++)
569 {
570 abase_x = &abase[x*axstride];
571 s = (GFC_REAL_8) 0;
572 for (n = 0; n < count; n++)
573 s += abase_x[n] * bbase_y[n];
574 dest_y[x] = s;
575 }
576 }
577 }
578 else
579 {
580 const GFC_REAL_8 *restrict bbase_y;
581 GFC_REAL_8 s;
582
583 for (y = 0; y < ycount; y++)
584 {
585 bbase_y = &bbase[y*bystride];
586 s = (GFC_REAL_8) 0;
587 for (n = 0; n < count; n++)
588 s += abase[n*axstride] * bbase_y[n];
589 dest[y*rystride] = s;
590 }
591 }
592 }
593 else if (GFC_DESCRIPTOR_RANK (a) == 1)
594 {
595 const GFC_REAL_8 *restrict bbase_y;
596 GFC_REAL_8 s;
597
598 for (y = 0; y < ycount; y++)
599 {
600 bbase_y = &bbase[y*bystride];
601 s = (GFC_REAL_8) 0;
602 for (n = 0; n < count; n++)
603 s += abase[n*axstride] * bbase_y[n*bxstride];
604 dest[y*rxstride] = s;
605 }
606 }
607 else if (axstride < aystride)
608 {
609 for (y = 0; y < ycount; y++)
610 for (x = 0; x < xcount; x++)
611 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
612
613 for (y = 0; y < ycount; y++)
614 for (n = 0; n < count; n++)
615 for (x = 0; x < xcount; x++)
616 /* dest[x,y] += a[x,n] * b[n,y] */
617 dest[x*rxstride + y*rystride] +=
618 abase[x*axstride + n*aystride] *
619 bbase[n*bxstride + y*bystride];
620 }
621 else
622 {
623 const GFC_REAL_8 *restrict abase_x;
624 const GFC_REAL_8 *restrict bbase_y;
625 GFC_REAL_8 *restrict dest_y;
626 GFC_REAL_8 s;
627
628 for (y = 0; y < ycount; y++)
629 {
630 bbase_y = &bbase[y*bystride];
631 dest_y = &dest[y*rystride];
632 for (x = 0; x < xcount; x++)
633 {
634 abase_x = &abase[x*axstride];
635 s = (GFC_REAL_8) 0;
636 for (n = 0; n < count; n++)
637 s += abase_x[n*aystride] * bbase_y[n*bxstride];
638 dest_y[x*rxstride] = s;
639 }
640 }
641 }
642 }
643 #undef POW3
644 #undef min
645 #undef max
646
647 #endif /* HAVE_AVX */
648
649 #ifdef HAVE_AVX2
650 static void
651 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
652 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
653 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
654 static void
655 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
656 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
657 int blas_limit, blas_call gemm)
658 {
659 const GFC_REAL_8 * restrict abase;
660 const GFC_REAL_8 * restrict bbase;
661 GFC_REAL_8 * restrict dest;
662
663 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
664 index_type x, y, n, count, xcount, ycount;
665
666 assert (GFC_DESCRIPTOR_RANK (a) == 2
667 || GFC_DESCRIPTOR_RANK (b) == 2);
668
669 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
670
671 Either A or B (but not both) can be rank 1:
672
673 o One-dimensional argument A is implicitly treated as a row matrix
674 dimensioned [1,count], so xcount=1.
675
676 o One-dimensional argument B is implicitly treated as a column matrix
677 dimensioned [count, 1], so ycount=1.
678 */
679
680 if (retarray->base_addr == NULL)
681 {
682 if (GFC_DESCRIPTOR_RANK (a) == 1)
683 {
684 GFC_DIMENSION_SET(retarray->dim[0], 0,
685 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
686 }
687 else if (GFC_DESCRIPTOR_RANK (b) == 1)
688 {
689 GFC_DIMENSION_SET(retarray->dim[0], 0,
690 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
691 }
692 else
693 {
694 GFC_DIMENSION_SET(retarray->dim[0], 0,
695 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
696
697 GFC_DIMENSION_SET(retarray->dim[1], 0,
698 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
699 GFC_DESCRIPTOR_EXTENT(retarray,0));
700 }
701
702 retarray->base_addr
703 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
704 retarray->offset = 0;
705 }
706 else if (unlikely (compile_options.bounds_check))
707 {
708 index_type ret_extent, arg_extent;
709
710 if (GFC_DESCRIPTOR_RANK (a) == 1)
711 {
712 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
714 if (arg_extent != ret_extent)
715 runtime_error ("Array bound mismatch for dimension 1 of "
716 "array (%ld/%ld) ",
717 (long int) ret_extent, (long int) arg_extent);
718 }
719 else if (GFC_DESCRIPTOR_RANK (b) == 1)
720 {
721 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
722 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
723 if (arg_extent != ret_extent)
724 runtime_error ("Array bound mismatch for dimension 1 of "
725 "array (%ld/%ld) ",
726 (long int) ret_extent, (long int) arg_extent);
727 }
728 else
729 {
730 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
731 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
732 if (arg_extent != ret_extent)
733 runtime_error ("Array bound mismatch for dimension 1 of "
734 "array (%ld/%ld) ",
735 (long int) ret_extent, (long int) arg_extent);
736
737 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
738 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
739 if (arg_extent != ret_extent)
740 runtime_error ("Array bound mismatch for dimension 2 of "
741 "array (%ld/%ld) ",
742 (long int) ret_extent, (long int) arg_extent);
743 }
744 }
745
746
747 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
748 {
749 /* One-dimensional result may be addressed in the code below
750 either as a row or a column matrix. We want both cases to
751 work. */
752 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
753 }
754 else
755 {
756 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
757 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
758 }
759
760
761 if (GFC_DESCRIPTOR_RANK (a) == 1)
762 {
763 /* Treat it as a a row matrix A[1,count]. */
764 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
765 aystride = 1;
766
767 xcount = 1;
768 count = GFC_DESCRIPTOR_EXTENT(a,0);
769 }
770 else
771 {
772 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
773 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
774
775 count = GFC_DESCRIPTOR_EXTENT(a,1);
776 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
777 }
778
779 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
780 {
781 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
782 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
783 "in dimension 1: is %ld, should be %ld",
784 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
785 }
786
787 if (GFC_DESCRIPTOR_RANK (b) == 1)
788 {
789 /* Treat it as a column matrix B[count,1] */
790 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
791
792 /* bystride should never be used for 1-dimensional b.
793 The value is only used for calculation of the
794 memory by the buffer. */
795 bystride = 256;
796 ycount = 1;
797 }
798 else
799 {
800 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
801 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
802 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
803 }
804
805 abase = a->base_addr;
806 bbase = b->base_addr;
807 dest = retarray->base_addr;
808
809 /* Now that everything is set up, we perform the multiplication
810 itself. */
811
812 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
813 #define min(a,b) ((a) <= (b) ? (a) : (b))
814 #define max(a,b) ((a) >= (b) ? (a) : (b))
815
816 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
817 && (bxstride == 1 || bystride == 1)
818 && (((float) xcount) * ((float) ycount) * ((float) count)
819 > POW3(blas_limit)))
820 {
821 const int m = xcount, n = ycount, k = count, ldc = rystride;
822 const GFC_REAL_8 one = 1, zero = 0;
823 const int lda = (axstride == 1) ? aystride : axstride,
824 ldb = (bxstride == 1) ? bystride : bxstride;
825
826 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
827 {
828 assert (gemm != NULL);
829 const char *transa, *transb;
830 if (try_blas & 2)
831 transa = "C";
832 else
833 transa = axstride == 1 ? "N" : "T";
834
835 if (try_blas & 4)
836 transb = "C";
837 else
838 transb = bxstride == 1 ? "N" : "T";
839
840 gemm (transa, transb , &m,
841 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
842 &ldc, 1, 1);
843 return;
844 }
845 }
846
847 if (rxstride == 1 && axstride == 1 && bxstride == 1)
848 {
849 /* This block of code implements a tuned matmul, derived from
850 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
851
852 Bo Kagstrom and Per Ling
853 Department of Computing Science
854 Umea University
855 S-901 87 Umea, Sweden
856
857 from netlib.org, translated to C, and modified for matmul.m4. */
858
859 const GFC_REAL_8 *a, *b;
860 GFC_REAL_8 *c;
861 const index_type m = xcount, n = ycount, k = count;
862
863 /* System generated locals */
864 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
865 i1, i2, i3, i4, i5, i6;
866
867 /* Local variables */
868 GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
869 f13, f14, f23, f24, f33, f34, f43, f44;
870 index_type i, j, l, ii, jj, ll;
871 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
872 GFC_REAL_8 *t1;
873
874 a = abase;
875 b = bbase;
876 c = retarray->base_addr;
877
878 /* Parameter adjustments */
879 c_dim1 = rystride;
880 c_offset = 1 + c_dim1;
881 c -= c_offset;
882 a_dim1 = aystride;
883 a_offset = 1 + a_dim1;
884 a -= a_offset;
885 b_dim1 = bystride;
886 b_offset = 1 + b_dim1;
887 b -= b_offset;
888
889 /* Empty c first. */
890 for (j=1; j<=n; j++)
891 for (i=1; i<=m; i++)
892 c[i + j * c_dim1] = (GFC_REAL_8)0;
893
894 /* Early exit if possible */
895 if (m == 0 || n == 0 || k == 0)
896 return;
897
898 /* Adjust size of t1 to what is needed. */
899 index_type t1_dim, a_sz;
900 if (aystride == 1)
901 a_sz = rystride;
902 else
903 a_sz = a_dim1;
904
905 t1_dim = a_sz * 256 + b_dim1;
906 if (t1_dim > 65536)
907 t1_dim = 65536;
908
909 t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
910
911 /* Start turning the crank. */
912 i1 = n;
913 for (jj = 1; jj <= i1; jj += 512)
914 {
915 /* Computing MIN */
916 i2 = 512;
917 i3 = n - jj + 1;
918 jsec = min(i2,i3);
919 ujsec = jsec - jsec % 4;
920 i2 = k;
921 for (ll = 1; ll <= i2; ll += 256)
922 {
923 /* Computing MIN */
924 i3 = 256;
925 i4 = k - ll + 1;
926 lsec = min(i3,i4);
927 ulsec = lsec - lsec % 2;
928
929 i3 = m;
930 for (ii = 1; ii <= i3; ii += 256)
931 {
932 /* Computing MIN */
933 i4 = 256;
934 i5 = m - ii + 1;
935 isec = min(i4,i5);
936 uisec = isec - isec % 2;
937 i4 = ll + ulsec - 1;
938 for (l = ll; l <= i4; l += 2)
939 {
940 i5 = ii + uisec - 1;
941 for (i = ii; i <= i5; i += 2)
942 {
943 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
944 a[i + l * a_dim1];
945 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
946 a[i + (l + 1) * a_dim1];
947 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
948 a[i + 1 + l * a_dim1];
949 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
950 a[i + 1 + (l + 1) * a_dim1];
951 }
952 if (uisec < isec)
953 {
954 t1[l - ll + 1 + (isec << 8) - 257] =
955 a[ii + isec - 1 + l * a_dim1];
956 t1[l - ll + 2 + (isec << 8) - 257] =
957 a[ii + isec - 1 + (l + 1) * a_dim1];
958 }
959 }
960 if (ulsec < lsec)
961 {
962 i4 = ii + isec - 1;
963 for (i = ii; i<= i4; ++i)
964 {
965 t1[lsec + ((i - ii + 1) << 8) - 257] =
966 a[i + (ll + lsec - 1) * a_dim1];
967 }
968 }
969
970 uisec = isec - isec % 4;
971 i4 = jj + ujsec - 1;
972 for (j = jj; j <= i4; j += 4)
973 {
974 i5 = ii + uisec - 1;
975 for (i = ii; i <= i5; i += 4)
976 {
977 f11 = c[i + j * c_dim1];
978 f21 = c[i + 1 + j * c_dim1];
979 f12 = c[i + (j + 1) * c_dim1];
980 f22 = c[i + 1 + (j + 1) * c_dim1];
981 f13 = c[i + (j + 2) * c_dim1];
982 f23 = c[i + 1 + (j + 2) * c_dim1];
983 f14 = c[i + (j + 3) * c_dim1];
984 f24 = c[i + 1 + (j + 3) * c_dim1];
985 f31 = c[i + 2 + j * c_dim1];
986 f41 = c[i + 3 + j * c_dim1];
987 f32 = c[i + 2 + (j + 1) * c_dim1];
988 f42 = c[i + 3 + (j + 1) * c_dim1];
989 f33 = c[i + 2 + (j + 2) * c_dim1];
990 f43 = c[i + 3 + (j + 2) * c_dim1];
991 f34 = c[i + 2 + (j + 3) * c_dim1];
992 f44 = c[i + 3 + (j + 3) * c_dim1];
993 i6 = ll + lsec - 1;
994 for (l = ll; l <= i6; ++l)
995 {
996 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
997 * b[l + j * b_dim1];
998 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
999 * b[l + j * b_dim1];
1000 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1001 * b[l + (j + 1) * b_dim1];
1002 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1003 * b[l + (j + 1) * b_dim1];
1004 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1005 * b[l + (j + 2) * b_dim1];
1006 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1007 * b[l + (j + 2) * b_dim1];
1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1009 * b[l + (j + 3) * b_dim1];
1010 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1011 * b[l + (j + 3) * b_dim1];
1012 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1013 * b[l + j * b_dim1];
1014 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1015 * b[l + j * b_dim1];
1016 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1017 * b[l + (j + 1) * b_dim1];
1018 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1019 * b[l + (j + 1) * b_dim1];
1020 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1021 * b[l + (j + 2) * b_dim1];
1022 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1023 * b[l + (j + 2) * b_dim1];
1024 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1025 * b[l + (j + 3) * b_dim1];
1026 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1027 * b[l + (j + 3) * b_dim1];
1028 }
1029 c[i + j * c_dim1] = f11;
1030 c[i + 1 + j * c_dim1] = f21;
1031 c[i + (j + 1) * c_dim1] = f12;
1032 c[i + 1 + (j + 1) * c_dim1] = f22;
1033 c[i + (j + 2) * c_dim1] = f13;
1034 c[i + 1 + (j + 2) * c_dim1] = f23;
1035 c[i + (j + 3) * c_dim1] = f14;
1036 c[i + 1 + (j + 3) * c_dim1] = f24;
1037 c[i + 2 + j * c_dim1] = f31;
1038 c[i + 3 + j * c_dim1] = f41;
1039 c[i + 2 + (j + 1) * c_dim1] = f32;
1040 c[i + 3 + (j + 1) * c_dim1] = f42;
1041 c[i + 2 + (j + 2) * c_dim1] = f33;
1042 c[i + 3 + (j + 2) * c_dim1] = f43;
1043 c[i + 2 + (j + 3) * c_dim1] = f34;
1044 c[i + 3 + (j + 3) * c_dim1] = f44;
1045 }
1046 if (uisec < isec)
1047 {
1048 i5 = ii + isec - 1;
1049 for (i = ii + uisec; i <= i5; ++i)
1050 {
1051 f11 = c[i + j * c_dim1];
1052 f12 = c[i + (j + 1) * c_dim1];
1053 f13 = c[i + (j + 2) * c_dim1];
1054 f14 = c[i + (j + 3) * c_dim1];
1055 i6 = ll + lsec - 1;
1056 for (l = ll; l <= i6; ++l)
1057 {
1058 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1059 257] * b[l + j * b_dim1];
1060 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061 257] * b[l + (j + 1) * b_dim1];
1062 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1063 257] * b[l + (j + 2) * b_dim1];
1064 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 257] * b[l + (j + 3) * b_dim1];
1066 }
1067 c[i + j * c_dim1] = f11;
1068 c[i + (j + 1) * c_dim1] = f12;
1069 c[i + (j + 2) * c_dim1] = f13;
1070 c[i + (j + 3) * c_dim1] = f14;
1071 }
1072 }
1073 }
1074 if (ujsec < jsec)
1075 {
1076 i4 = jj + jsec - 1;
1077 for (j = jj + ujsec; j <= i4; ++j)
1078 {
1079 i5 = ii + uisec - 1;
1080 for (i = ii; i <= i5; i += 4)
1081 {
1082 f11 = c[i + j * c_dim1];
1083 f21 = c[i + 1 + j * c_dim1];
1084 f31 = c[i + 2 + j * c_dim1];
1085 f41 = c[i + 3 + j * c_dim1];
1086 i6 = ll + lsec - 1;
1087 for (l = ll; l <= i6; ++l)
1088 {
1089 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1090 257] * b[l + j * b_dim1];
1091 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1092 257] * b[l + j * b_dim1];
1093 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1094 257] * b[l + j * b_dim1];
1095 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1096 257] * b[l + j * b_dim1];
1097 }
1098 c[i + j * c_dim1] = f11;
1099 c[i + 1 + j * c_dim1] = f21;
1100 c[i + 2 + j * c_dim1] = f31;
1101 c[i + 3 + j * c_dim1] = f41;
1102 }
1103 i5 = ii + isec - 1;
1104 for (i = ii + uisec; i <= i5; ++i)
1105 {
1106 f11 = c[i + j * c_dim1];
1107 i6 = ll + lsec - 1;
1108 for (l = ll; l <= i6; ++l)
1109 {
1110 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1111 257] * b[l + j * b_dim1];
1112 }
1113 c[i + j * c_dim1] = f11;
1114 }
1115 }
1116 }
1117 }
1118 }
1119 }
1120 free(t1);
1121 return;
1122 }
1123 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1124 {
1125 if (GFC_DESCRIPTOR_RANK (a) != 1)
1126 {
1127 const GFC_REAL_8 *restrict abase_x;
1128 const GFC_REAL_8 *restrict bbase_y;
1129 GFC_REAL_8 *restrict dest_y;
1130 GFC_REAL_8 s;
1131
1132 for (y = 0; y < ycount; y++)
1133 {
1134 bbase_y = &bbase[y*bystride];
1135 dest_y = &dest[y*rystride];
1136 for (x = 0; x < xcount; x++)
1137 {
1138 abase_x = &abase[x*axstride];
1139 s = (GFC_REAL_8) 0;
1140 for (n = 0; n < count; n++)
1141 s += abase_x[n] * bbase_y[n];
1142 dest_y[x] = s;
1143 }
1144 }
1145 }
1146 else
1147 {
1148 const GFC_REAL_8 *restrict bbase_y;
1149 GFC_REAL_8 s;
1150
1151 for (y = 0; y < ycount; y++)
1152 {
1153 bbase_y = &bbase[y*bystride];
1154 s = (GFC_REAL_8) 0;
1155 for (n = 0; n < count; n++)
1156 s += abase[n*axstride] * bbase_y[n];
1157 dest[y*rystride] = s;
1158 }
1159 }
1160 }
1161 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1162 {
1163 const GFC_REAL_8 *restrict bbase_y;
1164 GFC_REAL_8 s;
1165
1166 for (y = 0; y < ycount; y++)
1167 {
1168 bbase_y = &bbase[y*bystride];
1169 s = (GFC_REAL_8) 0;
1170 for (n = 0; n < count; n++)
1171 s += abase[n*axstride] * bbase_y[n*bxstride];
1172 dest[y*rxstride] = s;
1173 }
1174 }
1175 else if (axstride < aystride)
1176 {
1177 for (y = 0; y < ycount; y++)
1178 for (x = 0; x < xcount; x++)
1179 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1180
1181 for (y = 0; y < ycount; y++)
1182 for (n = 0; n < count; n++)
1183 for (x = 0; x < xcount; x++)
1184 /* dest[x,y] += a[x,n] * b[n,y] */
1185 dest[x*rxstride + y*rystride] +=
1186 abase[x*axstride + n*aystride] *
1187 bbase[n*bxstride + y*bystride];
1188 }
1189 else
1190 {
1191 const GFC_REAL_8 *restrict abase_x;
1192 const GFC_REAL_8 *restrict bbase_y;
1193 GFC_REAL_8 *restrict dest_y;
1194 GFC_REAL_8 s;
1195
1196 for (y = 0; y < ycount; y++)
1197 {
1198 bbase_y = &bbase[y*bystride];
1199 dest_y = &dest[y*rystride];
1200 for (x = 0; x < xcount; x++)
1201 {
1202 abase_x = &abase[x*axstride];
1203 s = (GFC_REAL_8) 0;
1204 for (n = 0; n < count; n++)
1205 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1206 dest_y[x*rxstride] = s;
1207 }
1208 }
1209 }
1210 }
1211 #undef POW3
1212 #undef min
1213 #undef max
1214
1215 #endif /* HAVE_AVX2 */
1216
1217 #ifdef HAVE_AVX512F
1218 static void
1219 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1220 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1221 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1222 static void
1223 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1224 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1225 int blas_limit, blas_call gemm)
1226 {
1227 const GFC_REAL_8 * restrict abase;
1228 const GFC_REAL_8 * restrict bbase;
1229 GFC_REAL_8 * restrict dest;
1230
1231 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1232 index_type x, y, n, count, xcount, ycount;
1233
1234 assert (GFC_DESCRIPTOR_RANK (a) == 2
1235 || GFC_DESCRIPTOR_RANK (b) == 2);
1236
1237 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1238
1239 Either A or B (but not both) can be rank 1:
1240
1241 o One-dimensional argument A is implicitly treated as a row matrix
1242 dimensioned [1,count], so xcount=1.
1243
1244 o One-dimensional argument B is implicitly treated as a column matrix
1245 dimensioned [count, 1], so ycount=1.
1246 */
1247
1248 if (retarray->base_addr == NULL)
1249 {
1250 if (GFC_DESCRIPTOR_RANK (a) == 1)
1251 {
1252 GFC_DIMENSION_SET(retarray->dim[0], 0,
1253 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1254 }
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256 {
1257 GFC_DIMENSION_SET(retarray->dim[0], 0,
1258 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1259 }
1260 else
1261 {
1262 GFC_DIMENSION_SET(retarray->dim[0], 0,
1263 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1264
1265 GFC_DIMENSION_SET(retarray->dim[1], 0,
1266 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1267 GFC_DESCRIPTOR_EXTENT(retarray,0));
1268 }
1269
1270 retarray->base_addr
1271 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1272 retarray->offset = 0;
1273 }
1274 else if (unlikely (compile_options.bounds_check))
1275 {
1276 index_type ret_extent, arg_extent;
1277
1278 if (GFC_DESCRIPTOR_RANK (a) == 1)
1279 {
1280 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1281 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1282 if (arg_extent != ret_extent)
1283 runtime_error ("Array bound mismatch for dimension 1 of "
1284 "array (%ld/%ld) ",
1285 (long int) ret_extent, (long int) arg_extent);
1286 }
1287 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1288 {
1289 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1290 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1291 if (arg_extent != ret_extent)
1292 runtime_error ("Array bound mismatch for dimension 1 of "
1293 "array (%ld/%ld) ",
1294 (long int) ret_extent, (long int) arg_extent);
1295 }
1296 else
1297 {
1298 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1299 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1300 if (arg_extent != ret_extent)
1301 runtime_error ("Array bound mismatch for dimension 1 of "
1302 "array (%ld/%ld) ",
1303 (long int) ret_extent, (long int) arg_extent);
1304
1305 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1306 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1307 if (arg_extent != ret_extent)
1308 runtime_error ("Array bound mismatch for dimension 2 of "
1309 "array (%ld/%ld) ",
1310 (long int) ret_extent, (long int) arg_extent);
1311 }
1312 }
1313
1314
1315 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1316 {
1317 /* One-dimensional result may be addressed in the code below
1318 either as a row or a column matrix. We want both cases to
1319 work. */
1320 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1321 }
1322 else
1323 {
1324 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1325 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1326 }
1327
1328
1329 if (GFC_DESCRIPTOR_RANK (a) == 1)
1330 {
1331 /* Treat it as a a row matrix A[1,count]. */
1332 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1333 aystride = 1;
1334
1335 xcount = 1;
1336 count = GFC_DESCRIPTOR_EXTENT(a,0);
1337 }
1338 else
1339 {
1340 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1341 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1342
1343 count = GFC_DESCRIPTOR_EXTENT(a,1);
1344 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1345 }
1346
1347 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1348 {
1349 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1350 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1351 "in dimension 1: is %ld, should be %ld",
1352 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1353 }
1354
1355 if (GFC_DESCRIPTOR_RANK (b) == 1)
1356 {
1357 /* Treat it as a column matrix B[count,1] */
1358 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1359
1360 /* bystride should never be used for 1-dimensional b.
1361 The value is only used for calculation of the
1362 memory by the buffer. */
1363 bystride = 256;
1364 ycount = 1;
1365 }
1366 else
1367 {
1368 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1369 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1370 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1371 }
1372
1373 abase = a->base_addr;
1374 bbase = b->base_addr;
1375 dest = retarray->base_addr;
1376
1377 /* Now that everything is set up, we perform the multiplication
1378 itself. */
1379
1380 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1381 #define min(a,b) ((a) <= (b) ? (a) : (b))
1382 #define max(a,b) ((a) >= (b) ? (a) : (b))
1383
1384 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1385 && (bxstride == 1 || bystride == 1)
1386 && (((float) xcount) * ((float) ycount) * ((float) count)
1387 > POW3(blas_limit)))
1388 {
1389 const int m = xcount, n = ycount, k = count, ldc = rystride;
1390 const GFC_REAL_8 one = 1, zero = 0;
1391 const int lda = (axstride == 1) ? aystride : axstride,
1392 ldb = (bxstride == 1) ? bystride : bxstride;
1393
1394 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1395 {
1396 assert (gemm != NULL);
1397 const char *transa, *transb;
1398 if (try_blas & 2)
1399 transa = "C";
1400 else
1401 transa = axstride == 1 ? "N" : "T";
1402
1403 if (try_blas & 4)
1404 transb = "C";
1405 else
1406 transb = bxstride == 1 ? "N" : "T";
1407
1408 gemm (transa, transb , &m,
1409 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1410 &ldc, 1, 1);
1411 return;
1412 }
1413 }
1414
1415 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1416 {
1417 /* This block of code implements a tuned matmul, derived from
1418 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1419
1420 Bo Kagstrom and Per Ling
1421 Department of Computing Science
1422 Umea University
1423 S-901 87 Umea, Sweden
1424
1425 from netlib.org, translated to C, and modified for matmul.m4. */
1426
1427 const GFC_REAL_8 *a, *b;
1428 GFC_REAL_8 *c;
1429 const index_type m = xcount, n = ycount, k = count;
1430
1431 /* System generated locals */
1432 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1433 i1, i2, i3, i4, i5, i6;
1434
1435 /* Local variables */
1436 GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
1437 f13, f14, f23, f24, f33, f34, f43, f44;
1438 index_type i, j, l, ii, jj, ll;
1439 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1440 GFC_REAL_8 *t1;
1441
1442 a = abase;
1443 b = bbase;
1444 c = retarray->base_addr;
1445
1446 /* Parameter adjustments */
1447 c_dim1 = rystride;
1448 c_offset = 1 + c_dim1;
1449 c -= c_offset;
1450 a_dim1 = aystride;
1451 a_offset = 1 + a_dim1;
1452 a -= a_offset;
1453 b_dim1 = bystride;
1454 b_offset = 1 + b_dim1;
1455 b -= b_offset;
1456
1457 /* Empty c first. */
1458 for (j=1; j<=n; j++)
1459 for (i=1; i<=m; i++)
1460 c[i + j * c_dim1] = (GFC_REAL_8)0;
1461
1462 /* Early exit if possible */
1463 if (m == 0 || n == 0 || k == 0)
1464 return;
1465
1466 /* Adjust size of t1 to what is needed. */
1467 index_type t1_dim, a_sz;
1468 if (aystride == 1)
1469 a_sz = rystride;
1470 else
1471 a_sz = a_dim1;
1472
1473 t1_dim = a_sz * 256 + b_dim1;
1474 if (t1_dim > 65536)
1475 t1_dim = 65536;
1476
1477 t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
1478
1479 /* Start turning the crank. */
1480 i1 = n;
1481 for (jj = 1; jj <= i1; jj += 512)
1482 {
1483 /* Computing MIN */
1484 i2 = 512;
1485 i3 = n - jj + 1;
1486 jsec = min(i2,i3);
1487 ujsec = jsec - jsec % 4;
1488 i2 = k;
1489 for (ll = 1; ll <= i2; ll += 256)
1490 {
1491 /* Computing MIN */
1492 i3 = 256;
1493 i4 = k - ll + 1;
1494 lsec = min(i3,i4);
1495 ulsec = lsec - lsec % 2;
1496
1497 i3 = m;
1498 for (ii = 1; ii <= i3; ii += 256)
1499 {
1500 /* Computing MIN */
1501 i4 = 256;
1502 i5 = m - ii + 1;
1503 isec = min(i4,i5);
1504 uisec = isec - isec % 2;
1505 i4 = ll + ulsec - 1;
1506 for (l = ll; l <= i4; l += 2)
1507 {
1508 i5 = ii + uisec - 1;
1509 for (i = ii; i <= i5; i += 2)
1510 {
1511 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1512 a[i + l * a_dim1];
1513 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1514 a[i + (l + 1) * a_dim1];
1515 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1516 a[i + 1 + l * a_dim1];
1517 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1518 a[i + 1 + (l + 1) * a_dim1];
1519 }
1520 if (uisec < isec)
1521 {
1522 t1[l - ll + 1 + (isec << 8) - 257] =
1523 a[ii + isec - 1 + l * a_dim1];
1524 t1[l - ll + 2 + (isec << 8) - 257] =
1525 a[ii + isec - 1 + (l + 1) * a_dim1];
1526 }
1527 }
1528 if (ulsec < lsec)
1529 {
1530 i4 = ii + isec - 1;
1531 for (i = ii; i<= i4; ++i)
1532 {
1533 t1[lsec + ((i - ii + 1) << 8) - 257] =
1534 a[i + (ll + lsec - 1) * a_dim1];
1535 }
1536 }
1537
1538 uisec = isec - isec % 4;
1539 i4 = jj + ujsec - 1;
1540 for (j = jj; j <= i4; j += 4)
1541 {
1542 i5 = ii + uisec - 1;
1543 for (i = ii; i <= i5; i += 4)
1544 {
1545 f11 = c[i + j * c_dim1];
1546 f21 = c[i + 1 + j * c_dim1];
1547 f12 = c[i + (j + 1) * c_dim1];
1548 f22 = c[i + 1 + (j + 1) * c_dim1];
1549 f13 = c[i + (j + 2) * c_dim1];
1550 f23 = c[i + 1 + (j + 2) * c_dim1];
1551 f14 = c[i + (j + 3) * c_dim1];
1552 f24 = c[i + 1 + (j + 3) * c_dim1];
1553 f31 = c[i + 2 + j * c_dim1];
1554 f41 = c[i + 3 + j * c_dim1];
1555 f32 = c[i + 2 + (j + 1) * c_dim1];
1556 f42 = c[i + 3 + (j + 1) * c_dim1];
1557 f33 = c[i + 2 + (j + 2) * c_dim1];
1558 f43 = c[i + 3 + (j + 2) * c_dim1];
1559 f34 = c[i + 2 + (j + 3) * c_dim1];
1560 f44 = c[i + 3 + (j + 3) * c_dim1];
1561 i6 = ll + lsec - 1;
1562 for (l = ll; l <= i6; ++l)
1563 {
1564 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1565 * b[l + j * b_dim1];
1566 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1567 * b[l + j * b_dim1];
1568 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1569 * b[l + (j + 1) * b_dim1];
1570 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1571 * b[l + (j + 1) * b_dim1];
1572 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1573 * b[l + (j + 2) * b_dim1];
1574 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1575 * b[l + (j + 2) * b_dim1];
1576 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1577 * b[l + (j + 3) * b_dim1];
1578 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1579 * b[l + (j + 3) * b_dim1];
1580 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1581 * b[l + j * b_dim1];
1582 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1583 * b[l + j * b_dim1];
1584 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1585 * b[l + (j + 1) * b_dim1];
1586 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1587 * b[l + (j + 1) * b_dim1];
1588 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1589 * b[l + (j + 2) * b_dim1];
1590 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1591 * b[l + (j + 2) * b_dim1];
1592 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1593 * b[l + (j + 3) * b_dim1];
1594 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1595 * b[l + (j + 3) * b_dim1];
1596 }
1597 c[i + j * c_dim1] = f11;
1598 c[i + 1 + j * c_dim1] = f21;
1599 c[i + (j + 1) * c_dim1] = f12;
1600 c[i + 1 + (j + 1) * c_dim1] = f22;
1601 c[i + (j + 2) * c_dim1] = f13;
1602 c[i + 1 + (j + 2) * c_dim1] = f23;
1603 c[i + (j + 3) * c_dim1] = f14;
1604 c[i + 1 + (j + 3) * c_dim1] = f24;
1605 c[i + 2 + j * c_dim1] = f31;
1606 c[i + 3 + j * c_dim1] = f41;
1607 c[i + 2 + (j + 1) * c_dim1] = f32;
1608 c[i + 3 + (j + 1) * c_dim1] = f42;
1609 c[i + 2 + (j + 2) * c_dim1] = f33;
1610 c[i + 3 + (j + 2) * c_dim1] = f43;
1611 c[i + 2 + (j + 3) * c_dim1] = f34;
1612 c[i + 3 + (j + 3) * c_dim1] = f44;
1613 }
1614 if (uisec < isec)
1615 {
1616 i5 = ii + isec - 1;
1617 for (i = ii + uisec; i <= i5; ++i)
1618 {
1619 f11 = c[i + j * c_dim1];
1620 f12 = c[i + (j + 1) * c_dim1];
1621 f13 = c[i + (j + 2) * c_dim1];
1622 f14 = c[i + (j + 3) * c_dim1];
1623 i6 = ll + lsec - 1;
1624 for (l = ll; l <= i6; ++l)
1625 {
1626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1627 257] * b[l + j * b_dim1];
1628 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1629 257] * b[l + (j + 1) * b_dim1];
1630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631 257] * b[l + (j + 2) * b_dim1];
1632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1633 257] * b[l + (j + 3) * b_dim1];
1634 }
1635 c[i + j * c_dim1] = f11;
1636 c[i + (j + 1) * c_dim1] = f12;
1637 c[i + (j + 2) * c_dim1] = f13;
1638 c[i + (j + 3) * c_dim1] = f14;
1639 }
1640 }
1641 }
1642 if (ujsec < jsec)
1643 {
1644 i4 = jj + jsec - 1;
1645 for (j = jj + ujsec; j <= i4; ++j)
1646 {
1647 i5 = ii + uisec - 1;
1648 for (i = ii; i <= i5; i += 4)
1649 {
1650 f11 = c[i + j * c_dim1];
1651 f21 = c[i + 1 + j * c_dim1];
1652 f31 = c[i + 2 + j * c_dim1];
1653 f41 = c[i + 3 + j * c_dim1];
1654 i6 = ll + lsec - 1;
1655 for (l = ll; l <= i6; ++l)
1656 {
1657 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1658 257] * b[l + j * b_dim1];
1659 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1660 257] * b[l + j * b_dim1];
1661 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1662 257] * b[l + j * b_dim1];
1663 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1664 257] * b[l + j * b_dim1];
1665 }
1666 c[i + j * c_dim1] = f11;
1667 c[i + 1 + j * c_dim1] = f21;
1668 c[i + 2 + j * c_dim1] = f31;
1669 c[i + 3 + j * c_dim1] = f41;
1670 }
1671 i5 = ii + isec - 1;
1672 for (i = ii + uisec; i <= i5; ++i)
1673 {
1674 f11 = c[i + j * c_dim1];
1675 i6 = ll + lsec - 1;
1676 for (l = ll; l <= i6; ++l)
1677 {
1678 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1679 257] * b[l + j * b_dim1];
1680 }
1681 c[i + j * c_dim1] = f11;
1682 }
1683 }
1684 }
1685 }
1686 }
1687 }
1688 free(t1);
1689 return;
1690 }
1691 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1692 {
1693 if (GFC_DESCRIPTOR_RANK (a) != 1)
1694 {
1695 const GFC_REAL_8 *restrict abase_x;
1696 const GFC_REAL_8 *restrict bbase_y;
1697 GFC_REAL_8 *restrict dest_y;
1698 GFC_REAL_8 s;
1699
1700 for (y = 0; y < ycount; y++)
1701 {
1702 bbase_y = &bbase[y*bystride];
1703 dest_y = &dest[y*rystride];
1704 for (x = 0; x < xcount; x++)
1705 {
1706 abase_x = &abase[x*axstride];
1707 s = (GFC_REAL_8) 0;
1708 for (n = 0; n < count; n++)
1709 s += abase_x[n] * bbase_y[n];
1710 dest_y[x] = s;
1711 }
1712 }
1713 }
1714 else
1715 {
1716 const GFC_REAL_8 *restrict bbase_y;
1717 GFC_REAL_8 s;
1718
1719 for (y = 0; y < ycount; y++)
1720 {
1721 bbase_y = &bbase[y*bystride];
1722 s = (GFC_REAL_8) 0;
1723 for (n = 0; n < count; n++)
1724 s += abase[n*axstride] * bbase_y[n];
1725 dest[y*rystride] = s;
1726 }
1727 }
1728 }
1729 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1730 {
1731 const GFC_REAL_8 *restrict bbase_y;
1732 GFC_REAL_8 s;
1733
1734 for (y = 0; y < ycount; y++)
1735 {
1736 bbase_y = &bbase[y*bystride];
1737 s = (GFC_REAL_8) 0;
1738 for (n = 0; n < count; n++)
1739 s += abase[n*axstride] * bbase_y[n*bxstride];
1740 dest[y*rxstride] = s;
1741 }
1742 }
1743 else if (axstride < aystride)
1744 {
1745 for (y = 0; y < ycount; y++)
1746 for (x = 0; x < xcount; x++)
1747 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1748
1749 for (y = 0; y < ycount; y++)
1750 for (n = 0; n < count; n++)
1751 for (x = 0; x < xcount; x++)
1752 /* dest[x,y] += a[x,n] * b[n,y] */
1753 dest[x*rxstride + y*rystride] +=
1754 abase[x*axstride + n*aystride] *
1755 bbase[n*bxstride + y*bystride];
1756 }
1757 else
1758 {
1759 const GFC_REAL_8 *restrict abase_x;
1760 const GFC_REAL_8 *restrict bbase_y;
1761 GFC_REAL_8 *restrict dest_y;
1762 GFC_REAL_8 s;
1763
1764 for (y = 0; y < ycount; y++)
1765 {
1766 bbase_y = &bbase[y*bystride];
1767 dest_y = &dest[y*rystride];
1768 for (x = 0; x < xcount; x++)
1769 {
1770 abase_x = &abase[x*axstride];
1771 s = (GFC_REAL_8) 0;
1772 for (n = 0; n < count; n++)
1773 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1774 dest_y[x*rxstride] = s;
1775 }
1776 }
1777 }
1778 }
1779 #undef POW3
1780 #undef min
1781 #undef max
1782
1783 #endif /* HAVE_AVX512F */
1784
1785 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1786
1787 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1788 void
1789 matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
1790 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1791 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1792 internal_proto(matmul_r8_avx128_fma3);
1793 #endif
1794
1795 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1796 void
1797 matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
1798 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1799 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1800 internal_proto(matmul_r8_avx128_fma4);
1801 #endif
1802
1803 /* Function to fall back to if there is no special processor-specific version. */
1804 static void
1805 matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
1806 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1807 int blas_limit, blas_call gemm)
1808 {
1809 const GFC_REAL_8 * restrict abase;
1810 const GFC_REAL_8 * restrict bbase;
1811 GFC_REAL_8 * restrict dest;
1812
1813 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1814 index_type x, y, n, count, xcount, ycount;
1815
1816 assert (GFC_DESCRIPTOR_RANK (a) == 2
1817 || GFC_DESCRIPTOR_RANK (b) == 2);
1818
1819 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1820
1821 Either A or B (but not both) can be rank 1:
1822
1823 o One-dimensional argument A is implicitly treated as a row matrix
1824 dimensioned [1,count], so xcount=1.
1825
1826 o One-dimensional argument B is implicitly treated as a column matrix
1827 dimensioned [count, 1], so ycount=1.
1828 */
1829
1830 if (retarray->base_addr == NULL)
1831 {
1832 if (GFC_DESCRIPTOR_RANK (a) == 1)
1833 {
1834 GFC_DIMENSION_SET(retarray->dim[0], 0,
1835 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1836 }
1837 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1838 {
1839 GFC_DIMENSION_SET(retarray->dim[0], 0,
1840 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1841 }
1842 else
1843 {
1844 GFC_DIMENSION_SET(retarray->dim[0], 0,
1845 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1846
1847 GFC_DIMENSION_SET(retarray->dim[1], 0,
1848 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1849 GFC_DESCRIPTOR_EXTENT(retarray,0));
1850 }
1851
1852 retarray->base_addr
1853 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1854 retarray->offset = 0;
1855 }
1856 else if (unlikely (compile_options.bounds_check))
1857 {
1858 index_type ret_extent, arg_extent;
1859
1860 if (GFC_DESCRIPTOR_RANK (a) == 1)
1861 {
1862 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1863 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1864 if (arg_extent != ret_extent)
1865 runtime_error ("Array bound mismatch for dimension 1 of "
1866 "array (%ld/%ld) ",
1867 (long int) ret_extent, (long int) arg_extent);
1868 }
1869 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1870 {
1871 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1872 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1873 if (arg_extent != ret_extent)
1874 runtime_error ("Array bound mismatch for dimension 1 of "
1875 "array (%ld/%ld) ",
1876 (long int) ret_extent, (long int) arg_extent);
1877 }
1878 else
1879 {
1880 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1881 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1882 if (arg_extent != ret_extent)
1883 runtime_error ("Array bound mismatch for dimension 1 of "
1884 "array (%ld/%ld) ",
1885 (long int) ret_extent, (long int) arg_extent);
1886
1887 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1888 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1889 if (arg_extent != ret_extent)
1890 runtime_error ("Array bound mismatch for dimension 2 of "
1891 "array (%ld/%ld) ",
1892 (long int) ret_extent, (long int) arg_extent);
1893 }
1894 }
1895
1896
1897 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1898 {
1899 /* One-dimensional result may be addressed in the code below
1900 either as a row or a column matrix. We want both cases to
1901 work. */
1902 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1903 }
1904 else
1905 {
1906 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1907 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1908 }
1909
1910
1911 if (GFC_DESCRIPTOR_RANK (a) == 1)
1912 {
1913 /* Treat it as a a row matrix A[1,count]. */
1914 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1915 aystride = 1;
1916
1917 xcount = 1;
1918 count = GFC_DESCRIPTOR_EXTENT(a,0);
1919 }
1920 else
1921 {
1922 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1923 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1924
1925 count = GFC_DESCRIPTOR_EXTENT(a,1);
1926 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1927 }
1928
1929 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1930 {
1931 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1932 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
1933 "in dimension 1: is %ld, should be %ld",
1934 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1935 }
1936
1937 if (GFC_DESCRIPTOR_RANK (b) == 1)
1938 {
1939 /* Treat it as a column matrix B[count,1] */
1940 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1941
1942 /* bystride should never be used for 1-dimensional b.
1943 The value is only used for calculation of the
1944 memory by the buffer. */
1945 bystride = 256;
1946 ycount = 1;
1947 }
1948 else
1949 {
1950 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1951 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1952 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1953 }
1954
1955 abase = a->base_addr;
1956 bbase = b->base_addr;
1957 dest = retarray->base_addr;
1958
1959 /* Now that everything is set up, we perform the multiplication
1960 itself. */
1961
1962 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1963 #define min(a,b) ((a) <= (b) ? (a) : (b))
1964 #define max(a,b) ((a) >= (b) ? (a) : (b))
1965
1966 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1967 && (bxstride == 1 || bystride == 1)
1968 && (((float) xcount) * ((float) ycount) * ((float) count)
1969 > POW3(blas_limit)))
1970 {
1971 const int m = xcount, n = ycount, k = count, ldc = rystride;
1972 const GFC_REAL_8 one = 1, zero = 0;
1973 const int lda = (axstride == 1) ? aystride : axstride,
1974 ldb = (bxstride == 1) ? bystride : bxstride;
1975
1976 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1977 {
1978 assert (gemm != NULL);
1979 const char *transa, *transb;
1980 if (try_blas & 2)
1981 transa = "C";
1982 else
1983 transa = axstride == 1 ? "N" : "T";
1984
1985 if (try_blas & 4)
1986 transb = "C";
1987 else
1988 transb = bxstride == 1 ? "N" : "T";
1989
1990 gemm (transa, transb , &m,
1991 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1992 &ldc, 1, 1);
1993 return;
1994 }
1995 }
1996
1997 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1998 {
1999 /* This block of code implements a tuned matmul, derived from
2000 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2001
2002 Bo Kagstrom and Per Ling
2003 Department of Computing Science
2004 Umea University
2005 S-901 87 Umea, Sweden
2006
2007 from netlib.org, translated to C, and modified for matmul.m4. */
2008
2009 const GFC_REAL_8 *a, *b;
2010 GFC_REAL_8 *c;
2011 const index_type m = xcount, n = ycount, k = count;
2012
2013 /* System generated locals */
2014 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2015 i1, i2, i3, i4, i5, i6;
2016
2017 /* Local variables */
2018 GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
2019 f13, f14, f23, f24, f33, f34, f43, f44;
2020 index_type i, j, l, ii, jj, ll;
2021 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2022 GFC_REAL_8 *t1;
2023
2024 a = abase;
2025 b = bbase;
2026 c = retarray->base_addr;
2027
2028 /* Parameter adjustments */
2029 c_dim1 = rystride;
2030 c_offset = 1 + c_dim1;
2031 c -= c_offset;
2032 a_dim1 = aystride;
2033 a_offset = 1 + a_dim1;
2034 a -= a_offset;
2035 b_dim1 = bystride;
2036 b_offset = 1 + b_dim1;
2037 b -= b_offset;
2038
2039 /* Empty c first. */
2040 for (j=1; j<=n; j++)
2041 for (i=1; i<=m; i++)
2042 c[i + j * c_dim1] = (GFC_REAL_8)0;
2043
2044 /* Early exit if possible */
2045 if (m == 0 || n == 0 || k == 0)
2046 return;
2047
2048 /* Adjust size of t1 to what is needed. */
2049 index_type t1_dim, a_sz;
2050 if (aystride == 1)
2051 a_sz = rystride;
2052 else
2053 a_sz = a_dim1;
2054
2055 t1_dim = a_sz * 256 + b_dim1;
2056 if (t1_dim > 65536)
2057 t1_dim = 65536;
2058
2059 t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
2060
2061 /* Start turning the crank. */
2062 i1 = n;
2063 for (jj = 1; jj <= i1; jj += 512)
2064 {
2065 /* Computing MIN */
2066 i2 = 512;
2067 i3 = n - jj + 1;
2068 jsec = min(i2,i3);
2069 ujsec = jsec - jsec % 4;
2070 i2 = k;
2071 for (ll = 1; ll <= i2; ll += 256)
2072 {
2073 /* Computing MIN */
2074 i3 = 256;
2075 i4 = k - ll + 1;
2076 lsec = min(i3,i4);
2077 ulsec = lsec - lsec % 2;
2078
2079 i3 = m;
2080 for (ii = 1; ii <= i3; ii += 256)
2081 {
2082 /* Computing MIN */
2083 i4 = 256;
2084 i5 = m - ii + 1;
2085 isec = min(i4,i5);
2086 uisec = isec - isec % 2;
2087 i4 = ll + ulsec - 1;
2088 for (l = ll; l <= i4; l += 2)
2089 {
2090 i5 = ii + uisec - 1;
2091 for (i = ii; i <= i5; i += 2)
2092 {
2093 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2094 a[i + l * a_dim1];
2095 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2096 a[i + (l + 1) * a_dim1];
2097 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2098 a[i + 1 + l * a_dim1];
2099 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2100 a[i + 1 + (l + 1) * a_dim1];
2101 }
2102 if (uisec < isec)
2103 {
2104 t1[l - ll + 1 + (isec << 8) - 257] =
2105 a[ii + isec - 1 + l * a_dim1];
2106 t1[l - ll + 2 + (isec << 8) - 257] =
2107 a[ii + isec - 1 + (l + 1) * a_dim1];
2108 }
2109 }
2110 if (ulsec < lsec)
2111 {
2112 i4 = ii + isec - 1;
2113 for (i = ii; i<= i4; ++i)
2114 {
2115 t1[lsec + ((i - ii + 1) << 8) - 257] =
2116 a[i + (ll + lsec - 1) * a_dim1];
2117 }
2118 }
2119
2120 uisec = isec - isec % 4;
2121 i4 = jj + ujsec - 1;
2122 for (j = jj; j <= i4; j += 4)
2123 {
2124 i5 = ii + uisec - 1;
2125 for (i = ii; i <= i5; i += 4)
2126 {
2127 f11 = c[i + j * c_dim1];
2128 f21 = c[i + 1 + j * c_dim1];
2129 f12 = c[i + (j + 1) * c_dim1];
2130 f22 = c[i + 1 + (j + 1) * c_dim1];
2131 f13 = c[i + (j + 2) * c_dim1];
2132 f23 = c[i + 1 + (j + 2) * c_dim1];
2133 f14 = c[i + (j + 3) * c_dim1];
2134 f24 = c[i + 1 + (j + 3) * c_dim1];
2135 f31 = c[i + 2 + j * c_dim1];
2136 f41 = c[i + 3 + j * c_dim1];
2137 f32 = c[i + 2 + (j + 1) * c_dim1];
2138 f42 = c[i + 3 + (j + 1) * c_dim1];
2139 f33 = c[i + 2 + (j + 2) * c_dim1];
2140 f43 = c[i + 3 + (j + 2) * c_dim1];
2141 f34 = c[i + 2 + (j + 3) * c_dim1];
2142 f44 = c[i + 3 + (j + 3) * c_dim1];
2143 i6 = ll + lsec - 1;
2144 for (l = ll; l <= i6; ++l)
2145 {
2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2147 * b[l + j * b_dim1];
2148 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2149 * b[l + j * b_dim1];
2150 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2151 * b[l + (j + 1) * b_dim1];
2152 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2153 * b[l + (j + 1) * b_dim1];
2154 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2155 * b[l + (j + 2) * b_dim1];
2156 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2157 * b[l + (j + 2) * b_dim1];
2158 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2159 * b[l + (j + 3) * b_dim1];
2160 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2161 * b[l + (j + 3) * b_dim1];
2162 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2163 * b[l + j * b_dim1];
2164 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2165 * b[l + j * b_dim1];
2166 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2167 * b[l + (j + 1) * b_dim1];
2168 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2169 * b[l + (j + 1) * b_dim1];
2170 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2171 * b[l + (j + 2) * b_dim1];
2172 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2173 * b[l + (j + 2) * b_dim1];
2174 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2175 * b[l + (j + 3) * b_dim1];
2176 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2177 * b[l + (j + 3) * b_dim1];
2178 }
2179 c[i + j * c_dim1] = f11;
2180 c[i + 1 + j * c_dim1] = f21;
2181 c[i + (j + 1) * c_dim1] = f12;
2182 c[i + 1 + (j + 1) * c_dim1] = f22;
2183 c[i + (j + 2) * c_dim1] = f13;
2184 c[i + 1 + (j + 2) * c_dim1] = f23;
2185 c[i + (j + 3) * c_dim1] = f14;
2186 c[i + 1 + (j + 3) * c_dim1] = f24;
2187 c[i + 2 + j * c_dim1] = f31;
2188 c[i + 3 + j * c_dim1] = f41;
2189 c[i + 2 + (j + 1) * c_dim1] = f32;
2190 c[i + 3 + (j + 1) * c_dim1] = f42;
2191 c[i + 2 + (j + 2) * c_dim1] = f33;
2192 c[i + 3 + (j + 2) * c_dim1] = f43;
2193 c[i + 2 + (j + 3) * c_dim1] = f34;
2194 c[i + 3 + (j + 3) * c_dim1] = f44;
2195 }
2196 if (uisec < isec)
2197 {
2198 i5 = ii + isec - 1;
2199 for (i = ii + uisec; i <= i5; ++i)
2200 {
2201 f11 = c[i + j * c_dim1];
2202 f12 = c[i + (j + 1) * c_dim1];
2203 f13 = c[i + (j + 2) * c_dim1];
2204 f14 = c[i + (j + 3) * c_dim1];
2205 i6 = ll + lsec - 1;
2206 for (l = ll; l <= i6; ++l)
2207 {
2208 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2209 257] * b[l + j * b_dim1];
2210 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2211 257] * b[l + (j + 1) * b_dim1];
2212 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2213 257] * b[l + (j + 2) * b_dim1];
2214 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2215 257] * b[l + (j + 3) * b_dim1];
2216 }
2217 c[i + j * c_dim1] = f11;
2218 c[i + (j + 1) * c_dim1] = f12;
2219 c[i + (j + 2) * c_dim1] = f13;
2220 c[i + (j + 3) * c_dim1] = f14;
2221 }
2222 }
2223 }
2224 if (ujsec < jsec)
2225 {
2226 i4 = jj + jsec - 1;
2227 for (j = jj + ujsec; j <= i4; ++j)
2228 {
2229 i5 = ii + uisec - 1;
2230 for (i = ii; i <= i5; i += 4)
2231 {
2232 f11 = c[i + j * c_dim1];
2233 f21 = c[i + 1 + j * c_dim1];
2234 f31 = c[i + 2 + j * c_dim1];
2235 f41 = c[i + 3 + j * c_dim1];
2236 i6 = ll + lsec - 1;
2237 for (l = ll; l <= i6; ++l)
2238 {
2239 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2240 257] * b[l + j * b_dim1];
2241 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2242 257] * b[l + j * b_dim1];
2243 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2244 257] * b[l + j * b_dim1];
2245 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2246 257] * b[l + j * b_dim1];
2247 }
2248 c[i + j * c_dim1] = f11;
2249 c[i + 1 + j * c_dim1] = f21;
2250 c[i + 2 + j * c_dim1] = f31;
2251 c[i + 3 + j * c_dim1] = f41;
2252 }
2253 i5 = ii + isec - 1;
2254 for (i = ii + uisec; i <= i5; ++i)
2255 {
2256 f11 = c[i + j * c_dim1];
2257 i6 = ll + lsec - 1;
2258 for (l = ll; l <= i6; ++l)
2259 {
2260 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2261 257] * b[l + j * b_dim1];
2262 }
2263 c[i + j * c_dim1] = f11;
2264 }
2265 }
2266 }
2267 }
2268 }
2269 }
2270 free(t1);
2271 return;
2272 }
2273 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2274 {
2275 if (GFC_DESCRIPTOR_RANK (a) != 1)
2276 {
2277 const GFC_REAL_8 *restrict abase_x;
2278 const GFC_REAL_8 *restrict bbase_y;
2279 GFC_REAL_8 *restrict dest_y;
2280 GFC_REAL_8 s;
2281
2282 for (y = 0; y < ycount; y++)
2283 {
2284 bbase_y = &bbase[y*bystride];
2285 dest_y = &dest[y*rystride];
2286 for (x = 0; x < xcount; x++)
2287 {
2288 abase_x = &abase[x*axstride];
2289 s = (GFC_REAL_8) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase_x[n] * bbase_y[n];
2292 dest_y[x] = s;
2293 }
2294 }
2295 }
2296 else
2297 {
2298 const GFC_REAL_8 *restrict bbase_y;
2299 GFC_REAL_8 s;
2300
2301 for (y = 0; y < ycount; y++)
2302 {
2303 bbase_y = &bbase[y*bystride];
2304 s = (GFC_REAL_8) 0;
2305 for (n = 0; n < count; n++)
2306 s += abase[n*axstride] * bbase_y[n];
2307 dest[y*rystride] = s;
2308 }
2309 }
2310 }
2311 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2312 {
2313 const GFC_REAL_8 *restrict bbase_y;
2314 GFC_REAL_8 s;
2315
2316 for (y = 0; y < ycount; y++)
2317 {
2318 bbase_y = &bbase[y*bystride];
2319 s = (GFC_REAL_8) 0;
2320 for (n = 0; n < count; n++)
2321 s += abase[n*axstride] * bbase_y[n*bxstride];
2322 dest[y*rxstride] = s;
2323 }
2324 }
2325 else if (axstride < aystride)
2326 {
2327 for (y = 0; y < ycount; y++)
2328 for (x = 0; x < xcount; x++)
2329 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2330
2331 for (y = 0; y < ycount; y++)
2332 for (n = 0; n < count; n++)
2333 for (x = 0; x < xcount; x++)
2334 /* dest[x,y] += a[x,n] * b[n,y] */
2335 dest[x*rxstride + y*rystride] +=
2336 abase[x*axstride + n*aystride] *
2337 bbase[n*bxstride + y*bystride];
2338 }
2339 else
2340 {
2341 const GFC_REAL_8 *restrict abase_x;
2342 const GFC_REAL_8 *restrict bbase_y;
2343 GFC_REAL_8 *restrict dest_y;
2344 GFC_REAL_8 s;
2345
2346 for (y = 0; y < ycount; y++)
2347 {
2348 bbase_y = &bbase[y*bystride];
2349 dest_y = &dest[y*rystride];
2350 for (x = 0; x < xcount; x++)
2351 {
2352 abase_x = &abase[x*axstride];
2353 s = (GFC_REAL_8) 0;
2354 for (n = 0; n < count; n++)
2355 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2356 dest_y[x*rxstride] = s;
2357 }
2358 }
2359 }
2360 }
2361 #undef POW3
2362 #undef min
2363 #undef max
2364
2365
2366 /* Compiling main function, with selection code for the processor. */
2367
2368 /* Currently, this is i386 only. Adjust for other architectures. */
2369
2370 void matmul_r8 (gfc_array_r8 * const restrict retarray,
2371 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2372 int blas_limit, blas_call gemm)
2373 {
2374 static void (*matmul_p) (gfc_array_r8 * const restrict retarray,
2375 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2376 int blas_limit, blas_call gemm);
2377
2378 void (*matmul_fn) (gfc_array_r8 * const restrict retarray,
2379 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2380 int blas_limit, blas_call gemm);
2381
2382 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2383 if (matmul_fn == NULL)
2384 {
2385 matmul_fn = matmul_r8_vanilla;
2386 if (__builtin_cpu_is ("intel"))
2387 {
2388 /* Run down the available processors in order of preference. */
2389 #ifdef HAVE_AVX512F
2390 if (__builtin_cpu_supports ("avx512f"))
2391 {
2392 matmul_fn = matmul_r8_avx512f;
2393 goto store;
2394 }
2395
2396 #endif /* HAVE_AVX512F */
2397
2398 #ifdef HAVE_AVX2
2399 if (__builtin_cpu_supports ("avx2")
2400 && __builtin_cpu_supports ("fma"))
2401 {
2402 matmul_fn = matmul_r8_avx2;
2403 goto store;
2404 }
2405
2406 #endif
2407
2408 #ifdef HAVE_AVX
2409 if (__builtin_cpu_supports ("avx"))
2410 {
2411 matmul_fn = matmul_r8_avx;
2412 goto store;
2413 }
2414 #endif /* HAVE_AVX */
2415 }
2416 else if (__builtin_cpu_is ("amd"))
2417 {
2418 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2419 if (__builtin_cpu_supports ("avx")
2420 && __builtin_cpu_supports ("fma"))
2421 {
2422 matmul_fn = matmul_r8_avx128_fma3;
2423 goto store;
2424 }
2425 #endif
2426 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2427 if (__builtin_cpu_supports ("avx")
2428 && __builtin_cpu_supports ("fma4"))
2429 {
2430 matmul_fn = matmul_r8_avx128_fma4;
2431 goto store;
2432 }
2433 #endif
2434
2435 }
2436 store:
2437 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2438 }
2439
2440 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2441 }
2442
2443 #else /* Just the vanilla function. */
2444
2445 void
2446 matmul_r8 (gfc_array_r8 * const restrict retarray,
2447 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2448 int blas_limit, blas_call gemm)
2449 {
2450 const GFC_REAL_8 * restrict abase;
2451 const GFC_REAL_8 * restrict bbase;
2452 GFC_REAL_8 * restrict dest;
2453
2454 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2455 index_type x, y, n, count, xcount, ycount;
2456
2457 assert (GFC_DESCRIPTOR_RANK (a) == 2
2458 || GFC_DESCRIPTOR_RANK (b) == 2);
2459
2460 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2461
2462 Either A or B (but not both) can be rank 1:
2463
2464 o One-dimensional argument A is implicitly treated as a row matrix
2465 dimensioned [1,count], so xcount=1.
2466
2467 o One-dimensional argument B is implicitly treated as a column matrix
2468 dimensioned [count, 1], so ycount=1.
2469 */
2470
2471 if (retarray->base_addr == NULL)
2472 {
2473 if (GFC_DESCRIPTOR_RANK (a) == 1)
2474 {
2475 GFC_DIMENSION_SET(retarray->dim[0], 0,
2476 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2477 }
2478 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2479 {
2480 GFC_DIMENSION_SET(retarray->dim[0], 0,
2481 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2482 }
2483 else
2484 {
2485 GFC_DIMENSION_SET(retarray->dim[0], 0,
2486 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2487
2488 GFC_DIMENSION_SET(retarray->dim[1], 0,
2489 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2490 GFC_DESCRIPTOR_EXTENT(retarray,0));
2491 }
2492
2493 retarray->base_addr
2494 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
2495 retarray->offset = 0;
2496 }
2497 else if (unlikely (compile_options.bounds_check))
2498 {
2499 index_type ret_extent, arg_extent;
2500
2501 if (GFC_DESCRIPTOR_RANK (a) == 1)
2502 {
2503 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2504 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2505 if (arg_extent != ret_extent)
2506 runtime_error ("Array bound mismatch for dimension 1 of "
2507 "array (%ld/%ld) ",
2508 (long int) ret_extent, (long int) arg_extent);
2509 }
2510 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2511 {
2512 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2513 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2514 if (arg_extent != ret_extent)
2515 runtime_error ("Array bound mismatch for dimension 1 of "
2516 "array (%ld/%ld) ",
2517 (long int) ret_extent, (long int) arg_extent);
2518 }
2519 else
2520 {
2521 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2522 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2523 if (arg_extent != ret_extent)
2524 runtime_error ("Array bound mismatch for dimension 1 of "
2525 "array (%ld/%ld) ",
2526 (long int) ret_extent, (long int) arg_extent);
2527
2528 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2529 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2530 if (arg_extent != ret_extent)
2531 runtime_error ("Array bound mismatch for dimension 2 of "
2532 "array (%ld/%ld) ",
2533 (long int) ret_extent, (long int) arg_extent);
2534 }
2535 }
2536
2537
2538 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2539 {
2540 /* One-dimensional result may be addressed in the code below
2541 either as a row or a column matrix. We want both cases to
2542 work. */
2543 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2544 }
2545 else
2546 {
2547 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2548 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2549 }
2550
2551
2552 if (GFC_DESCRIPTOR_RANK (a) == 1)
2553 {
2554 /* Treat it as a a row matrix A[1,count]. */
2555 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2556 aystride = 1;
2557
2558 xcount = 1;
2559 count = GFC_DESCRIPTOR_EXTENT(a,0);
2560 }
2561 else
2562 {
2563 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2564 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2565
2566 count = GFC_DESCRIPTOR_EXTENT(a,1);
2567 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2568 }
2569
2570 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2571 {
2572 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2573 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
2574 "in dimension 1: is %ld, should be %ld",
2575 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
2576 }
2577
2578 if (GFC_DESCRIPTOR_RANK (b) == 1)
2579 {
2580 /* Treat it as a column matrix B[count,1] */
2581 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2582
2583 /* bystride should never be used for 1-dimensional b.
2584 The value is only used for calculation of the
2585 memory by the buffer. */
2586 bystride = 256;
2587 ycount = 1;
2588 }
2589 else
2590 {
2591 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2592 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2593 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2594 }
2595
2596 abase = a->base_addr;
2597 bbase = b->base_addr;
2598 dest = retarray->base_addr;
2599
2600 /* Now that everything is set up, we perform the multiplication
2601 itself. */
2602
2603 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2604 #define min(a,b) ((a) <= (b) ? (a) : (b))
2605 #define max(a,b) ((a) >= (b) ? (a) : (b))
2606
2607 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2608 && (bxstride == 1 || bystride == 1)
2609 && (((float) xcount) * ((float) ycount) * ((float) count)
2610 > POW3(blas_limit)))
2611 {
2612 const int m = xcount, n = ycount, k = count, ldc = rystride;
2613 const GFC_REAL_8 one = 1, zero = 0;
2614 const int lda = (axstride == 1) ? aystride : axstride,
2615 ldb = (bxstride == 1) ? bystride : bxstride;
2616
2617 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2618 {
2619 assert (gemm != NULL);
2620 const char *transa, *transb;
2621 if (try_blas & 2)
2622 transa = "C";
2623 else
2624 transa = axstride == 1 ? "N" : "T";
2625
2626 if (try_blas & 4)
2627 transb = "C";
2628 else
2629 transb = bxstride == 1 ? "N" : "T";
2630
2631 gemm (transa, transb , &m,
2632 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2633 &ldc, 1, 1);
2634 return;
2635 }
2636 }
2637
2638 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2639 {
2640 /* This block of code implements a tuned matmul, derived from
2641 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2642
2643 Bo Kagstrom and Per Ling
2644 Department of Computing Science
2645 Umea University
2646 S-901 87 Umea, Sweden
2647
2648 from netlib.org, translated to C, and modified for matmul.m4. */
2649
2650 const GFC_REAL_8 *a, *b;
2651 GFC_REAL_8 *c;
2652 const index_type m = xcount, n = ycount, k = count;
2653
2654 /* System generated locals */
2655 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2656 i1, i2, i3, i4, i5, i6;
2657
2658 /* Local variables */
2659 GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
2660 f13, f14, f23, f24, f33, f34, f43, f44;
2661 index_type i, j, l, ii, jj, ll;
2662 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2663 GFC_REAL_8 *t1;
2664
2665 a = abase;
2666 b = bbase;
2667 c = retarray->base_addr;
2668
2669 /* Parameter adjustments */
2670 c_dim1 = rystride;
2671 c_offset = 1 + c_dim1;
2672 c -= c_offset;
2673 a_dim1 = aystride;
2674 a_offset = 1 + a_dim1;
2675 a -= a_offset;
2676 b_dim1 = bystride;
2677 b_offset = 1 + b_dim1;
2678 b -= b_offset;
2679
2680 /* Empty c first. */
2681 for (j=1; j<=n; j++)
2682 for (i=1; i<=m; i++)
2683 c[i + j * c_dim1] = (GFC_REAL_8)0;
2684
2685 /* Early exit if possible */
2686 if (m == 0 || n == 0 || k == 0)
2687 return;
2688
2689 /* Adjust size of t1 to what is needed. */
2690 index_type t1_dim, a_sz;
2691 if (aystride == 1)
2692 a_sz = rystride;
2693 else
2694 a_sz = a_dim1;
2695
2696 t1_dim = a_sz * 256 + b_dim1;
2697 if (t1_dim > 65536)
2698 t1_dim = 65536;
2699
2700 t1 = malloc (t1_dim * sizeof(GFC_REAL_8));
2701
2702 /* Start turning the crank. */
2703 i1 = n;
2704 for (jj = 1; jj <= i1; jj += 512)
2705 {
2706 /* Computing MIN */
2707 i2 = 512;
2708 i3 = n - jj + 1;
2709 jsec = min(i2,i3);
2710 ujsec = jsec - jsec % 4;
2711 i2 = k;
2712 for (ll = 1; ll <= i2; ll += 256)
2713 {
2714 /* Computing MIN */
2715 i3 = 256;
2716 i4 = k - ll + 1;
2717 lsec = min(i3,i4);
2718 ulsec = lsec - lsec % 2;
2719
2720 i3 = m;
2721 for (ii = 1; ii <= i3; ii += 256)
2722 {
2723 /* Computing MIN */
2724 i4 = 256;
2725 i5 = m - ii + 1;
2726 isec = min(i4,i5);
2727 uisec = isec - isec % 2;
2728 i4 = ll + ulsec - 1;
2729 for (l = ll; l <= i4; l += 2)
2730 {
2731 i5 = ii + uisec - 1;
2732 for (i = ii; i <= i5; i += 2)
2733 {
2734 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2735 a[i + l * a_dim1];
2736 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2737 a[i + (l + 1) * a_dim1];
2738 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2739 a[i + 1 + l * a_dim1];
2740 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2741 a[i + 1 + (l + 1) * a_dim1];
2742 }
2743 if (uisec < isec)
2744 {
2745 t1[l - ll + 1 + (isec << 8) - 257] =
2746 a[ii + isec - 1 + l * a_dim1];
2747 t1[l - ll + 2 + (isec << 8) - 257] =
2748 a[ii + isec - 1 + (l + 1) * a_dim1];
2749 }
2750 }
2751 if (ulsec < lsec)
2752 {
2753 i4 = ii + isec - 1;
2754 for (i = ii; i<= i4; ++i)
2755 {
2756 t1[lsec + ((i - ii + 1) << 8) - 257] =
2757 a[i + (ll + lsec - 1) * a_dim1];
2758 }
2759 }
2760
2761 uisec = isec - isec % 4;
2762 i4 = jj + ujsec - 1;
2763 for (j = jj; j <= i4; j += 4)
2764 {
2765 i5 = ii + uisec - 1;
2766 for (i = ii; i <= i5; i += 4)
2767 {
2768 f11 = c[i + j * c_dim1];
2769 f21 = c[i + 1 + j * c_dim1];
2770 f12 = c[i + (j + 1) * c_dim1];
2771 f22 = c[i + 1 + (j + 1) * c_dim1];
2772 f13 = c[i + (j + 2) * c_dim1];
2773 f23 = c[i + 1 + (j + 2) * c_dim1];
2774 f14 = c[i + (j + 3) * c_dim1];
2775 f24 = c[i + 1 + (j + 3) * c_dim1];
2776 f31 = c[i + 2 + j * c_dim1];
2777 f41 = c[i + 3 + j * c_dim1];
2778 f32 = c[i + 2 + (j + 1) * c_dim1];
2779 f42 = c[i + 3 + (j + 1) * c_dim1];
2780 f33 = c[i + 2 + (j + 2) * c_dim1];
2781 f43 = c[i + 3 + (j + 2) * c_dim1];
2782 f34 = c[i + 2 + (j + 3) * c_dim1];
2783 f44 = c[i + 3 + (j + 3) * c_dim1];
2784 i6 = ll + lsec - 1;
2785 for (l = ll; l <= i6; ++l)
2786 {
2787 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2788 * b[l + j * b_dim1];
2789 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2790 * b[l + j * b_dim1];
2791 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2792 * b[l + (j + 1) * b_dim1];
2793 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2794 * b[l + (j + 1) * b_dim1];
2795 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2796 * b[l + (j + 2) * b_dim1];
2797 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2798 * b[l + (j + 2) * b_dim1];
2799 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2800 * b[l + (j + 3) * b_dim1];
2801 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2802 * b[l + (j + 3) * b_dim1];
2803 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2804 * b[l + j * b_dim1];
2805 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2806 * b[l + j * b_dim1];
2807 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2808 * b[l + (j + 1) * b_dim1];
2809 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2810 * b[l + (j + 1) * b_dim1];
2811 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2812 * b[l + (j + 2) * b_dim1];
2813 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2814 * b[l + (j + 2) * b_dim1];
2815 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2816 * b[l + (j + 3) * b_dim1];
2817 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2818 * b[l + (j + 3) * b_dim1];
2819 }
2820 c[i + j * c_dim1] = f11;
2821 c[i + 1 + j * c_dim1] = f21;
2822 c[i + (j + 1) * c_dim1] = f12;
2823 c[i + 1 + (j + 1) * c_dim1] = f22;
2824 c[i + (j + 2) * c_dim1] = f13;
2825 c[i + 1 + (j + 2) * c_dim1] = f23;
2826 c[i + (j + 3) * c_dim1] = f14;
2827 c[i + 1 + (j + 3) * c_dim1] = f24;
2828 c[i + 2 + j * c_dim1] = f31;
2829 c[i + 3 + j * c_dim1] = f41;
2830 c[i + 2 + (j + 1) * c_dim1] = f32;
2831 c[i + 3 + (j + 1) * c_dim1] = f42;
2832 c[i + 2 + (j + 2) * c_dim1] = f33;
2833 c[i + 3 + (j + 2) * c_dim1] = f43;
2834 c[i + 2 + (j + 3) * c_dim1] = f34;
2835 c[i + 3 + (j + 3) * c_dim1] = f44;
2836 }
2837 if (uisec < isec)
2838 {
2839 i5 = ii + isec - 1;
2840 for (i = ii + uisec; i <= i5; ++i)
2841 {
2842 f11 = c[i + j * c_dim1];
2843 f12 = c[i + (j + 1) * c_dim1];
2844 f13 = c[i + (j + 2) * c_dim1];
2845 f14 = c[i + (j + 3) * c_dim1];
2846 i6 = ll + lsec - 1;
2847 for (l = ll; l <= i6; ++l)
2848 {
2849 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2850 257] * b[l + j * b_dim1];
2851 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2852 257] * b[l + (j + 1) * b_dim1];
2853 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2854 257] * b[l + (j + 2) * b_dim1];
2855 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2856 257] * b[l + (j + 3) * b_dim1];
2857 }
2858 c[i + j * c_dim1] = f11;
2859 c[i + (j + 1) * c_dim1] = f12;
2860 c[i + (j + 2) * c_dim1] = f13;
2861 c[i + (j + 3) * c_dim1] = f14;
2862 }
2863 }
2864 }
2865 if (ujsec < jsec)
2866 {
2867 i4 = jj + jsec - 1;
2868 for (j = jj + ujsec; j <= i4; ++j)
2869 {
2870 i5 = ii + uisec - 1;
2871 for (i = ii; i <= i5; i += 4)
2872 {
2873 f11 = c[i + j * c_dim1];
2874 f21 = c[i + 1 + j * c_dim1];
2875 f31 = c[i + 2 + j * c_dim1];
2876 f41 = c[i + 3 + j * c_dim1];
2877 i6 = ll + lsec - 1;
2878 for (l = ll; l <= i6; ++l)
2879 {
2880 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2881 257] * b[l + j * b_dim1];
2882 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2883 257] * b[l + j * b_dim1];
2884 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2885 257] * b[l + j * b_dim1];
2886 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2887 257] * b[l + j * b_dim1];
2888 }
2889 c[i + j * c_dim1] = f11;
2890 c[i + 1 + j * c_dim1] = f21;
2891 c[i + 2 + j * c_dim1] = f31;
2892 c[i + 3 + j * c_dim1] = f41;
2893 }
2894 i5 = ii + isec - 1;
2895 for (i = ii + uisec; i <= i5; ++i)
2896 {
2897 f11 = c[i + j * c_dim1];
2898 i6 = ll + lsec - 1;
2899 for (l = ll; l <= i6; ++l)
2900 {
2901 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2902 257] * b[l + j * b_dim1];
2903 }
2904 c[i + j * c_dim1] = f11;
2905 }
2906 }
2907 }
2908 }
2909 }
2910 }
2911 free(t1);
2912 return;
2913 }
2914 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2915 {
2916 if (GFC_DESCRIPTOR_RANK (a) != 1)
2917 {
2918 const GFC_REAL_8 *restrict abase_x;
2919 const GFC_REAL_8 *restrict bbase_y;
2920 GFC_REAL_8 *restrict dest_y;
2921 GFC_REAL_8 s;
2922
2923 for (y = 0; y < ycount; y++)
2924 {
2925 bbase_y = &bbase[y*bystride];
2926 dest_y = &dest[y*rystride];
2927 for (x = 0; x < xcount; x++)
2928 {
2929 abase_x = &abase[x*axstride];
2930 s = (GFC_REAL_8) 0;
2931 for (n = 0; n < count; n++)
2932 s += abase_x[n] * bbase_y[n];
2933 dest_y[x] = s;
2934 }
2935 }
2936 }
2937 else
2938 {
2939 const GFC_REAL_8 *restrict bbase_y;
2940 GFC_REAL_8 s;
2941
2942 for (y = 0; y < ycount; y++)
2943 {
2944 bbase_y = &bbase[y*bystride];
2945 s = (GFC_REAL_8) 0;
2946 for (n = 0; n < count; n++)
2947 s += abase[n*axstride] * bbase_y[n];
2948 dest[y*rystride] = s;
2949 }
2950 }
2951 }
2952 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2953 {
2954 const GFC_REAL_8 *restrict bbase_y;
2955 GFC_REAL_8 s;
2956
2957 for (y = 0; y < ycount; y++)
2958 {
2959 bbase_y = &bbase[y*bystride];
2960 s = (GFC_REAL_8) 0;
2961 for (n = 0; n < count; n++)
2962 s += abase[n*axstride] * bbase_y[n*bxstride];
2963 dest[y*rxstride] = s;
2964 }
2965 }
2966 else if (axstride < aystride)
2967 {
2968 for (y = 0; y < ycount; y++)
2969 for (x = 0; x < xcount; x++)
2970 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2971
2972 for (y = 0; y < ycount; y++)
2973 for (n = 0; n < count; n++)
2974 for (x = 0; x < xcount; x++)
2975 /* dest[x,y] += a[x,n] * b[n,y] */
2976 dest[x*rxstride + y*rystride] +=
2977 abase[x*axstride + n*aystride] *
2978 bbase[n*bxstride + y*bystride];
2979 }
2980 else
2981 {
2982 const GFC_REAL_8 *restrict abase_x;
2983 const GFC_REAL_8 *restrict bbase_y;
2984 GFC_REAL_8 *restrict dest_y;
2985 GFC_REAL_8 s;
2986
2987 for (y = 0; y < ycount; y++)
2988 {
2989 bbase_y = &bbase[y*bystride];
2990 dest_y = &dest[y*rystride];
2991 for (x = 0; x < xcount; x++)
2992 {
2993 abase_x = &abase[x*axstride];
2994 s = (GFC_REAL_8) 0;
2995 for (n = 0; n < count; n++)
2996 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2997 dest_y[x*rxstride] = s;
2998 }
2999 }
3000 }
3001 }
3002 #undef POW3
3003 #undef min
3004 #undef max
3005
3006 #endif
3007 #endif
3008