]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_r8.c
Update copyright years.
[thirdparty/gcc.git] / libgfortran / generated / matmul_r8.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_REAL_8)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
39 const int *, const GFC_REAL_8 *, const int *,
40 const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
73 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r8);
76
77 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
78 /* REAL types generate identical code for AVX and AVX2. Only generate
79 an AVX2 function if we are dealing with integer. */
80 #undef HAVE_AVX2
81 #endif
82
83
84 /* Put exhaustive list of possible architectures here here, ORed together. */
85
86 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
87
88 #ifdef HAVE_AVX
89 static void
90 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
91 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
92 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
93 static void
94 matmul_r8_avx (gfc_array_r8 * const restrict retarray,
95 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
96 int blas_limit, blas_call gemm)
97 {
98 const GFC_REAL_8 * restrict abase;
99 const GFC_REAL_8 * restrict bbase;
100 GFC_REAL_8 * restrict dest;
101
102 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
103 index_type x, y, n, count, xcount, ycount;
104
105 assert (GFC_DESCRIPTOR_RANK (a) == 2
106 || GFC_DESCRIPTOR_RANK (b) == 2);
107
108 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
109
110 Either A or B (but not both) can be rank 1:
111
112 o One-dimensional argument A is implicitly treated as a row matrix
113 dimensioned [1,count], so xcount=1.
114
115 o One-dimensional argument B is implicitly treated as a column matrix
116 dimensioned [count, 1], so ycount=1.
117 */
118
119 if (retarray->base_addr == NULL)
120 {
121 if (GFC_DESCRIPTOR_RANK (a) == 1)
122 {
123 GFC_DIMENSION_SET(retarray->dim[0], 0,
124 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
125 }
126 else if (GFC_DESCRIPTOR_RANK (b) == 1)
127 {
128 GFC_DIMENSION_SET(retarray->dim[0], 0,
129 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
130 }
131 else
132 {
133 GFC_DIMENSION_SET(retarray->dim[0], 0,
134 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
135
136 GFC_DIMENSION_SET(retarray->dim[1], 0,
137 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
138 GFC_DESCRIPTOR_EXTENT(retarray,0));
139 }
140
141 retarray->base_addr
142 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
143 retarray->offset = 0;
144 }
145 else if (unlikely (compile_options.bounds_check))
146 {
147 index_type ret_extent, arg_extent;
148
149 if (GFC_DESCRIPTOR_RANK (a) == 1)
150 {
151 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
152 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
153 if (arg_extent != ret_extent)
154 runtime_error ("Incorrect extent in return array in"
155 " MATMUL intrinsic: is %ld, should be %ld",
156 (long int) ret_extent, (long int) arg_extent);
157 }
158 else if (GFC_DESCRIPTOR_RANK (b) == 1)
159 {
160 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
161 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
162 if (arg_extent != ret_extent)
163 runtime_error ("Incorrect extent in return array in"
164 " MATMUL intrinsic: is %ld, should be %ld",
165 (long int) ret_extent, (long int) arg_extent);
166 }
167 else
168 {
169 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
171 if (arg_extent != ret_extent)
172 runtime_error ("Incorrect extent in return array in"
173 " MATMUL intrinsic for dimension 1:"
174 " is %ld, should be %ld",
175 (long int) ret_extent, (long int) arg_extent);
176
177 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
178 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
179 if (arg_extent != ret_extent)
180 runtime_error ("Incorrect extent in return array in"
181 " MATMUL intrinsic for dimension 2:"
182 " is %ld, should be %ld",
183 (long int) ret_extent, (long int) arg_extent);
184 }
185 }
186
187
188 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
189 {
190 /* One-dimensional result may be addressed in the code below
191 either as a row or a column matrix. We want both cases to
192 work. */
193 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
194 }
195 else
196 {
197 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
198 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
199 }
200
201
202 if (GFC_DESCRIPTOR_RANK (a) == 1)
203 {
204 /* Treat it as a a row matrix A[1,count]. */
205 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
206 aystride = 1;
207
208 xcount = 1;
209 count = GFC_DESCRIPTOR_EXTENT(a,0);
210 }
211 else
212 {
213 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
214 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
215
216 count = GFC_DESCRIPTOR_EXTENT(a,1);
217 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
218 }
219
220 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
221 {
222 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
223 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
224 }
225
226 if (GFC_DESCRIPTOR_RANK (b) == 1)
227 {
228 /* Treat it as a column matrix B[count,1] */
229 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
230
231 /* bystride should never be used for 1-dimensional b.
232 in case it is we want it to cause a segfault, rather than
233 an incorrect result. */
234 bystride = 0xDEADBEEF;
235 ycount = 1;
236 }
237 else
238 {
239 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
240 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
241 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
242 }
243
244 abase = a->base_addr;
245 bbase = b->base_addr;
246 dest = retarray->base_addr;
247
248 /* Now that everything is set up, we perform the multiplication
249 itself. */
250
251 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
252 #define min(a,b) ((a) <= (b) ? (a) : (b))
253 #define max(a,b) ((a) >= (b) ? (a) : (b))
254
255 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
256 && (bxstride == 1 || bystride == 1)
257 && (((float) xcount) * ((float) ycount) * ((float) count)
258 > POW3(blas_limit)))
259 {
260 const int m = xcount, n = ycount, k = count, ldc = rystride;
261 const GFC_REAL_8 one = 1, zero = 0;
262 const int lda = (axstride == 1) ? aystride : axstride,
263 ldb = (bxstride == 1) ? bystride : bxstride;
264
265 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
266 {
267 assert (gemm != NULL);
268 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
269 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
270 &ldc, 1, 1);
271 return;
272 }
273 }
274
275 if (rxstride == 1 && axstride == 1 && bxstride == 1)
276 {
277 /* This block of code implements a tuned matmul, derived from
278 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
279
280 Bo Kagstrom and Per Ling
281 Department of Computing Science
282 Umea University
283 S-901 87 Umea, Sweden
284
285 from netlib.org, translated to C, and modified for matmul.m4. */
286
287 const GFC_REAL_8 *a, *b;
288 GFC_REAL_8 *c;
289 const index_type m = xcount, n = ycount, k = count;
290
291 /* System generated locals */
292 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
293 i1, i2, i3, i4, i5, i6;
294
295 /* Local variables */
296 GFC_REAL_8 t1[65536], /* was [256][256] */
297 f11, f12, f21, f22, f31, f32, f41, f42,
298 f13, f14, f23, f24, f33, f34, f43, f44;
299 index_type i, j, l, ii, jj, ll;
300 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
301
302 a = abase;
303 b = bbase;
304 c = retarray->base_addr;
305
306 /* Parameter adjustments */
307 c_dim1 = rystride;
308 c_offset = 1 + c_dim1;
309 c -= c_offset;
310 a_dim1 = aystride;
311 a_offset = 1 + a_dim1;
312 a -= a_offset;
313 b_dim1 = bystride;
314 b_offset = 1 + b_dim1;
315 b -= b_offset;
316
317 /* Early exit if possible */
318 if (m == 0 || n == 0 || k == 0)
319 return;
320
321 /* Empty c first. */
322 for (j=1; j<=n; j++)
323 for (i=1; i<=m; i++)
324 c[i + j * c_dim1] = (GFC_REAL_8)0;
325
326 /* Start turning the crank. */
327 i1 = n;
328 for (jj = 1; jj <= i1; jj += 512)
329 {
330 /* Computing MIN */
331 i2 = 512;
332 i3 = n - jj + 1;
333 jsec = min(i2,i3);
334 ujsec = jsec - jsec % 4;
335 i2 = k;
336 for (ll = 1; ll <= i2; ll += 256)
337 {
338 /* Computing MIN */
339 i3 = 256;
340 i4 = k - ll + 1;
341 lsec = min(i3,i4);
342 ulsec = lsec - lsec % 2;
343
344 i3 = m;
345 for (ii = 1; ii <= i3; ii += 256)
346 {
347 /* Computing MIN */
348 i4 = 256;
349 i5 = m - ii + 1;
350 isec = min(i4,i5);
351 uisec = isec - isec % 2;
352 i4 = ll + ulsec - 1;
353 for (l = ll; l <= i4; l += 2)
354 {
355 i5 = ii + uisec - 1;
356 for (i = ii; i <= i5; i += 2)
357 {
358 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
359 a[i + l * a_dim1];
360 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
361 a[i + (l + 1) * a_dim1];
362 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
363 a[i + 1 + l * a_dim1];
364 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
365 a[i + 1 + (l + 1) * a_dim1];
366 }
367 if (uisec < isec)
368 {
369 t1[l - ll + 1 + (isec << 8) - 257] =
370 a[ii + isec - 1 + l * a_dim1];
371 t1[l - ll + 2 + (isec << 8) - 257] =
372 a[ii + isec - 1 + (l + 1) * a_dim1];
373 }
374 }
375 if (ulsec < lsec)
376 {
377 i4 = ii + isec - 1;
378 for (i = ii; i<= i4; ++i)
379 {
380 t1[lsec + ((i - ii + 1) << 8) - 257] =
381 a[i + (ll + lsec - 1) * a_dim1];
382 }
383 }
384
385 uisec = isec - isec % 4;
386 i4 = jj + ujsec - 1;
387 for (j = jj; j <= i4; j += 4)
388 {
389 i5 = ii + uisec - 1;
390 for (i = ii; i <= i5; i += 4)
391 {
392 f11 = c[i + j * c_dim1];
393 f21 = c[i + 1 + j * c_dim1];
394 f12 = c[i + (j + 1) * c_dim1];
395 f22 = c[i + 1 + (j + 1) * c_dim1];
396 f13 = c[i + (j + 2) * c_dim1];
397 f23 = c[i + 1 + (j + 2) * c_dim1];
398 f14 = c[i + (j + 3) * c_dim1];
399 f24 = c[i + 1 + (j + 3) * c_dim1];
400 f31 = c[i + 2 + j * c_dim1];
401 f41 = c[i + 3 + j * c_dim1];
402 f32 = c[i + 2 + (j + 1) * c_dim1];
403 f42 = c[i + 3 + (j + 1) * c_dim1];
404 f33 = c[i + 2 + (j + 2) * c_dim1];
405 f43 = c[i + 3 + (j + 2) * c_dim1];
406 f34 = c[i + 2 + (j + 3) * c_dim1];
407 f44 = c[i + 3 + (j + 3) * c_dim1];
408 i6 = ll + lsec - 1;
409 for (l = ll; l <= i6; ++l)
410 {
411 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
412 * b[l + j * b_dim1];
413 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
414 * b[l + j * b_dim1];
415 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
416 * b[l + (j + 1) * b_dim1];
417 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
418 * b[l + (j + 1) * b_dim1];
419 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
420 * b[l + (j + 2) * b_dim1];
421 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
422 * b[l + (j + 2) * b_dim1];
423 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
424 * b[l + (j + 3) * b_dim1];
425 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
426 * b[l + (j + 3) * b_dim1];
427 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
428 * b[l + j * b_dim1];
429 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
430 * b[l + j * b_dim1];
431 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
432 * b[l + (j + 1) * b_dim1];
433 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
434 * b[l + (j + 1) * b_dim1];
435 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
436 * b[l + (j + 2) * b_dim1];
437 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
438 * b[l + (j + 2) * b_dim1];
439 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
440 * b[l + (j + 3) * b_dim1];
441 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
442 * b[l + (j + 3) * b_dim1];
443 }
444 c[i + j * c_dim1] = f11;
445 c[i + 1 + j * c_dim1] = f21;
446 c[i + (j + 1) * c_dim1] = f12;
447 c[i + 1 + (j + 1) * c_dim1] = f22;
448 c[i + (j + 2) * c_dim1] = f13;
449 c[i + 1 + (j + 2) * c_dim1] = f23;
450 c[i + (j + 3) * c_dim1] = f14;
451 c[i + 1 + (j + 3) * c_dim1] = f24;
452 c[i + 2 + j * c_dim1] = f31;
453 c[i + 3 + j * c_dim1] = f41;
454 c[i + 2 + (j + 1) * c_dim1] = f32;
455 c[i + 3 + (j + 1) * c_dim1] = f42;
456 c[i + 2 + (j + 2) * c_dim1] = f33;
457 c[i + 3 + (j + 2) * c_dim1] = f43;
458 c[i + 2 + (j + 3) * c_dim1] = f34;
459 c[i + 3 + (j + 3) * c_dim1] = f44;
460 }
461 if (uisec < isec)
462 {
463 i5 = ii + isec - 1;
464 for (i = ii + uisec; i <= i5; ++i)
465 {
466 f11 = c[i + j * c_dim1];
467 f12 = c[i + (j + 1) * c_dim1];
468 f13 = c[i + (j + 2) * c_dim1];
469 f14 = c[i + (j + 3) * c_dim1];
470 i6 = ll + lsec - 1;
471 for (l = ll; l <= i6; ++l)
472 {
473 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
474 257] * b[l + j * b_dim1];
475 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + (j + 1) * b_dim1];
477 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
478 257] * b[l + (j + 2) * b_dim1];
479 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + (j + 3) * b_dim1];
481 }
482 c[i + j * c_dim1] = f11;
483 c[i + (j + 1) * c_dim1] = f12;
484 c[i + (j + 2) * c_dim1] = f13;
485 c[i + (j + 3) * c_dim1] = f14;
486 }
487 }
488 }
489 if (ujsec < jsec)
490 {
491 i4 = jj + jsec - 1;
492 for (j = jj + ujsec; j <= i4; ++j)
493 {
494 i5 = ii + uisec - 1;
495 for (i = ii; i <= i5; i += 4)
496 {
497 f11 = c[i + j * c_dim1];
498 f21 = c[i + 1 + j * c_dim1];
499 f31 = c[i + 2 + j * c_dim1];
500 f41 = c[i + 3 + j * c_dim1];
501 i6 = ll + lsec - 1;
502 for (l = ll; l <= i6; ++l)
503 {
504 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
505 257] * b[l + j * b_dim1];
506 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
507 257] * b[l + j * b_dim1];
508 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
509 257] * b[l + j * b_dim1];
510 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
511 257] * b[l + j * b_dim1];
512 }
513 c[i + j * c_dim1] = f11;
514 c[i + 1 + j * c_dim1] = f21;
515 c[i + 2 + j * c_dim1] = f31;
516 c[i + 3 + j * c_dim1] = f41;
517 }
518 i5 = ii + isec - 1;
519 for (i = ii + uisec; i <= i5; ++i)
520 {
521 f11 = c[i + j * c_dim1];
522 i6 = ll + lsec - 1;
523 for (l = ll; l <= i6; ++l)
524 {
525 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
526 257] * b[l + j * b_dim1];
527 }
528 c[i + j * c_dim1] = f11;
529 }
530 }
531 }
532 }
533 }
534 }
535 return;
536 }
537 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
538 {
539 if (GFC_DESCRIPTOR_RANK (a) != 1)
540 {
541 const GFC_REAL_8 *restrict abase_x;
542 const GFC_REAL_8 *restrict bbase_y;
543 GFC_REAL_8 *restrict dest_y;
544 GFC_REAL_8 s;
545
546 for (y = 0; y < ycount; y++)
547 {
548 bbase_y = &bbase[y*bystride];
549 dest_y = &dest[y*rystride];
550 for (x = 0; x < xcount; x++)
551 {
552 abase_x = &abase[x*axstride];
553 s = (GFC_REAL_8) 0;
554 for (n = 0; n < count; n++)
555 s += abase_x[n] * bbase_y[n];
556 dest_y[x] = s;
557 }
558 }
559 }
560 else
561 {
562 const GFC_REAL_8 *restrict bbase_y;
563 GFC_REAL_8 s;
564
565 for (y = 0; y < ycount; y++)
566 {
567 bbase_y = &bbase[y*bystride];
568 s = (GFC_REAL_8) 0;
569 for (n = 0; n < count; n++)
570 s += abase[n*axstride] * bbase_y[n];
571 dest[y*rystride] = s;
572 }
573 }
574 }
575 else if (axstride < aystride)
576 {
577 for (y = 0; y < ycount; y++)
578 for (x = 0; x < xcount; x++)
579 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
580
581 for (y = 0; y < ycount; y++)
582 for (n = 0; n < count; n++)
583 for (x = 0; x < xcount; x++)
584 /* dest[x,y] += a[x,n] * b[n,y] */
585 dest[x*rxstride + y*rystride] +=
586 abase[x*axstride + n*aystride] *
587 bbase[n*bxstride + y*bystride];
588 }
589 else if (GFC_DESCRIPTOR_RANK (a) == 1)
590 {
591 const GFC_REAL_8 *restrict bbase_y;
592 GFC_REAL_8 s;
593
594 for (y = 0; y < ycount; y++)
595 {
596 bbase_y = &bbase[y*bystride];
597 s = (GFC_REAL_8) 0;
598 for (n = 0; n < count; n++)
599 s += abase[n*axstride] * bbase_y[n*bxstride];
600 dest[y*rxstride] = s;
601 }
602 }
603 else
604 {
605 const GFC_REAL_8 *restrict abase_x;
606 const GFC_REAL_8 *restrict bbase_y;
607 GFC_REAL_8 *restrict dest_y;
608 GFC_REAL_8 s;
609
610 for (y = 0; y < ycount; y++)
611 {
612 bbase_y = &bbase[y*bystride];
613 dest_y = &dest[y*rystride];
614 for (x = 0; x < xcount; x++)
615 {
616 abase_x = &abase[x*axstride];
617 s = (GFC_REAL_8) 0;
618 for (n = 0; n < count; n++)
619 s += abase_x[n*aystride] * bbase_y[n*bxstride];
620 dest_y[x*rxstride] = s;
621 }
622 }
623 }
624 }
625 #undef POW3
626 #undef min
627 #undef max
628
629 #endif /* HAVE_AVX */
630
631 #ifdef HAVE_AVX2
632 static void
633 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
634 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
635 int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
636 static void
637 matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
638 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
639 int blas_limit, blas_call gemm)
640 {
641 const GFC_REAL_8 * restrict abase;
642 const GFC_REAL_8 * restrict bbase;
643 GFC_REAL_8 * restrict dest;
644
645 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
646 index_type x, y, n, count, xcount, ycount;
647
648 assert (GFC_DESCRIPTOR_RANK (a) == 2
649 || GFC_DESCRIPTOR_RANK (b) == 2);
650
651 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
652
653 Either A or B (but not both) can be rank 1:
654
655 o One-dimensional argument A is implicitly treated as a row matrix
656 dimensioned [1,count], so xcount=1.
657
658 o One-dimensional argument B is implicitly treated as a column matrix
659 dimensioned [count, 1], so ycount=1.
660 */
661
662 if (retarray->base_addr == NULL)
663 {
664 if (GFC_DESCRIPTOR_RANK (a) == 1)
665 {
666 GFC_DIMENSION_SET(retarray->dim[0], 0,
667 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
668 }
669 else if (GFC_DESCRIPTOR_RANK (b) == 1)
670 {
671 GFC_DIMENSION_SET(retarray->dim[0], 0,
672 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
673 }
674 else
675 {
676 GFC_DIMENSION_SET(retarray->dim[0], 0,
677 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
678
679 GFC_DIMENSION_SET(retarray->dim[1], 0,
680 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
681 GFC_DESCRIPTOR_EXTENT(retarray,0));
682 }
683
684 retarray->base_addr
685 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
686 retarray->offset = 0;
687 }
688 else if (unlikely (compile_options.bounds_check))
689 {
690 index_type ret_extent, arg_extent;
691
692 if (GFC_DESCRIPTOR_RANK (a) == 1)
693 {
694 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
695 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
696 if (arg_extent != ret_extent)
697 runtime_error ("Incorrect extent in return array in"
698 " MATMUL intrinsic: is %ld, should be %ld",
699 (long int) ret_extent, (long int) arg_extent);
700 }
701 else if (GFC_DESCRIPTOR_RANK (b) == 1)
702 {
703 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
705 if (arg_extent != ret_extent)
706 runtime_error ("Incorrect extent in return array in"
707 " MATMUL intrinsic: is %ld, should be %ld",
708 (long int) ret_extent, (long int) arg_extent);
709 }
710 else
711 {
712 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
714 if (arg_extent != ret_extent)
715 runtime_error ("Incorrect extent in return array in"
716 " MATMUL intrinsic for dimension 1:"
717 " is %ld, should be %ld",
718 (long int) ret_extent, (long int) arg_extent);
719
720 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
721 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
722 if (arg_extent != ret_extent)
723 runtime_error ("Incorrect extent in return array in"
724 " MATMUL intrinsic for dimension 2:"
725 " is %ld, should be %ld",
726 (long int) ret_extent, (long int) arg_extent);
727 }
728 }
729
730
731 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
732 {
733 /* One-dimensional result may be addressed in the code below
734 either as a row or a column matrix. We want both cases to
735 work. */
736 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
737 }
738 else
739 {
740 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
741 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
742 }
743
744
745 if (GFC_DESCRIPTOR_RANK (a) == 1)
746 {
747 /* Treat it as a a row matrix A[1,count]. */
748 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
749 aystride = 1;
750
751 xcount = 1;
752 count = GFC_DESCRIPTOR_EXTENT(a,0);
753 }
754 else
755 {
756 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
757 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
758
759 count = GFC_DESCRIPTOR_EXTENT(a,1);
760 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
761 }
762
763 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
764 {
765 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
766 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
767 }
768
769 if (GFC_DESCRIPTOR_RANK (b) == 1)
770 {
771 /* Treat it as a column matrix B[count,1] */
772 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
773
774 /* bystride should never be used for 1-dimensional b.
775 in case it is we want it to cause a segfault, rather than
776 an incorrect result. */
777 bystride = 0xDEADBEEF;
778 ycount = 1;
779 }
780 else
781 {
782 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
783 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
784 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
785 }
786
787 abase = a->base_addr;
788 bbase = b->base_addr;
789 dest = retarray->base_addr;
790
791 /* Now that everything is set up, we perform the multiplication
792 itself. */
793
794 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
795 #define min(a,b) ((a) <= (b) ? (a) : (b))
796 #define max(a,b) ((a) >= (b) ? (a) : (b))
797
798 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
799 && (bxstride == 1 || bystride == 1)
800 && (((float) xcount) * ((float) ycount) * ((float) count)
801 > POW3(blas_limit)))
802 {
803 const int m = xcount, n = ycount, k = count, ldc = rystride;
804 const GFC_REAL_8 one = 1, zero = 0;
805 const int lda = (axstride == 1) ? aystride : axstride,
806 ldb = (bxstride == 1) ? bystride : bxstride;
807
808 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
809 {
810 assert (gemm != NULL);
811 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
812 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
813 &ldc, 1, 1);
814 return;
815 }
816 }
817
818 if (rxstride == 1 && axstride == 1 && bxstride == 1)
819 {
820 /* This block of code implements a tuned matmul, derived from
821 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
822
823 Bo Kagstrom and Per Ling
824 Department of Computing Science
825 Umea University
826 S-901 87 Umea, Sweden
827
828 from netlib.org, translated to C, and modified for matmul.m4. */
829
830 const GFC_REAL_8 *a, *b;
831 GFC_REAL_8 *c;
832 const index_type m = xcount, n = ycount, k = count;
833
834 /* System generated locals */
835 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
836 i1, i2, i3, i4, i5, i6;
837
838 /* Local variables */
839 GFC_REAL_8 t1[65536], /* was [256][256] */
840 f11, f12, f21, f22, f31, f32, f41, f42,
841 f13, f14, f23, f24, f33, f34, f43, f44;
842 index_type i, j, l, ii, jj, ll;
843 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
844
845 a = abase;
846 b = bbase;
847 c = retarray->base_addr;
848
849 /* Parameter adjustments */
850 c_dim1 = rystride;
851 c_offset = 1 + c_dim1;
852 c -= c_offset;
853 a_dim1 = aystride;
854 a_offset = 1 + a_dim1;
855 a -= a_offset;
856 b_dim1 = bystride;
857 b_offset = 1 + b_dim1;
858 b -= b_offset;
859
860 /* Early exit if possible */
861 if (m == 0 || n == 0 || k == 0)
862 return;
863
864 /* Empty c first. */
865 for (j=1; j<=n; j++)
866 for (i=1; i<=m; i++)
867 c[i + j * c_dim1] = (GFC_REAL_8)0;
868
869 /* Start turning the crank. */
870 i1 = n;
871 for (jj = 1; jj <= i1; jj += 512)
872 {
873 /* Computing MIN */
874 i2 = 512;
875 i3 = n - jj + 1;
876 jsec = min(i2,i3);
877 ujsec = jsec - jsec % 4;
878 i2 = k;
879 for (ll = 1; ll <= i2; ll += 256)
880 {
881 /* Computing MIN */
882 i3 = 256;
883 i4 = k - ll + 1;
884 lsec = min(i3,i4);
885 ulsec = lsec - lsec % 2;
886
887 i3 = m;
888 for (ii = 1; ii <= i3; ii += 256)
889 {
890 /* Computing MIN */
891 i4 = 256;
892 i5 = m - ii + 1;
893 isec = min(i4,i5);
894 uisec = isec - isec % 2;
895 i4 = ll + ulsec - 1;
896 for (l = ll; l <= i4; l += 2)
897 {
898 i5 = ii + uisec - 1;
899 for (i = ii; i <= i5; i += 2)
900 {
901 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
902 a[i + l * a_dim1];
903 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
904 a[i + (l + 1) * a_dim1];
905 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
906 a[i + 1 + l * a_dim1];
907 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
908 a[i + 1 + (l + 1) * a_dim1];
909 }
910 if (uisec < isec)
911 {
912 t1[l - ll + 1 + (isec << 8) - 257] =
913 a[ii + isec - 1 + l * a_dim1];
914 t1[l - ll + 2 + (isec << 8) - 257] =
915 a[ii + isec - 1 + (l + 1) * a_dim1];
916 }
917 }
918 if (ulsec < lsec)
919 {
920 i4 = ii + isec - 1;
921 for (i = ii; i<= i4; ++i)
922 {
923 t1[lsec + ((i - ii + 1) << 8) - 257] =
924 a[i + (ll + lsec - 1) * a_dim1];
925 }
926 }
927
928 uisec = isec - isec % 4;
929 i4 = jj + ujsec - 1;
930 for (j = jj; j <= i4; j += 4)
931 {
932 i5 = ii + uisec - 1;
933 for (i = ii; i <= i5; i += 4)
934 {
935 f11 = c[i + j * c_dim1];
936 f21 = c[i + 1 + j * c_dim1];
937 f12 = c[i + (j + 1) * c_dim1];
938 f22 = c[i + 1 + (j + 1) * c_dim1];
939 f13 = c[i + (j + 2) * c_dim1];
940 f23 = c[i + 1 + (j + 2) * c_dim1];
941 f14 = c[i + (j + 3) * c_dim1];
942 f24 = c[i + 1 + (j + 3) * c_dim1];
943 f31 = c[i + 2 + j * c_dim1];
944 f41 = c[i + 3 + j * c_dim1];
945 f32 = c[i + 2 + (j + 1) * c_dim1];
946 f42 = c[i + 3 + (j + 1) * c_dim1];
947 f33 = c[i + 2 + (j + 2) * c_dim1];
948 f43 = c[i + 3 + (j + 2) * c_dim1];
949 f34 = c[i + 2 + (j + 3) * c_dim1];
950 f44 = c[i + 3 + (j + 3) * c_dim1];
951 i6 = ll + lsec - 1;
952 for (l = ll; l <= i6; ++l)
953 {
954 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
955 * b[l + j * b_dim1];
956 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
957 * b[l + j * b_dim1];
958 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
959 * b[l + (j + 1) * b_dim1];
960 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
961 * b[l + (j + 1) * b_dim1];
962 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + (j + 2) * b_dim1];
964 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + (j + 2) * b_dim1];
966 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
967 * b[l + (j + 3) * b_dim1];
968 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
969 * b[l + (j + 3) * b_dim1];
970 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
971 * b[l + j * b_dim1];
972 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
973 * b[l + j * b_dim1];
974 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
975 * b[l + (j + 1) * b_dim1];
976 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
977 * b[l + (j + 1) * b_dim1];
978 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + (j + 2) * b_dim1];
980 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + (j + 2) * b_dim1];
982 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
983 * b[l + (j + 3) * b_dim1];
984 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
985 * b[l + (j + 3) * b_dim1];
986 }
987 c[i + j * c_dim1] = f11;
988 c[i + 1 + j * c_dim1] = f21;
989 c[i + (j + 1) * c_dim1] = f12;
990 c[i + 1 + (j + 1) * c_dim1] = f22;
991 c[i + (j + 2) * c_dim1] = f13;
992 c[i + 1 + (j + 2) * c_dim1] = f23;
993 c[i + (j + 3) * c_dim1] = f14;
994 c[i + 1 + (j + 3) * c_dim1] = f24;
995 c[i + 2 + j * c_dim1] = f31;
996 c[i + 3 + j * c_dim1] = f41;
997 c[i + 2 + (j + 1) * c_dim1] = f32;
998 c[i + 3 + (j + 1) * c_dim1] = f42;
999 c[i + 2 + (j + 2) * c_dim1] = f33;
1000 c[i + 3 + (j + 2) * c_dim1] = f43;
1001 c[i + 2 + (j + 3) * c_dim1] = f34;
1002 c[i + 3 + (j + 3) * c_dim1] = f44;
1003 }
1004 if (uisec < isec)
1005 {
1006 i5 = ii + isec - 1;
1007 for (i = ii + uisec; i <= i5; ++i)
1008 {
1009 f11 = c[i + j * c_dim1];
1010 f12 = c[i + (j + 1) * c_dim1];
1011 f13 = c[i + (j + 2) * c_dim1];
1012 f14 = c[i + (j + 3) * c_dim1];
1013 i6 = ll + lsec - 1;
1014 for (l = ll; l <= i6; ++l)
1015 {
1016 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1017 257] * b[l + j * b_dim1];
1018 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1019 257] * b[l + (j + 1) * b_dim1];
1020 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1021 257] * b[l + (j + 2) * b_dim1];
1022 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1023 257] * b[l + (j + 3) * b_dim1];
1024 }
1025 c[i + j * c_dim1] = f11;
1026 c[i + (j + 1) * c_dim1] = f12;
1027 c[i + (j + 2) * c_dim1] = f13;
1028 c[i + (j + 3) * c_dim1] = f14;
1029 }
1030 }
1031 }
1032 if (ujsec < jsec)
1033 {
1034 i4 = jj + jsec - 1;
1035 for (j = jj + ujsec; j <= i4; ++j)
1036 {
1037 i5 = ii + uisec - 1;
1038 for (i = ii; i <= i5; i += 4)
1039 {
1040 f11 = c[i + j * c_dim1];
1041 f21 = c[i + 1 + j * c_dim1];
1042 f31 = c[i + 2 + j * c_dim1];
1043 f41 = c[i + 3 + j * c_dim1];
1044 i6 = ll + lsec - 1;
1045 for (l = ll; l <= i6; ++l)
1046 {
1047 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1048 257] * b[l + j * b_dim1];
1049 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1050 257] * b[l + j * b_dim1];
1051 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1052 257] * b[l + j * b_dim1];
1053 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1054 257] * b[l + j * b_dim1];
1055 }
1056 c[i + j * c_dim1] = f11;
1057 c[i + 1 + j * c_dim1] = f21;
1058 c[i + 2 + j * c_dim1] = f31;
1059 c[i + 3 + j * c_dim1] = f41;
1060 }
1061 i5 = ii + isec - 1;
1062 for (i = ii + uisec; i <= i5; ++i)
1063 {
1064 f11 = c[i + j * c_dim1];
1065 i6 = ll + lsec - 1;
1066 for (l = ll; l <= i6; ++l)
1067 {
1068 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1069 257] * b[l + j * b_dim1];
1070 }
1071 c[i + j * c_dim1] = f11;
1072 }
1073 }
1074 }
1075 }
1076 }
1077 }
1078 return;
1079 }
1080 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1081 {
1082 if (GFC_DESCRIPTOR_RANK (a) != 1)
1083 {
1084 const GFC_REAL_8 *restrict abase_x;
1085 const GFC_REAL_8 *restrict bbase_y;
1086 GFC_REAL_8 *restrict dest_y;
1087 GFC_REAL_8 s;
1088
1089 for (y = 0; y < ycount; y++)
1090 {
1091 bbase_y = &bbase[y*bystride];
1092 dest_y = &dest[y*rystride];
1093 for (x = 0; x < xcount; x++)
1094 {
1095 abase_x = &abase[x*axstride];
1096 s = (GFC_REAL_8) 0;
1097 for (n = 0; n < count; n++)
1098 s += abase_x[n] * bbase_y[n];
1099 dest_y[x] = s;
1100 }
1101 }
1102 }
1103 else
1104 {
1105 const GFC_REAL_8 *restrict bbase_y;
1106 GFC_REAL_8 s;
1107
1108 for (y = 0; y < ycount; y++)
1109 {
1110 bbase_y = &bbase[y*bystride];
1111 s = (GFC_REAL_8) 0;
1112 for (n = 0; n < count; n++)
1113 s += abase[n*axstride] * bbase_y[n];
1114 dest[y*rystride] = s;
1115 }
1116 }
1117 }
1118 else if (axstride < aystride)
1119 {
1120 for (y = 0; y < ycount; y++)
1121 for (x = 0; x < xcount; x++)
1122 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1123
1124 for (y = 0; y < ycount; y++)
1125 for (n = 0; n < count; n++)
1126 for (x = 0; x < xcount; x++)
1127 /* dest[x,y] += a[x,n] * b[n,y] */
1128 dest[x*rxstride + y*rystride] +=
1129 abase[x*axstride + n*aystride] *
1130 bbase[n*bxstride + y*bystride];
1131 }
1132 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1133 {
1134 const GFC_REAL_8 *restrict bbase_y;
1135 GFC_REAL_8 s;
1136
1137 for (y = 0; y < ycount; y++)
1138 {
1139 bbase_y = &bbase[y*bystride];
1140 s = (GFC_REAL_8) 0;
1141 for (n = 0; n < count; n++)
1142 s += abase[n*axstride] * bbase_y[n*bxstride];
1143 dest[y*rxstride] = s;
1144 }
1145 }
1146 else
1147 {
1148 const GFC_REAL_8 *restrict abase_x;
1149 const GFC_REAL_8 *restrict bbase_y;
1150 GFC_REAL_8 *restrict dest_y;
1151 GFC_REAL_8 s;
1152
1153 for (y = 0; y < ycount; y++)
1154 {
1155 bbase_y = &bbase[y*bystride];
1156 dest_y = &dest[y*rystride];
1157 for (x = 0; x < xcount; x++)
1158 {
1159 abase_x = &abase[x*axstride];
1160 s = (GFC_REAL_8) 0;
1161 for (n = 0; n < count; n++)
1162 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1163 dest_y[x*rxstride] = s;
1164 }
1165 }
1166 }
1167 }
1168 #undef POW3
1169 #undef min
1170 #undef max
1171
1172 #endif /* HAVE_AVX2 */
1173
1174 #ifdef HAVE_AVX512F
1175 static void
1176 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1177 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1178 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1179 static void
1180 matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
1181 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1182 int blas_limit, blas_call gemm)
1183 {
1184 const GFC_REAL_8 * restrict abase;
1185 const GFC_REAL_8 * restrict bbase;
1186 GFC_REAL_8 * restrict dest;
1187
1188 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1189 index_type x, y, n, count, xcount, ycount;
1190
1191 assert (GFC_DESCRIPTOR_RANK (a) == 2
1192 || GFC_DESCRIPTOR_RANK (b) == 2);
1193
1194 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1195
1196 Either A or B (but not both) can be rank 1:
1197
1198 o One-dimensional argument A is implicitly treated as a row matrix
1199 dimensioned [1,count], so xcount=1.
1200
1201 o One-dimensional argument B is implicitly treated as a column matrix
1202 dimensioned [count, 1], so ycount=1.
1203 */
1204
1205 if (retarray->base_addr == NULL)
1206 {
1207 if (GFC_DESCRIPTOR_RANK (a) == 1)
1208 {
1209 GFC_DIMENSION_SET(retarray->dim[0], 0,
1210 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1211 }
1212 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1213 {
1214 GFC_DIMENSION_SET(retarray->dim[0], 0,
1215 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1216 }
1217 else
1218 {
1219 GFC_DIMENSION_SET(retarray->dim[0], 0,
1220 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1221
1222 GFC_DIMENSION_SET(retarray->dim[1], 0,
1223 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1224 GFC_DESCRIPTOR_EXTENT(retarray,0));
1225 }
1226
1227 retarray->base_addr
1228 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1229 retarray->offset = 0;
1230 }
1231 else if (unlikely (compile_options.bounds_check))
1232 {
1233 index_type ret_extent, arg_extent;
1234
1235 if (GFC_DESCRIPTOR_RANK (a) == 1)
1236 {
1237 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1238 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1239 if (arg_extent != ret_extent)
1240 runtime_error ("Incorrect extent in return array in"
1241 " MATMUL intrinsic: is %ld, should be %ld",
1242 (long int) ret_extent, (long int) arg_extent);
1243 }
1244 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1245 {
1246 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1247 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1248 if (arg_extent != ret_extent)
1249 runtime_error ("Incorrect extent in return array in"
1250 " MATMUL intrinsic: is %ld, should be %ld",
1251 (long int) ret_extent, (long int) arg_extent);
1252 }
1253 else
1254 {
1255 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1256 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1257 if (arg_extent != ret_extent)
1258 runtime_error ("Incorrect extent in return array in"
1259 " MATMUL intrinsic for dimension 1:"
1260 " is %ld, should be %ld",
1261 (long int) ret_extent, (long int) arg_extent);
1262
1263 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1264 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1265 if (arg_extent != ret_extent)
1266 runtime_error ("Incorrect extent in return array in"
1267 " MATMUL intrinsic for dimension 2:"
1268 " is %ld, should be %ld",
1269 (long int) ret_extent, (long int) arg_extent);
1270 }
1271 }
1272
1273
1274 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1275 {
1276 /* One-dimensional result may be addressed in the code below
1277 either as a row or a column matrix. We want both cases to
1278 work. */
1279 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1280 }
1281 else
1282 {
1283 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1284 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1285 }
1286
1287
1288 if (GFC_DESCRIPTOR_RANK (a) == 1)
1289 {
1290 /* Treat it as a a row matrix A[1,count]. */
1291 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1292 aystride = 1;
1293
1294 xcount = 1;
1295 count = GFC_DESCRIPTOR_EXTENT(a,0);
1296 }
1297 else
1298 {
1299 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1300 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1301
1302 count = GFC_DESCRIPTOR_EXTENT(a,1);
1303 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1304 }
1305
1306 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1307 {
1308 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1309 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1310 }
1311
1312 if (GFC_DESCRIPTOR_RANK (b) == 1)
1313 {
1314 /* Treat it as a column matrix B[count,1] */
1315 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1316
1317 /* bystride should never be used for 1-dimensional b.
1318 in case it is we want it to cause a segfault, rather than
1319 an incorrect result. */
1320 bystride = 0xDEADBEEF;
1321 ycount = 1;
1322 }
1323 else
1324 {
1325 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1326 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1327 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1328 }
1329
1330 abase = a->base_addr;
1331 bbase = b->base_addr;
1332 dest = retarray->base_addr;
1333
1334 /* Now that everything is set up, we perform the multiplication
1335 itself. */
1336
1337 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1338 #define min(a,b) ((a) <= (b) ? (a) : (b))
1339 #define max(a,b) ((a) >= (b) ? (a) : (b))
1340
1341 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1342 && (bxstride == 1 || bystride == 1)
1343 && (((float) xcount) * ((float) ycount) * ((float) count)
1344 > POW3(blas_limit)))
1345 {
1346 const int m = xcount, n = ycount, k = count, ldc = rystride;
1347 const GFC_REAL_8 one = 1, zero = 0;
1348 const int lda = (axstride == 1) ? aystride : axstride,
1349 ldb = (bxstride == 1) ? bystride : bxstride;
1350
1351 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1352 {
1353 assert (gemm != NULL);
1354 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1355 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1356 &ldc, 1, 1);
1357 return;
1358 }
1359 }
1360
1361 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1362 {
1363 /* This block of code implements a tuned matmul, derived from
1364 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1365
1366 Bo Kagstrom and Per Ling
1367 Department of Computing Science
1368 Umea University
1369 S-901 87 Umea, Sweden
1370
1371 from netlib.org, translated to C, and modified for matmul.m4. */
1372
1373 const GFC_REAL_8 *a, *b;
1374 GFC_REAL_8 *c;
1375 const index_type m = xcount, n = ycount, k = count;
1376
1377 /* System generated locals */
1378 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1379 i1, i2, i3, i4, i5, i6;
1380
1381 /* Local variables */
1382 GFC_REAL_8 t1[65536], /* was [256][256] */
1383 f11, f12, f21, f22, f31, f32, f41, f42,
1384 f13, f14, f23, f24, f33, f34, f43, f44;
1385 index_type i, j, l, ii, jj, ll;
1386 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1387
1388 a = abase;
1389 b = bbase;
1390 c = retarray->base_addr;
1391
1392 /* Parameter adjustments */
1393 c_dim1 = rystride;
1394 c_offset = 1 + c_dim1;
1395 c -= c_offset;
1396 a_dim1 = aystride;
1397 a_offset = 1 + a_dim1;
1398 a -= a_offset;
1399 b_dim1 = bystride;
1400 b_offset = 1 + b_dim1;
1401 b -= b_offset;
1402
1403 /* Early exit if possible */
1404 if (m == 0 || n == 0 || k == 0)
1405 return;
1406
1407 /* Empty c first. */
1408 for (j=1; j<=n; j++)
1409 for (i=1; i<=m; i++)
1410 c[i + j * c_dim1] = (GFC_REAL_8)0;
1411
1412 /* Start turning the crank. */
1413 i1 = n;
1414 for (jj = 1; jj <= i1; jj += 512)
1415 {
1416 /* Computing MIN */
1417 i2 = 512;
1418 i3 = n - jj + 1;
1419 jsec = min(i2,i3);
1420 ujsec = jsec - jsec % 4;
1421 i2 = k;
1422 for (ll = 1; ll <= i2; ll += 256)
1423 {
1424 /* Computing MIN */
1425 i3 = 256;
1426 i4 = k - ll + 1;
1427 lsec = min(i3,i4);
1428 ulsec = lsec - lsec % 2;
1429
1430 i3 = m;
1431 for (ii = 1; ii <= i3; ii += 256)
1432 {
1433 /* Computing MIN */
1434 i4 = 256;
1435 i5 = m - ii + 1;
1436 isec = min(i4,i5);
1437 uisec = isec - isec % 2;
1438 i4 = ll + ulsec - 1;
1439 for (l = ll; l <= i4; l += 2)
1440 {
1441 i5 = ii + uisec - 1;
1442 for (i = ii; i <= i5; i += 2)
1443 {
1444 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1445 a[i + l * a_dim1];
1446 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1447 a[i + (l + 1) * a_dim1];
1448 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1449 a[i + 1 + l * a_dim1];
1450 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1451 a[i + 1 + (l + 1) * a_dim1];
1452 }
1453 if (uisec < isec)
1454 {
1455 t1[l - ll + 1 + (isec << 8) - 257] =
1456 a[ii + isec - 1 + l * a_dim1];
1457 t1[l - ll + 2 + (isec << 8) - 257] =
1458 a[ii + isec - 1 + (l + 1) * a_dim1];
1459 }
1460 }
1461 if (ulsec < lsec)
1462 {
1463 i4 = ii + isec - 1;
1464 for (i = ii; i<= i4; ++i)
1465 {
1466 t1[lsec + ((i - ii + 1) << 8) - 257] =
1467 a[i + (ll + lsec - 1) * a_dim1];
1468 }
1469 }
1470
1471 uisec = isec - isec % 4;
1472 i4 = jj + ujsec - 1;
1473 for (j = jj; j <= i4; j += 4)
1474 {
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 4)
1477 {
1478 f11 = c[i + j * c_dim1];
1479 f21 = c[i + 1 + j * c_dim1];
1480 f12 = c[i + (j + 1) * c_dim1];
1481 f22 = c[i + 1 + (j + 1) * c_dim1];
1482 f13 = c[i + (j + 2) * c_dim1];
1483 f23 = c[i + 1 + (j + 2) * c_dim1];
1484 f14 = c[i + (j + 3) * c_dim1];
1485 f24 = c[i + 1 + (j + 3) * c_dim1];
1486 f31 = c[i + 2 + j * c_dim1];
1487 f41 = c[i + 3 + j * c_dim1];
1488 f32 = c[i + 2 + (j + 1) * c_dim1];
1489 f42 = c[i + 3 + (j + 1) * c_dim1];
1490 f33 = c[i + 2 + (j + 2) * c_dim1];
1491 f43 = c[i + 3 + (j + 2) * c_dim1];
1492 f34 = c[i + 2 + (j + 3) * c_dim1];
1493 f44 = c[i + 3 + (j + 3) * c_dim1];
1494 i6 = ll + lsec - 1;
1495 for (l = ll; l <= i6; ++l)
1496 {
1497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1498 * b[l + j * b_dim1];
1499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1500 * b[l + j * b_dim1];
1501 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1502 * b[l + (j + 1) * b_dim1];
1503 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1504 * b[l + (j + 1) * b_dim1];
1505 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1506 * b[l + (j + 2) * b_dim1];
1507 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1508 * b[l + (j + 2) * b_dim1];
1509 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1510 * b[l + (j + 3) * b_dim1];
1511 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1512 * b[l + (j + 3) * b_dim1];
1513 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1514 * b[l + j * b_dim1];
1515 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1516 * b[l + j * b_dim1];
1517 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1518 * b[l + (j + 1) * b_dim1];
1519 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1520 * b[l + (j + 1) * b_dim1];
1521 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1522 * b[l + (j + 2) * b_dim1];
1523 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1524 * b[l + (j + 2) * b_dim1];
1525 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1526 * b[l + (j + 3) * b_dim1];
1527 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1528 * b[l + (j + 3) * b_dim1];
1529 }
1530 c[i + j * c_dim1] = f11;
1531 c[i + 1 + j * c_dim1] = f21;
1532 c[i + (j + 1) * c_dim1] = f12;
1533 c[i + 1 + (j + 1) * c_dim1] = f22;
1534 c[i + (j + 2) * c_dim1] = f13;
1535 c[i + 1 + (j + 2) * c_dim1] = f23;
1536 c[i + (j + 3) * c_dim1] = f14;
1537 c[i + 1 + (j + 3) * c_dim1] = f24;
1538 c[i + 2 + j * c_dim1] = f31;
1539 c[i + 3 + j * c_dim1] = f41;
1540 c[i + 2 + (j + 1) * c_dim1] = f32;
1541 c[i + 3 + (j + 1) * c_dim1] = f42;
1542 c[i + 2 + (j + 2) * c_dim1] = f33;
1543 c[i + 3 + (j + 2) * c_dim1] = f43;
1544 c[i + 2 + (j + 3) * c_dim1] = f34;
1545 c[i + 3 + (j + 3) * c_dim1] = f44;
1546 }
1547 if (uisec < isec)
1548 {
1549 i5 = ii + isec - 1;
1550 for (i = ii + uisec; i <= i5; ++i)
1551 {
1552 f11 = c[i + j * c_dim1];
1553 f12 = c[i + (j + 1) * c_dim1];
1554 f13 = c[i + (j + 2) * c_dim1];
1555 f14 = c[i + (j + 3) * c_dim1];
1556 i6 = ll + lsec - 1;
1557 for (l = ll; l <= i6; ++l)
1558 {
1559 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1560 257] * b[l + j * b_dim1];
1561 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1562 257] * b[l + (j + 1) * b_dim1];
1563 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1564 257] * b[l + (j + 2) * b_dim1];
1565 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1566 257] * b[l + (j + 3) * b_dim1];
1567 }
1568 c[i + j * c_dim1] = f11;
1569 c[i + (j + 1) * c_dim1] = f12;
1570 c[i + (j + 2) * c_dim1] = f13;
1571 c[i + (j + 3) * c_dim1] = f14;
1572 }
1573 }
1574 }
1575 if (ujsec < jsec)
1576 {
1577 i4 = jj + jsec - 1;
1578 for (j = jj + ujsec; j <= i4; ++j)
1579 {
1580 i5 = ii + uisec - 1;
1581 for (i = ii; i <= i5; i += 4)
1582 {
1583 f11 = c[i + j * c_dim1];
1584 f21 = c[i + 1 + j * c_dim1];
1585 f31 = c[i + 2 + j * c_dim1];
1586 f41 = c[i + 3 + j * c_dim1];
1587 i6 = ll + lsec - 1;
1588 for (l = ll; l <= i6; ++l)
1589 {
1590 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1591 257] * b[l + j * b_dim1];
1592 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1593 257] * b[l + j * b_dim1];
1594 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1595 257] * b[l + j * b_dim1];
1596 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1597 257] * b[l + j * b_dim1];
1598 }
1599 c[i + j * c_dim1] = f11;
1600 c[i + 1 + j * c_dim1] = f21;
1601 c[i + 2 + j * c_dim1] = f31;
1602 c[i + 3 + j * c_dim1] = f41;
1603 }
1604 i5 = ii + isec - 1;
1605 for (i = ii + uisec; i <= i5; ++i)
1606 {
1607 f11 = c[i + j * c_dim1];
1608 i6 = ll + lsec - 1;
1609 for (l = ll; l <= i6; ++l)
1610 {
1611 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1612 257] * b[l + j * b_dim1];
1613 }
1614 c[i + j * c_dim1] = f11;
1615 }
1616 }
1617 }
1618 }
1619 }
1620 }
1621 return;
1622 }
1623 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1624 {
1625 if (GFC_DESCRIPTOR_RANK (a) != 1)
1626 {
1627 const GFC_REAL_8 *restrict abase_x;
1628 const GFC_REAL_8 *restrict bbase_y;
1629 GFC_REAL_8 *restrict dest_y;
1630 GFC_REAL_8 s;
1631
1632 for (y = 0; y < ycount; y++)
1633 {
1634 bbase_y = &bbase[y*bystride];
1635 dest_y = &dest[y*rystride];
1636 for (x = 0; x < xcount; x++)
1637 {
1638 abase_x = &abase[x*axstride];
1639 s = (GFC_REAL_8) 0;
1640 for (n = 0; n < count; n++)
1641 s += abase_x[n] * bbase_y[n];
1642 dest_y[x] = s;
1643 }
1644 }
1645 }
1646 else
1647 {
1648 const GFC_REAL_8 *restrict bbase_y;
1649 GFC_REAL_8 s;
1650
1651 for (y = 0; y < ycount; y++)
1652 {
1653 bbase_y = &bbase[y*bystride];
1654 s = (GFC_REAL_8) 0;
1655 for (n = 0; n < count; n++)
1656 s += abase[n*axstride] * bbase_y[n];
1657 dest[y*rystride] = s;
1658 }
1659 }
1660 }
1661 else if (axstride < aystride)
1662 {
1663 for (y = 0; y < ycount; y++)
1664 for (x = 0; x < xcount; x++)
1665 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
1666
1667 for (y = 0; y < ycount; y++)
1668 for (n = 0; n < count; n++)
1669 for (x = 0; x < xcount; x++)
1670 /* dest[x,y] += a[x,n] * b[n,y] */
1671 dest[x*rxstride + y*rystride] +=
1672 abase[x*axstride + n*aystride] *
1673 bbase[n*bxstride + y*bystride];
1674 }
1675 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1676 {
1677 const GFC_REAL_8 *restrict bbase_y;
1678 GFC_REAL_8 s;
1679
1680 for (y = 0; y < ycount; y++)
1681 {
1682 bbase_y = &bbase[y*bystride];
1683 s = (GFC_REAL_8) 0;
1684 for (n = 0; n < count; n++)
1685 s += abase[n*axstride] * bbase_y[n*bxstride];
1686 dest[y*rxstride] = s;
1687 }
1688 }
1689 else
1690 {
1691 const GFC_REAL_8 *restrict abase_x;
1692 const GFC_REAL_8 *restrict bbase_y;
1693 GFC_REAL_8 *restrict dest_y;
1694 GFC_REAL_8 s;
1695
1696 for (y = 0; y < ycount; y++)
1697 {
1698 bbase_y = &bbase[y*bystride];
1699 dest_y = &dest[y*rystride];
1700 for (x = 0; x < xcount; x++)
1701 {
1702 abase_x = &abase[x*axstride];
1703 s = (GFC_REAL_8) 0;
1704 for (n = 0; n < count; n++)
1705 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1706 dest_y[x*rxstride] = s;
1707 }
1708 }
1709 }
1710 }
1711 #undef POW3
1712 #undef min
1713 #undef max
1714
1715 #endif /* HAVE_AVX512F */
1716
1717 /* Function to fall back to if there is no special processor-specific version. */
1718 static void
1719 matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
1720 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
1721 int blas_limit, blas_call gemm)
1722 {
1723 const GFC_REAL_8 * restrict abase;
1724 const GFC_REAL_8 * restrict bbase;
1725 GFC_REAL_8 * restrict dest;
1726
1727 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1728 index_type x, y, n, count, xcount, ycount;
1729
1730 assert (GFC_DESCRIPTOR_RANK (a) == 2
1731 || GFC_DESCRIPTOR_RANK (b) == 2);
1732
1733 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1734
1735 Either A or B (but not both) can be rank 1:
1736
1737 o One-dimensional argument A is implicitly treated as a row matrix
1738 dimensioned [1,count], so xcount=1.
1739
1740 o One-dimensional argument B is implicitly treated as a column matrix
1741 dimensioned [count, 1], so ycount=1.
1742 */
1743
1744 if (retarray->base_addr == NULL)
1745 {
1746 if (GFC_DESCRIPTOR_RANK (a) == 1)
1747 {
1748 GFC_DIMENSION_SET(retarray->dim[0], 0,
1749 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1750 }
1751 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1752 {
1753 GFC_DIMENSION_SET(retarray->dim[0], 0,
1754 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1755 }
1756 else
1757 {
1758 GFC_DIMENSION_SET(retarray->dim[0], 0,
1759 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1760
1761 GFC_DIMENSION_SET(retarray->dim[1], 0,
1762 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1763 GFC_DESCRIPTOR_EXTENT(retarray,0));
1764 }
1765
1766 retarray->base_addr
1767 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
1768 retarray->offset = 0;
1769 }
1770 else if (unlikely (compile_options.bounds_check))
1771 {
1772 index_type ret_extent, arg_extent;
1773
1774 if (GFC_DESCRIPTOR_RANK (a) == 1)
1775 {
1776 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1777 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1778 if (arg_extent != ret_extent)
1779 runtime_error ("Incorrect extent in return array in"
1780 " MATMUL intrinsic: is %ld, should be %ld",
1781 (long int) ret_extent, (long int) arg_extent);
1782 }
1783 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1784 {
1785 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1786 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1787 if (arg_extent != ret_extent)
1788 runtime_error ("Incorrect extent in return array in"
1789 " MATMUL intrinsic: is %ld, should be %ld",
1790 (long int) ret_extent, (long int) arg_extent);
1791 }
1792 else
1793 {
1794 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1795 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1796 if (arg_extent != ret_extent)
1797 runtime_error ("Incorrect extent in return array in"
1798 " MATMUL intrinsic for dimension 1:"
1799 " is %ld, should be %ld",
1800 (long int) ret_extent, (long int) arg_extent);
1801
1802 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1803 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1804 if (arg_extent != ret_extent)
1805 runtime_error ("Incorrect extent in return array in"
1806 " MATMUL intrinsic for dimension 2:"
1807 " is %ld, should be %ld",
1808 (long int) ret_extent, (long int) arg_extent);
1809 }
1810 }
1811
1812
1813 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1814 {
1815 /* One-dimensional result may be addressed in the code below
1816 either as a row or a column matrix. We want both cases to
1817 work. */
1818 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1819 }
1820 else
1821 {
1822 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1823 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1824 }
1825
1826
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1828 {
1829 /* Treat it as a a row matrix A[1,count]. */
1830 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1831 aystride = 1;
1832
1833 xcount = 1;
1834 count = GFC_DESCRIPTOR_EXTENT(a,0);
1835 }
1836 else
1837 {
1838 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1839 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1840
1841 count = GFC_DESCRIPTOR_EXTENT(a,1);
1842 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1843 }
1844
1845 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1846 {
1847 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1848 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1849 }
1850
1851 if (GFC_DESCRIPTOR_RANK (b) == 1)
1852 {
1853 /* Treat it as a column matrix B[count,1] */
1854 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1855
1856 /* bystride should never be used for 1-dimensional b.
1857 in case it is we want it to cause a segfault, rather than
1858 an incorrect result. */
1859 bystride = 0xDEADBEEF;
1860 ycount = 1;
1861 }
1862 else
1863 {
1864 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1865 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1866 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1867 }
1868
1869 abase = a->base_addr;
1870 bbase = b->base_addr;
1871 dest = retarray->base_addr;
1872
1873 /* Now that everything is set up, we perform the multiplication
1874 itself. */
1875
1876 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1877 #define min(a,b) ((a) <= (b) ? (a) : (b))
1878 #define max(a,b) ((a) >= (b) ? (a) : (b))
1879
1880 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1881 && (bxstride == 1 || bystride == 1)
1882 && (((float) xcount) * ((float) ycount) * ((float) count)
1883 > POW3(blas_limit)))
1884 {
1885 const int m = xcount, n = ycount, k = count, ldc = rystride;
1886 const GFC_REAL_8 one = 1, zero = 0;
1887 const int lda = (axstride == 1) ? aystride : axstride,
1888 ldb = (bxstride == 1) ? bystride : bxstride;
1889
1890 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1891 {
1892 assert (gemm != NULL);
1893 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1894 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1895 &ldc, 1, 1);
1896 return;
1897 }
1898 }
1899
1900 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1901 {
1902 /* This block of code implements a tuned matmul, derived from
1903 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1904
1905 Bo Kagstrom and Per Ling
1906 Department of Computing Science
1907 Umea University
1908 S-901 87 Umea, Sweden
1909
1910 from netlib.org, translated to C, and modified for matmul.m4. */
1911
1912 const GFC_REAL_8 *a, *b;
1913 GFC_REAL_8 *c;
1914 const index_type m = xcount, n = ycount, k = count;
1915
1916 /* System generated locals */
1917 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1918 i1, i2, i3, i4, i5, i6;
1919
1920 /* Local variables */
1921 GFC_REAL_8 t1[65536], /* was [256][256] */
1922 f11, f12, f21, f22, f31, f32, f41, f42,
1923 f13, f14, f23, f24, f33, f34, f43, f44;
1924 index_type i, j, l, ii, jj, ll;
1925 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1926
1927 a = abase;
1928 b = bbase;
1929 c = retarray->base_addr;
1930
1931 /* Parameter adjustments */
1932 c_dim1 = rystride;
1933 c_offset = 1 + c_dim1;
1934 c -= c_offset;
1935 a_dim1 = aystride;
1936 a_offset = 1 + a_dim1;
1937 a -= a_offset;
1938 b_dim1 = bystride;
1939 b_offset = 1 + b_dim1;
1940 b -= b_offset;
1941
1942 /* Early exit if possible */
1943 if (m == 0 || n == 0 || k == 0)
1944 return;
1945
1946 /* Empty c first. */
1947 for (j=1; j<=n; j++)
1948 for (i=1; i<=m; i++)
1949 c[i + j * c_dim1] = (GFC_REAL_8)0;
1950
1951 /* Start turning the crank. */
1952 i1 = n;
1953 for (jj = 1; jj <= i1; jj += 512)
1954 {
1955 /* Computing MIN */
1956 i2 = 512;
1957 i3 = n - jj + 1;
1958 jsec = min(i2,i3);
1959 ujsec = jsec - jsec % 4;
1960 i2 = k;
1961 for (ll = 1; ll <= i2; ll += 256)
1962 {
1963 /* Computing MIN */
1964 i3 = 256;
1965 i4 = k - ll + 1;
1966 lsec = min(i3,i4);
1967 ulsec = lsec - lsec % 2;
1968
1969 i3 = m;
1970 for (ii = 1; ii <= i3; ii += 256)
1971 {
1972 /* Computing MIN */
1973 i4 = 256;
1974 i5 = m - ii + 1;
1975 isec = min(i4,i5);
1976 uisec = isec - isec % 2;
1977 i4 = ll + ulsec - 1;
1978 for (l = ll; l <= i4; l += 2)
1979 {
1980 i5 = ii + uisec - 1;
1981 for (i = ii; i <= i5; i += 2)
1982 {
1983 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1984 a[i + l * a_dim1];
1985 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1986 a[i + (l + 1) * a_dim1];
1987 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1988 a[i + 1 + l * a_dim1];
1989 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1990 a[i + 1 + (l + 1) * a_dim1];
1991 }
1992 if (uisec < isec)
1993 {
1994 t1[l - ll + 1 + (isec << 8) - 257] =
1995 a[ii + isec - 1 + l * a_dim1];
1996 t1[l - ll + 2 + (isec << 8) - 257] =
1997 a[ii + isec - 1 + (l + 1) * a_dim1];
1998 }
1999 }
2000 if (ulsec < lsec)
2001 {
2002 i4 = ii + isec - 1;
2003 for (i = ii; i<= i4; ++i)
2004 {
2005 t1[lsec + ((i - ii + 1) << 8) - 257] =
2006 a[i + (ll + lsec - 1) * a_dim1];
2007 }
2008 }
2009
2010 uisec = isec - isec % 4;
2011 i4 = jj + ujsec - 1;
2012 for (j = jj; j <= i4; j += 4)
2013 {
2014 i5 = ii + uisec - 1;
2015 for (i = ii; i <= i5; i += 4)
2016 {
2017 f11 = c[i + j * c_dim1];
2018 f21 = c[i + 1 + j * c_dim1];
2019 f12 = c[i + (j + 1) * c_dim1];
2020 f22 = c[i + 1 + (j + 1) * c_dim1];
2021 f13 = c[i + (j + 2) * c_dim1];
2022 f23 = c[i + 1 + (j + 2) * c_dim1];
2023 f14 = c[i + (j + 3) * c_dim1];
2024 f24 = c[i + 1 + (j + 3) * c_dim1];
2025 f31 = c[i + 2 + j * c_dim1];
2026 f41 = c[i + 3 + j * c_dim1];
2027 f32 = c[i + 2 + (j + 1) * c_dim1];
2028 f42 = c[i + 3 + (j + 1) * c_dim1];
2029 f33 = c[i + 2 + (j + 2) * c_dim1];
2030 f43 = c[i + 3 + (j + 2) * c_dim1];
2031 f34 = c[i + 2 + (j + 3) * c_dim1];
2032 f44 = c[i + 3 + (j + 3) * c_dim1];
2033 i6 = ll + lsec - 1;
2034 for (l = ll; l <= i6; ++l)
2035 {
2036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2037 * b[l + j * b_dim1];
2038 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2039 * b[l + j * b_dim1];
2040 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2041 * b[l + (j + 1) * b_dim1];
2042 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2043 * b[l + (j + 1) * b_dim1];
2044 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2045 * b[l + (j + 2) * b_dim1];
2046 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2047 * b[l + (j + 2) * b_dim1];
2048 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2049 * b[l + (j + 3) * b_dim1];
2050 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2051 * b[l + (j + 3) * b_dim1];
2052 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2053 * b[l + j * b_dim1];
2054 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2055 * b[l + j * b_dim1];
2056 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2057 * b[l + (j + 1) * b_dim1];
2058 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2059 * b[l + (j + 1) * b_dim1];
2060 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2061 * b[l + (j + 2) * b_dim1];
2062 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2063 * b[l + (j + 2) * b_dim1];
2064 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2065 * b[l + (j + 3) * b_dim1];
2066 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2067 * b[l + (j + 3) * b_dim1];
2068 }
2069 c[i + j * c_dim1] = f11;
2070 c[i + 1 + j * c_dim1] = f21;
2071 c[i + (j + 1) * c_dim1] = f12;
2072 c[i + 1 + (j + 1) * c_dim1] = f22;
2073 c[i + (j + 2) * c_dim1] = f13;
2074 c[i + 1 + (j + 2) * c_dim1] = f23;
2075 c[i + (j + 3) * c_dim1] = f14;
2076 c[i + 1 + (j + 3) * c_dim1] = f24;
2077 c[i + 2 + j * c_dim1] = f31;
2078 c[i + 3 + j * c_dim1] = f41;
2079 c[i + 2 + (j + 1) * c_dim1] = f32;
2080 c[i + 3 + (j + 1) * c_dim1] = f42;
2081 c[i + 2 + (j + 2) * c_dim1] = f33;
2082 c[i + 3 + (j + 2) * c_dim1] = f43;
2083 c[i + 2 + (j + 3) * c_dim1] = f34;
2084 c[i + 3 + (j + 3) * c_dim1] = f44;
2085 }
2086 if (uisec < isec)
2087 {
2088 i5 = ii + isec - 1;
2089 for (i = ii + uisec; i <= i5; ++i)
2090 {
2091 f11 = c[i + j * c_dim1];
2092 f12 = c[i + (j + 1) * c_dim1];
2093 f13 = c[i + (j + 2) * c_dim1];
2094 f14 = c[i + (j + 3) * c_dim1];
2095 i6 = ll + lsec - 1;
2096 for (l = ll; l <= i6; ++l)
2097 {
2098 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2099 257] * b[l + j * b_dim1];
2100 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2101 257] * b[l + (j + 1) * b_dim1];
2102 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2103 257] * b[l + (j + 2) * b_dim1];
2104 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2105 257] * b[l + (j + 3) * b_dim1];
2106 }
2107 c[i + j * c_dim1] = f11;
2108 c[i + (j + 1) * c_dim1] = f12;
2109 c[i + (j + 2) * c_dim1] = f13;
2110 c[i + (j + 3) * c_dim1] = f14;
2111 }
2112 }
2113 }
2114 if (ujsec < jsec)
2115 {
2116 i4 = jj + jsec - 1;
2117 for (j = jj + ujsec; j <= i4; ++j)
2118 {
2119 i5 = ii + uisec - 1;
2120 for (i = ii; i <= i5; i += 4)
2121 {
2122 f11 = c[i + j * c_dim1];
2123 f21 = c[i + 1 + j * c_dim1];
2124 f31 = c[i + 2 + j * c_dim1];
2125 f41 = c[i + 3 + j * c_dim1];
2126 i6 = ll + lsec - 1;
2127 for (l = ll; l <= i6; ++l)
2128 {
2129 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2130 257] * b[l + j * b_dim1];
2131 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2132 257] * b[l + j * b_dim1];
2133 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2134 257] * b[l + j * b_dim1];
2135 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2136 257] * b[l + j * b_dim1];
2137 }
2138 c[i + j * c_dim1] = f11;
2139 c[i + 1 + j * c_dim1] = f21;
2140 c[i + 2 + j * c_dim1] = f31;
2141 c[i + 3 + j * c_dim1] = f41;
2142 }
2143 i5 = ii + isec - 1;
2144 for (i = ii + uisec; i <= i5; ++i)
2145 {
2146 f11 = c[i + j * c_dim1];
2147 i6 = ll + lsec - 1;
2148 for (l = ll; l <= i6; ++l)
2149 {
2150 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2151 257] * b[l + j * b_dim1];
2152 }
2153 c[i + j * c_dim1] = f11;
2154 }
2155 }
2156 }
2157 }
2158 }
2159 }
2160 return;
2161 }
2162 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2163 {
2164 if (GFC_DESCRIPTOR_RANK (a) != 1)
2165 {
2166 const GFC_REAL_8 *restrict abase_x;
2167 const GFC_REAL_8 *restrict bbase_y;
2168 GFC_REAL_8 *restrict dest_y;
2169 GFC_REAL_8 s;
2170
2171 for (y = 0; y < ycount; y++)
2172 {
2173 bbase_y = &bbase[y*bystride];
2174 dest_y = &dest[y*rystride];
2175 for (x = 0; x < xcount; x++)
2176 {
2177 abase_x = &abase[x*axstride];
2178 s = (GFC_REAL_8) 0;
2179 for (n = 0; n < count; n++)
2180 s += abase_x[n] * bbase_y[n];
2181 dest_y[x] = s;
2182 }
2183 }
2184 }
2185 else
2186 {
2187 const GFC_REAL_8 *restrict bbase_y;
2188 GFC_REAL_8 s;
2189
2190 for (y = 0; y < ycount; y++)
2191 {
2192 bbase_y = &bbase[y*bystride];
2193 s = (GFC_REAL_8) 0;
2194 for (n = 0; n < count; n++)
2195 s += abase[n*axstride] * bbase_y[n];
2196 dest[y*rystride] = s;
2197 }
2198 }
2199 }
2200 else if (axstride < aystride)
2201 {
2202 for (y = 0; y < ycount; y++)
2203 for (x = 0; x < xcount; x++)
2204 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2205
2206 for (y = 0; y < ycount; y++)
2207 for (n = 0; n < count; n++)
2208 for (x = 0; x < xcount; x++)
2209 /* dest[x,y] += a[x,n] * b[n,y] */
2210 dest[x*rxstride + y*rystride] +=
2211 abase[x*axstride + n*aystride] *
2212 bbase[n*bxstride + y*bystride];
2213 }
2214 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2215 {
2216 const GFC_REAL_8 *restrict bbase_y;
2217 GFC_REAL_8 s;
2218
2219 for (y = 0; y < ycount; y++)
2220 {
2221 bbase_y = &bbase[y*bystride];
2222 s = (GFC_REAL_8) 0;
2223 for (n = 0; n < count; n++)
2224 s += abase[n*axstride] * bbase_y[n*bxstride];
2225 dest[y*rxstride] = s;
2226 }
2227 }
2228 else
2229 {
2230 const GFC_REAL_8 *restrict abase_x;
2231 const GFC_REAL_8 *restrict bbase_y;
2232 GFC_REAL_8 *restrict dest_y;
2233 GFC_REAL_8 s;
2234
2235 for (y = 0; y < ycount; y++)
2236 {
2237 bbase_y = &bbase[y*bystride];
2238 dest_y = &dest[y*rystride];
2239 for (x = 0; x < xcount; x++)
2240 {
2241 abase_x = &abase[x*axstride];
2242 s = (GFC_REAL_8) 0;
2243 for (n = 0; n < count; n++)
2244 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2245 dest_y[x*rxstride] = s;
2246 }
2247 }
2248 }
2249 }
2250 #undef POW3
2251 #undef min
2252 #undef max
2253
2254
2255 /* Compiling main function, with selection code for the processor. */
2256
2257 /* Currently, this is i386 only. Adjust for other architectures. */
2258
2259 #include <config/i386/cpuinfo.h>
2260 void matmul_r8 (gfc_array_r8 * const restrict retarray,
2261 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2262 int blas_limit, blas_call gemm)
2263 {
2264 static void (*matmul_p) (gfc_array_r8 * const restrict retarray,
2265 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2266 int blas_limit, blas_call gemm) = NULL;
2267
2268 if (matmul_p == NULL)
2269 {
2270 matmul_p = matmul_r8_vanilla;
2271 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2272 {
2273 /* Run down the available processors in order of preference. */
2274 #ifdef HAVE_AVX512F
2275 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2276 {
2277 matmul_p = matmul_r8_avx512f;
2278 goto tailcall;
2279 }
2280
2281 #endif /* HAVE_AVX512F */
2282
2283 #ifdef HAVE_AVX2
2284 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2285 {
2286 matmul_p = matmul_r8_avx2;
2287 goto tailcall;
2288 }
2289
2290 #endif
2291
2292 #ifdef HAVE_AVX
2293 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2294 {
2295 matmul_p = matmul_r8_avx;
2296 goto tailcall;
2297 }
2298 #endif /* HAVE_AVX */
2299 }
2300 }
2301
2302 tailcall:
2303 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2304 }
2305
2306 #else /* Just the vanilla function. */
2307
2308 void
2309 matmul_r8 (gfc_array_r8 * const restrict retarray,
2310 gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
2311 int blas_limit, blas_call gemm)
2312 {
2313 const GFC_REAL_8 * restrict abase;
2314 const GFC_REAL_8 * restrict bbase;
2315 GFC_REAL_8 * restrict dest;
2316
2317 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2318 index_type x, y, n, count, xcount, ycount;
2319
2320 assert (GFC_DESCRIPTOR_RANK (a) == 2
2321 || GFC_DESCRIPTOR_RANK (b) == 2);
2322
2323 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2324
2325 Either A or B (but not both) can be rank 1:
2326
2327 o One-dimensional argument A is implicitly treated as a row matrix
2328 dimensioned [1,count], so xcount=1.
2329
2330 o One-dimensional argument B is implicitly treated as a column matrix
2331 dimensioned [count, 1], so ycount=1.
2332 */
2333
2334 if (retarray->base_addr == NULL)
2335 {
2336 if (GFC_DESCRIPTOR_RANK (a) == 1)
2337 {
2338 GFC_DIMENSION_SET(retarray->dim[0], 0,
2339 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2340 }
2341 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2342 {
2343 GFC_DIMENSION_SET(retarray->dim[0], 0,
2344 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2345 }
2346 else
2347 {
2348 GFC_DIMENSION_SET(retarray->dim[0], 0,
2349 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2350
2351 GFC_DIMENSION_SET(retarray->dim[1], 0,
2352 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2353 GFC_DESCRIPTOR_EXTENT(retarray,0));
2354 }
2355
2356 retarray->base_addr
2357 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8));
2358 retarray->offset = 0;
2359 }
2360 else if (unlikely (compile_options.bounds_check))
2361 {
2362 index_type ret_extent, arg_extent;
2363
2364 if (GFC_DESCRIPTOR_RANK (a) == 1)
2365 {
2366 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2367 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2368 if (arg_extent != ret_extent)
2369 runtime_error ("Incorrect extent in return array in"
2370 " MATMUL intrinsic: is %ld, should be %ld",
2371 (long int) ret_extent, (long int) arg_extent);
2372 }
2373 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2374 {
2375 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2376 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2377 if (arg_extent != ret_extent)
2378 runtime_error ("Incorrect extent in return array in"
2379 " MATMUL intrinsic: is %ld, should be %ld",
2380 (long int) ret_extent, (long int) arg_extent);
2381 }
2382 else
2383 {
2384 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2385 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2386 if (arg_extent != ret_extent)
2387 runtime_error ("Incorrect extent in return array in"
2388 " MATMUL intrinsic for dimension 1:"
2389 " is %ld, should be %ld",
2390 (long int) ret_extent, (long int) arg_extent);
2391
2392 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2393 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2394 if (arg_extent != ret_extent)
2395 runtime_error ("Incorrect extent in return array in"
2396 " MATMUL intrinsic for dimension 2:"
2397 " is %ld, should be %ld",
2398 (long int) ret_extent, (long int) arg_extent);
2399 }
2400 }
2401
2402
2403 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2404 {
2405 /* One-dimensional result may be addressed in the code below
2406 either as a row or a column matrix. We want both cases to
2407 work. */
2408 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2409 }
2410 else
2411 {
2412 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2413 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2414 }
2415
2416
2417 if (GFC_DESCRIPTOR_RANK (a) == 1)
2418 {
2419 /* Treat it as a a row matrix A[1,count]. */
2420 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2421 aystride = 1;
2422
2423 xcount = 1;
2424 count = GFC_DESCRIPTOR_EXTENT(a,0);
2425 }
2426 else
2427 {
2428 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2429 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2430
2431 count = GFC_DESCRIPTOR_EXTENT(a,1);
2432 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2433 }
2434
2435 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2436 {
2437 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2438 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2439 }
2440
2441 if (GFC_DESCRIPTOR_RANK (b) == 1)
2442 {
2443 /* Treat it as a column matrix B[count,1] */
2444 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2445
2446 /* bystride should never be used for 1-dimensional b.
2447 in case it is we want it to cause a segfault, rather than
2448 an incorrect result. */
2449 bystride = 0xDEADBEEF;
2450 ycount = 1;
2451 }
2452 else
2453 {
2454 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2455 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2456 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2457 }
2458
2459 abase = a->base_addr;
2460 bbase = b->base_addr;
2461 dest = retarray->base_addr;
2462
2463 /* Now that everything is set up, we perform the multiplication
2464 itself. */
2465
2466 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2467 #define min(a,b) ((a) <= (b) ? (a) : (b))
2468 #define max(a,b) ((a) >= (b) ? (a) : (b))
2469
2470 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2471 && (bxstride == 1 || bystride == 1)
2472 && (((float) xcount) * ((float) ycount) * ((float) count)
2473 > POW3(blas_limit)))
2474 {
2475 const int m = xcount, n = ycount, k = count, ldc = rystride;
2476 const GFC_REAL_8 one = 1, zero = 0;
2477 const int lda = (axstride == 1) ? aystride : axstride,
2478 ldb = (bxstride == 1) ? bystride : bxstride;
2479
2480 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2481 {
2482 assert (gemm != NULL);
2483 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2484 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2485 &ldc, 1, 1);
2486 return;
2487 }
2488 }
2489
2490 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2491 {
2492 /* This block of code implements a tuned matmul, derived from
2493 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2494
2495 Bo Kagstrom and Per Ling
2496 Department of Computing Science
2497 Umea University
2498 S-901 87 Umea, Sweden
2499
2500 from netlib.org, translated to C, and modified for matmul.m4. */
2501
2502 const GFC_REAL_8 *a, *b;
2503 GFC_REAL_8 *c;
2504 const index_type m = xcount, n = ycount, k = count;
2505
2506 /* System generated locals */
2507 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2508 i1, i2, i3, i4, i5, i6;
2509
2510 /* Local variables */
2511 GFC_REAL_8 t1[65536], /* was [256][256] */
2512 f11, f12, f21, f22, f31, f32, f41, f42,
2513 f13, f14, f23, f24, f33, f34, f43, f44;
2514 index_type i, j, l, ii, jj, ll;
2515 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2516
2517 a = abase;
2518 b = bbase;
2519 c = retarray->base_addr;
2520
2521 /* Parameter adjustments */
2522 c_dim1 = rystride;
2523 c_offset = 1 + c_dim1;
2524 c -= c_offset;
2525 a_dim1 = aystride;
2526 a_offset = 1 + a_dim1;
2527 a -= a_offset;
2528 b_dim1 = bystride;
2529 b_offset = 1 + b_dim1;
2530 b -= b_offset;
2531
2532 /* Early exit if possible */
2533 if (m == 0 || n == 0 || k == 0)
2534 return;
2535
2536 /* Empty c first. */
2537 for (j=1; j<=n; j++)
2538 for (i=1; i<=m; i++)
2539 c[i + j * c_dim1] = (GFC_REAL_8)0;
2540
2541 /* Start turning the crank. */
2542 i1 = n;
2543 for (jj = 1; jj <= i1; jj += 512)
2544 {
2545 /* Computing MIN */
2546 i2 = 512;
2547 i3 = n - jj + 1;
2548 jsec = min(i2,i3);
2549 ujsec = jsec - jsec % 4;
2550 i2 = k;
2551 for (ll = 1; ll <= i2; ll += 256)
2552 {
2553 /* Computing MIN */
2554 i3 = 256;
2555 i4 = k - ll + 1;
2556 lsec = min(i3,i4);
2557 ulsec = lsec - lsec % 2;
2558
2559 i3 = m;
2560 for (ii = 1; ii <= i3; ii += 256)
2561 {
2562 /* Computing MIN */
2563 i4 = 256;
2564 i5 = m - ii + 1;
2565 isec = min(i4,i5);
2566 uisec = isec - isec % 2;
2567 i4 = ll + ulsec - 1;
2568 for (l = ll; l <= i4; l += 2)
2569 {
2570 i5 = ii + uisec - 1;
2571 for (i = ii; i <= i5; i += 2)
2572 {
2573 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2574 a[i + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2576 a[i + (l + 1) * a_dim1];
2577 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2578 a[i + 1 + l * a_dim1];
2579 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2580 a[i + 1 + (l + 1) * a_dim1];
2581 }
2582 if (uisec < isec)
2583 {
2584 t1[l - ll + 1 + (isec << 8) - 257] =
2585 a[ii + isec - 1 + l * a_dim1];
2586 t1[l - ll + 2 + (isec << 8) - 257] =
2587 a[ii + isec - 1 + (l + 1) * a_dim1];
2588 }
2589 }
2590 if (ulsec < lsec)
2591 {
2592 i4 = ii + isec - 1;
2593 for (i = ii; i<= i4; ++i)
2594 {
2595 t1[lsec + ((i - ii + 1) << 8) - 257] =
2596 a[i + (ll + lsec - 1) * a_dim1];
2597 }
2598 }
2599
2600 uisec = isec - isec % 4;
2601 i4 = jj + ujsec - 1;
2602 for (j = jj; j <= i4; j += 4)
2603 {
2604 i5 = ii + uisec - 1;
2605 for (i = ii; i <= i5; i += 4)
2606 {
2607 f11 = c[i + j * c_dim1];
2608 f21 = c[i + 1 + j * c_dim1];
2609 f12 = c[i + (j + 1) * c_dim1];
2610 f22 = c[i + 1 + (j + 1) * c_dim1];
2611 f13 = c[i + (j + 2) * c_dim1];
2612 f23 = c[i + 1 + (j + 2) * c_dim1];
2613 f14 = c[i + (j + 3) * c_dim1];
2614 f24 = c[i + 1 + (j + 3) * c_dim1];
2615 f31 = c[i + 2 + j * c_dim1];
2616 f41 = c[i + 3 + j * c_dim1];
2617 f32 = c[i + 2 + (j + 1) * c_dim1];
2618 f42 = c[i + 3 + (j + 1) * c_dim1];
2619 f33 = c[i + 2 + (j + 2) * c_dim1];
2620 f43 = c[i + 3 + (j + 2) * c_dim1];
2621 f34 = c[i + 2 + (j + 3) * c_dim1];
2622 f44 = c[i + 3 + (j + 3) * c_dim1];
2623 i6 = ll + lsec - 1;
2624 for (l = ll; l <= i6; ++l)
2625 {
2626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + j * b_dim1];
2628 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + j * b_dim1];
2630 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 1) * b_dim1];
2632 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 1) * b_dim1];
2634 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 2) * b_dim1];
2636 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 2) * b_dim1];
2638 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2639 * b[l + (j + 3) * b_dim1];
2640 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2641 * b[l + (j + 3) * b_dim1];
2642 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + j * b_dim1];
2644 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + j * b_dim1];
2646 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 1) * b_dim1];
2648 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 1) * b_dim1];
2650 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 2) * b_dim1];
2652 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 2) * b_dim1];
2654 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2655 * b[l + (j + 3) * b_dim1];
2656 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2657 * b[l + (j + 3) * b_dim1];
2658 }
2659 c[i + j * c_dim1] = f11;
2660 c[i + 1 + j * c_dim1] = f21;
2661 c[i + (j + 1) * c_dim1] = f12;
2662 c[i + 1 + (j + 1) * c_dim1] = f22;
2663 c[i + (j + 2) * c_dim1] = f13;
2664 c[i + 1 + (j + 2) * c_dim1] = f23;
2665 c[i + (j + 3) * c_dim1] = f14;
2666 c[i + 1 + (j + 3) * c_dim1] = f24;
2667 c[i + 2 + j * c_dim1] = f31;
2668 c[i + 3 + j * c_dim1] = f41;
2669 c[i + 2 + (j + 1) * c_dim1] = f32;
2670 c[i + 3 + (j + 1) * c_dim1] = f42;
2671 c[i + 2 + (j + 2) * c_dim1] = f33;
2672 c[i + 3 + (j + 2) * c_dim1] = f43;
2673 c[i + 2 + (j + 3) * c_dim1] = f34;
2674 c[i + 3 + (j + 3) * c_dim1] = f44;
2675 }
2676 if (uisec < isec)
2677 {
2678 i5 = ii + isec - 1;
2679 for (i = ii + uisec; i <= i5; ++i)
2680 {
2681 f11 = c[i + j * c_dim1];
2682 f12 = c[i + (j + 1) * c_dim1];
2683 f13 = c[i + (j + 2) * c_dim1];
2684 f14 = c[i + (j + 3) * c_dim1];
2685 i6 = ll + lsec - 1;
2686 for (l = ll; l <= i6; ++l)
2687 {
2688 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + j * b_dim1];
2690 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 1) * b_dim1];
2692 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2693 257] * b[l + (j + 2) * b_dim1];
2694 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2695 257] * b[l + (j + 3) * b_dim1];
2696 }
2697 c[i + j * c_dim1] = f11;
2698 c[i + (j + 1) * c_dim1] = f12;
2699 c[i + (j + 2) * c_dim1] = f13;
2700 c[i + (j + 3) * c_dim1] = f14;
2701 }
2702 }
2703 }
2704 if (ujsec < jsec)
2705 {
2706 i4 = jj + jsec - 1;
2707 for (j = jj + ujsec; j <= i4; ++j)
2708 {
2709 i5 = ii + uisec - 1;
2710 for (i = ii; i <= i5; i += 4)
2711 {
2712 f11 = c[i + j * c_dim1];
2713 f21 = c[i + 1 + j * c_dim1];
2714 f31 = c[i + 2 + j * c_dim1];
2715 f41 = c[i + 3 + j * c_dim1];
2716 i6 = ll + lsec - 1;
2717 for (l = ll; l <= i6; ++l)
2718 {
2719 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2722 257] * b[l + j * b_dim1];
2723 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2724 257] * b[l + j * b_dim1];
2725 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2726 257] * b[l + j * b_dim1];
2727 }
2728 c[i + j * c_dim1] = f11;
2729 c[i + 1 + j * c_dim1] = f21;
2730 c[i + 2 + j * c_dim1] = f31;
2731 c[i + 3 + j * c_dim1] = f41;
2732 }
2733 i5 = ii + isec - 1;
2734 for (i = ii + uisec; i <= i5; ++i)
2735 {
2736 f11 = c[i + j * c_dim1];
2737 i6 = ll + lsec - 1;
2738 for (l = ll; l <= i6; ++l)
2739 {
2740 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741 257] * b[l + j * b_dim1];
2742 }
2743 c[i + j * c_dim1] = f11;
2744 }
2745 }
2746 }
2747 }
2748 }
2749 }
2750 return;
2751 }
2752 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2753 {
2754 if (GFC_DESCRIPTOR_RANK (a) != 1)
2755 {
2756 const GFC_REAL_8 *restrict abase_x;
2757 const GFC_REAL_8 *restrict bbase_y;
2758 GFC_REAL_8 *restrict dest_y;
2759 GFC_REAL_8 s;
2760
2761 for (y = 0; y < ycount; y++)
2762 {
2763 bbase_y = &bbase[y*bystride];
2764 dest_y = &dest[y*rystride];
2765 for (x = 0; x < xcount; x++)
2766 {
2767 abase_x = &abase[x*axstride];
2768 s = (GFC_REAL_8) 0;
2769 for (n = 0; n < count; n++)
2770 s += abase_x[n] * bbase_y[n];
2771 dest_y[x] = s;
2772 }
2773 }
2774 }
2775 else
2776 {
2777 const GFC_REAL_8 *restrict bbase_y;
2778 GFC_REAL_8 s;
2779
2780 for (y = 0; y < ycount; y++)
2781 {
2782 bbase_y = &bbase[y*bystride];
2783 s = (GFC_REAL_8) 0;
2784 for (n = 0; n < count; n++)
2785 s += abase[n*axstride] * bbase_y[n];
2786 dest[y*rystride] = s;
2787 }
2788 }
2789 }
2790 else if (axstride < aystride)
2791 {
2792 for (y = 0; y < ycount; y++)
2793 for (x = 0; x < xcount; x++)
2794 dest[x*rxstride + y*rystride] = (GFC_REAL_8)0;
2795
2796 for (y = 0; y < ycount; y++)
2797 for (n = 0; n < count; n++)
2798 for (x = 0; x < xcount; x++)
2799 /* dest[x,y] += a[x,n] * b[n,y] */
2800 dest[x*rxstride + y*rystride] +=
2801 abase[x*axstride + n*aystride] *
2802 bbase[n*bxstride + y*bystride];
2803 }
2804 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2805 {
2806 const GFC_REAL_8 *restrict bbase_y;
2807 GFC_REAL_8 s;
2808
2809 for (y = 0; y < ycount; y++)
2810 {
2811 bbase_y = &bbase[y*bystride];
2812 s = (GFC_REAL_8) 0;
2813 for (n = 0; n < count; n++)
2814 s += abase[n*axstride] * bbase_y[n*bxstride];
2815 dest[y*rxstride] = s;
2816 }
2817 }
2818 else
2819 {
2820 const GFC_REAL_8 *restrict abase_x;
2821 const GFC_REAL_8 *restrict bbase_y;
2822 GFC_REAL_8 *restrict dest_y;
2823 GFC_REAL_8 s;
2824
2825 for (y = 0; y < ycount; y++)
2826 {
2827 bbase_y = &bbase[y*bystride];
2828 dest_y = &dest[y*rystride];
2829 for (x = 0; x < xcount; x++)
2830 {
2831 abase_x = &abase[x*axstride];
2832 s = (GFC_REAL_8) 0;
2833 for (n = 0; n < count; n++)
2834 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2835 dest_y[x*rxstride] = s;
2836 }
2837 }
2838 }
2839 }
2840 #undef POW3
2841 #undef min
2842 #undef max
2843
2844 #endif
2845 #endif
2846