]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmul_r10.c
re PR libfortran/78379 (Processor-specific versions for matmul)
[thirdparty/gcc.git] / libgfortran / generated / matmul_r10.c
CommitLineData
644cb69f 1/* Implementation of the MATMUL intrinsic
cbe34bb5 2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
644cb69f
FXC
3 Contributed by Paul Brook <paul@nowt.org>
4
21d1335b 5This file is part of the GNU Fortran runtime library (libgfortran).
644cb69f
FXC
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
748086b7 10version 3 of the License, or (at your option) any later version.
644cb69f
FXC
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
748086b7
JJ
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
644cb69f 25
36ae8a61 26#include "libgfortran.h"
644cb69f
FXC
27#include <string.h>
28#include <assert.h>
36ae8a61 29
644cb69f
FXC
30
31#if defined (HAVE_GFC_REAL_10)
32
5a0aad31 33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07 34 passed to us by the front-end, in which case we call it for large
5a0aad31
FXC
35 matrices. */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
39 const int *, const GFC_REAL_10 *, const int *,
40 const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
41 int, int);
42
1524f80b
RS
43/* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
644cb69f
FXC
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
1524f80b
RS
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
644cb69f 60 DO I=1,M
1524f80b
RS
61 S = 0
62 DO K=1,COUNT
5a0aad31 63 S = S+A(I,K)*B(K,J)
1524f80b
RS
64 C(I,J) = S
65 ENDIF
644cb69f
FXC
66*/
67
5a0aad31
FXC
68/* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
85206901 72extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
5a0aad31
FXC
73 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
644cb69f
FXC
75export_proto(matmul_r10);
76
31cfd832
TK
77/* Put exhaustive list of possible architectures here here, ORed together. */
78
79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81#ifdef HAVE_AVX
82static void
83matmul_r10_avx (gfc_array_r10 * const restrict retarray,
84 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86static void
87matmul_r10_avx (gfc_array_r10 * const restrict retarray,
88 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90{
91 const GFC_REAL_10 * restrict abase;
92 const GFC_REAL_10 * restrict bbase;
93 GFC_REAL_10 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110*/
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245#define min(a,b) ((a) <= (b) ? (a) : (b))
246#define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_10 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_REAL_10 *a, *b;
281 GFC_REAL_10 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_REAL_10 t1[65536], /* was [256][256] */
290 f11, f12, f21, f22, f31, f32, f41, f42,
291 f13, f14, f23, f24, f33, f34, f43, f44;
292 index_type i, j, l, ii, jj, ll;
293 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
310 /* Early exit if possible */
311 if (m == 0 || n == 0 || k == 0)
312 return;
313
314 /* Empty c first. */
315 for (j=1; j<=n; j++)
316 for (i=1; i<=m; i++)
317 c[i + j * c_dim1] = (GFC_REAL_10)0;
318
319 /* Start turning the crank. */
320 i1 = n;
321 for (jj = 1; jj <= i1; jj += 512)
322 {
323 /* Computing MIN */
324 i2 = 512;
325 i3 = n - jj + 1;
326 jsec = min(i2,i3);
327 ujsec = jsec - jsec % 4;
328 i2 = k;
329 for (ll = 1; ll <= i2; ll += 256)
330 {
331 /* Computing MIN */
332 i3 = 256;
333 i4 = k - ll + 1;
334 lsec = min(i3,i4);
335 ulsec = lsec - lsec % 2;
336
337 i3 = m;
338 for (ii = 1; ii <= i3; ii += 256)
339 {
340 /* Computing MIN */
341 i4 = 256;
342 i5 = m - ii + 1;
343 isec = min(i4,i5);
344 uisec = isec - isec % 2;
345 i4 = ll + ulsec - 1;
346 for (l = ll; l <= i4; l += 2)
347 {
348 i5 = ii + uisec - 1;
349 for (i = ii; i <= i5; i += 2)
350 {
351 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
352 a[i + l * a_dim1];
353 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
354 a[i + (l + 1) * a_dim1];
355 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
356 a[i + 1 + l * a_dim1];
357 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
358 a[i + 1 + (l + 1) * a_dim1];
359 }
360 if (uisec < isec)
361 {
362 t1[l - ll + 1 + (isec << 8) - 257] =
363 a[ii + isec - 1 + l * a_dim1];
364 t1[l - ll + 2 + (isec << 8) - 257] =
365 a[ii + isec - 1 + (l + 1) * a_dim1];
366 }
367 }
368 if (ulsec < lsec)
369 {
370 i4 = ii + isec - 1;
371 for (i = ii; i<= i4; ++i)
372 {
373 t1[lsec + ((i - ii + 1) << 8) - 257] =
374 a[i + (ll + lsec - 1) * a_dim1];
375 }
376 }
377
378 uisec = isec - isec % 4;
379 i4 = jj + ujsec - 1;
380 for (j = jj; j <= i4; j += 4)
381 {
382 i5 = ii + uisec - 1;
383 for (i = ii; i <= i5; i += 4)
384 {
385 f11 = c[i + j * c_dim1];
386 f21 = c[i + 1 + j * c_dim1];
387 f12 = c[i + (j + 1) * c_dim1];
388 f22 = c[i + 1 + (j + 1) * c_dim1];
389 f13 = c[i + (j + 2) * c_dim1];
390 f23 = c[i + 1 + (j + 2) * c_dim1];
391 f14 = c[i + (j + 3) * c_dim1];
392 f24 = c[i + 1 + (j + 3) * c_dim1];
393 f31 = c[i + 2 + j * c_dim1];
394 f41 = c[i + 3 + j * c_dim1];
395 f32 = c[i + 2 + (j + 1) * c_dim1];
396 f42 = c[i + 3 + (j + 1) * c_dim1];
397 f33 = c[i + 2 + (j + 2) * c_dim1];
398 f43 = c[i + 3 + (j + 2) * c_dim1];
399 f34 = c[i + 2 + (j + 3) * c_dim1];
400 f44 = c[i + 3 + (j + 3) * c_dim1];
401 i6 = ll + lsec - 1;
402 for (l = ll; l <= i6; ++l)
403 {
404 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405 * b[l + j * b_dim1];
406 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407 * b[l + j * b_dim1];
408 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409 * b[l + (j + 1) * b_dim1];
410 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411 * b[l + (j + 1) * b_dim1];
412 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + (j + 2) * b_dim1];
414 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + (j + 2) * b_dim1];
416 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 3) * b_dim1];
418 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 3) * b_dim1];
420 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421 * b[l + j * b_dim1];
422 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423 * b[l + j * b_dim1];
424 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425 * b[l + (j + 1) * b_dim1];
426 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427 * b[l + (j + 1) * b_dim1];
428 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + (j + 2) * b_dim1];
430 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + (j + 2) * b_dim1];
432 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 3) * b_dim1];
434 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 3) * b_dim1];
436 }
437 c[i + j * c_dim1] = f11;
438 c[i + 1 + j * c_dim1] = f21;
439 c[i + (j + 1) * c_dim1] = f12;
440 c[i + 1 + (j + 1) * c_dim1] = f22;
441 c[i + (j + 2) * c_dim1] = f13;
442 c[i + 1 + (j + 2) * c_dim1] = f23;
443 c[i + (j + 3) * c_dim1] = f14;
444 c[i + 1 + (j + 3) * c_dim1] = f24;
445 c[i + 2 + j * c_dim1] = f31;
446 c[i + 3 + j * c_dim1] = f41;
447 c[i + 2 + (j + 1) * c_dim1] = f32;
448 c[i + 3 + (j + 1) * c_dim1] = f42;
449 c[i + 2 + (j + 2) * c_dim1] = f33;
450 c[i + 3 + (j + 2) * c_dim1] = f43;
451 c[i + 2 + (j + 3) * c_dim1] = f34;
452 c[i + 3 + (j + 3) * c_dim1] = f44;
453 }
454 if (uisec < isec)
455 {
456 i5 = ii + isec - 1;
457 for (i = ii + uisec; i <= i5; ++i)
458 {
459 f11 = c[i + j * c_dim1];
460 f12 = c[i + (j + 1) * c_dim1];
461 f13 = c[i + (j + 2) * c_dim1];
462 f14 = c[i + (j + 3) * c_dim1];
463 i6 = ll + lsec - 1;
464 for (l = ll; l <= i6; ++l)
465 {
466 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
467 257] * b[l + j * b_dim1];
468 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
469 257] * b[l + (j + 1) * b_dim1];
470 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
471 257] * b[l + (j + 2) * b_dim1];
472 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
473 257] * b[l + (j + 3) * b_dim1];
474 }
475 c[i + j * c_dim1] = f11;
476 c[i + (j + 1) * c_dim1] = f12;
477 c[i + (j + 2) * c_dim1] = f13;
478 c[i + (j + 3) * c_dim1] = f14;
479 }
480 }
481 }
482 if (ujsec < jsec)
483 {
484 i4 = jj + jsec - 1;
485 for (j = jj + ujsec; j <= i4; ++j)
486 {
487 i5 = ii + uisec - 1;
488 for (i = ii; i <= i5; i += 4)
489 {
490 f11 = c[i + j * c_dim1];
491 f21 = c[i + 1 + j * c_dim1];
492 f31 = c[i + 2 + j * c_dim1];
493 f41 = c[i + 3 + j * c_dim1];
494 i6 = ll + lsec - 1;
495 for (l = ll; l <= i6; ++l)
496 {
497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 257] * b[l + j * b_dim1];
499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
500 257] * b[l + j * b_dim1];
501 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
502 257] * b[l + j * b_dim1];
503 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
504 257] * b[l + j * b_dim1];
505 }
506 c[i + j * c_dim1] = f11;
507 c[i + 1 + j * c_dim1] = f21;
508 c[i + 2 + j * c_dim1] = f31;
509 c[i + 3 + j * c_dim1] = f41;
510 }
511 i5 = ii + isec - 1;
512 for (i = ii + uisec; i <= i5; ++i)
513 {
514 f11 = c[i + j * c_dim1];
515 i6 = ll + lsec - 1;
516 for (l = ll; l <= i6; ++l)
517 {
518 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
519 257] * b[l + j * b_dim1];
520 }
521 c[i + j * c_dim1] = f11;
522 }
523 }
524 }
525 }
526 }
527 }
528 return;
529 }
530 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
531 {
532 if (GFC_DESCRIPTOR_RANK (a) != 1)
533 {
534 const GFC_REAL_10 *restrict abase_x;
535 const GFC_REAL_10 *restrict bbase_y;
536 GFC_REAL_10 *restrict dest_y;
537 GFC_REAL_10 s;
538
539 for (y = 0; y < ycount; y++)
540 {
541 bbase_y = &bbase[y*bystride];
542 dest_y = &dest[y*rystride];
543 for (x = 0; x < xcount; x++)
544 {
545 abase_x = &abase[x*axstride];
546 s = (GFC_REAL_10) 0;
547 for (n = 0; n < count; n++)
548 s += abase_x[n] * bbase_y[n];
549 dest_y[x] = s;
550 }
551 }
552 }
553 else
554 {
555 const GFC_REAL_10 *restrict bbase_y;
556 GFC_REAL_10 s;
557
558 for (y = 0; y < ycount; y++)
559 {
560 bbase_y = &bbase[y*bystride];
561 s = (GFC_REAL_10) 0;
562 for (n = 0; n < count; n++)
563 s += abase[n*axstride] * bbase_y[n];
564 dest[y*rystride] = s;
565 }
566 }
567 }
568 else if (axstride < aystride)
569 {
570 for (y = 0; y < ycount; y++)
571 for (x = 0; x < xcount; x++)
572 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
573
574 for (y = 0; y < ycount; y++)
575 for (n = 0; n < count; n++)
576 for (x = 0; x < xcount; x++)
577 /* dest[x,y] += a[x,n] * b[n,y] */
578 dest[x*rxstride + y*rystride] +=
579 abase[x*axstride + n*aystride] *
580 bbase[n*bxstride + y*bystride];
581 }
582 else if (GFC_DESCRIPTOR_RANK (a) == 1)
583 {
584 const GFC_REAL_10 *restrict bbase_y;
585 GFC_REAL_10 s;
586
587 for (y = 0; y < ycount; y++)
588 {
589 bbase_y = &bbase[y*bystride];
590 s = (GFC_REAL_10) 0;
591 for (n = 0; n < count; n++)
592 s += abase[n*axstride] * bbase_y[n*bxstride];
593 dest[y*rxstride] = s;
594 }
595 }
596 else
597 {
598 const GFC_REAL_10 *restrict abase_x;
599 const GFC_REAL_10 *restrict bbase_y;
600 GFC_REAL_10 *restrict dest_y;
601 GFC_REAL_10 s;
602
603 for (y = 0; y < ycount; y++)
604 {
605 bbase_y = &bbase[y*bystride];
606 dest_y = &dest[y*rystride];
607 for (x = 0; x < xcount; x++)
608 {
609 abase_x = &abase[x*axstride];
610 s = (GFC_REAL_10) 0;
611 for (n = 0; n < count; n++)
612 s += abase_x[n*aystride] * bbase_y[n*bxstride];
613 dest_y[x*rxstride] = s;
614 }
615 }
616 }
617}
618#undef POW3
619#undef min
620#undef max
621
622#endif /* HAVE_AVX */
623
624#ifdef HAVE_AVX2
625static void
626matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
627 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
6d03bdcc 628 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
31cfd832
TK
629static void
630matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
631 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
632 int blas_limit, blas_call gemm)
633{
634 const GFC_REAL_10 * restrict abase;
635 const GFC_REAL_10 * restrict bbase;
636 GFC_REAL_10 * restrict dest;
637
638 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
639 index_type x, y, n, count, xcount, ycount;
640
641 assert (GFC_DESCRIPTOR_RANK (a) == 2
642 || GFC_DESCRIPTOR_RANK (b) == 2);
643
644/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
645
646 Either A or B (but not both) can be rank 1:
647
648 o One-dimensional argument A is implicitly treated as a row matrix
649 dimensioned [1,count], so xcount=1.
650
651 o One-dimensional argument B is implicitly treated as a column matrix
652 dimensioned [count, 1], so ycount=1.
653*/
654
655 if (retarray->base_addr == NULL)
656 {
657 if (GFC_DESCRIPTOR_RANK (a) == 1)
658 {
659 GFC_DIMENSION_SET(retarray->dim[0], 0,
660 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
661 }
662 else if (GFC_DESCRIPTOR_RANK (b) == 1)
663 {
664 GFC_DIMENSION_SET(retarray->dim[0], 0,
665 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
666 }
667 else
668 {
669 GFC_DIMENSION_SET(retarray->dim[0], 0,
670 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
671
672 GFC_DIMENSION_SET(retarray->dim[1], 0,
673 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
674 GFC_DESCRIPTOR_EXTENT(retarray,0));
675 }
676
677 retarray->base_addr
678 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
679 retarray->offset = 0;
680 }
681 else if (unlikely (compile_options.bounds_check))
682 {
683 index_type ret_extent, arg_extent;
684
685 if (GFC_DESCRIPTOR_RANK (a) == 1)
686 {
687 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
690 runtime_error ("Incorrect extent in return array in"
691 " MATMUL intrinsic: is %ld, should be %ld",
692 (long int) ret_extent, (long int) arg_extent);
693 }
694 else if (GFC_DESCRIPTOR_RANK (b) == 1)
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
702 }
703 else
704 {
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic for dimension 1:"
710 " is %ld, should be %ld",
711 (long int) ret_extent, (long int) arg_extent);
712
713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
715 if (arg_extent != ret_extent)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 2:"
718 " is %ld, should be %ld",
719 (long int) ret_extent, (long int) arg_extent);
720 }
721 }
722
723
724 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
725 {
726 /* One-dimensional result may be addressed in the code below
727 either as a row or a column matrix. We want both cases to
728 work. */
729 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
730 }
731 else
732 {
733 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
734 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
735 }
736
737
738 if (GFC_DESCRIPTOR_RANK (a) == 1)
739 {
740 /* Treat it as a a row matrix A[1,count]. */
741 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
742 aystride = 1;
743
744 xcount = 1;
745 count = GFC_DESCRIPTOR_EXTENT(a,0);
746 }
747 else
748 {
749 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
750 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
751
752 count = GFC_DESCRIPTOR_EXTENT(a,1);
753 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
754 }
755
756 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
757 {
758 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
759 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
760 }
761
762 if (GFC_DESCRIPTOR_RANK (b) == 1)
763 {
764 /* Treat it as a column matrix B[count,1] */
765 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
766
767 /* bystride should never be used for 1-dimensional b.
768 in case it is we want it to cause a segfault, rather than
769 an incorrect result. */
770 bystride = 0xDEADBEEF;
771 ycount = 1;
772 }
773 else
774 {
775 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
777 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
778 }
779
780 abase = a->base_addr;
781 bbase = b->base_addr;
782 dest = retarray->base_addr;
783
784 /* Now that everything is set up, we perform the multiplication
785 itself. */
786
787#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
788#define min(a,b) ((a) <= (b) ? (a) : (b))
789#define max(a,b) ((a) >= (b) ? (a) : (b))
790
791 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
792 && (bxstride == 1 || bystride == 1)
793 && (((float) xcount) * ((float) ycount) * ((float) count)
794 > POW3(blas_limit)))
795 {
796 const int m = xcount, n = ycount, k = count, ldc = rystride;
797 const GFC_REAL_10 one = 1, zero = 0;
798 const int lda = (axstride == 1) ? aystride : axstride,
799 ldb = (bxstride == 1) ? bystride : bxstride;
800
801 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
802 {
803 assert (gemm != NULL);
804 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
805 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
806 &ldc, 1, 1);
807 return;
808 }
809 }
810
811 if (rxstride == 1 && axstride == 1 && bxstride == 1)
812 {
813 /* This block of code implements a tuned matmul, derived from
814 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
815
816 Bo Kagstrom and Per Ling
817 Department of Computing Science
818 Umea University
819 S-901 87 Umea, Sweden
820
821 from netlib.org, translated to C, and modified for matmul.m4. */
822
823 const GFC_REAL_10 *a, *b;
824 GFC_REAL_10 *c;
825 const index_type m = xcount, n = ycount, k = count;
826
827 /* System generated locals */
828 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
829 i1, i2, i3, i4, i5, i6;
830
831 /* Local variables */
832 GFC_REAL_10 t1[65536], /* was [256][256] */
833 f11, f12, f21, f22, f31, f32, f41, f42,
834 f13, f14, f23, f24, f33, f34, f43, f44;
835 index_type i, j, l, ii, jj, ll;
836 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
837
838 a = abase;
839 b = bbase;
840 c = retarray->base_addr;
841
842 /* Parameter adjustments */
843 c_dim1 = rystride;
844 c_offset = 1 + c_dim1;
845 c -= c_offset;
846 a_dim1 = aystride;
847 a_offset = 1 + a_dim1;
848 a -= a_offset;
849 b_dim1 = bystride;
850 b_offset = 1 + b_dim1;
851 b -= b_offset;
852
853 /* Early exit if possible */
854 if (m == 0 || n == 0 || k == 0)
855 return;
856
857 /* Empty c first. */
858 for (j=1; j<=n; j++)
859 for (i=1; i<=m; i++)
860 c[i + j * c_dim1] = (GFC_REAL_10)0;
861
862 /* Start turning the crank. */
863 i1 = n;
864 for (jj = 1; jj <= i1; jj += 512)
865 {
866 /* Computing MIN */
867 i2 = 512;
868 i3 = n - jj + 1;
869 jsec = min(i2,i3);
870 ujsec = jsec - jsec % 4;
871 i2 = k;
872 for (ll = 1; ll <= i2; ll += 256)
873 {
874 /* Computing MIN */
875 i3 = 256;
876 i4 = k - ll + 1;
877 lsec = min(i3,i4);
878 ulsec = lsec - lsec % 2;
879
880 i3 = m;
881 for (ii = 1; ii <= i3; ii += 256)
882 {
883 /* Computing MIN */
884 i4 = 256;
885 i5 = m - ii + 1;
886 isec = min(i4,i5);
887 uisec = isec - isec % 2;
888 i4 = ll + ulsec - 1;
889 for (l = ll; l <= i4; l += 2)
890 {
891 i5 = ii + uisec - 1;
892 for (i = ii; i <= i5; i += 2)
893 {
894 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
895 a[i + l * a_dim1];
896 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
897 a[i + (l + 1) * a_dim1];
898 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
899 a[i + 1 + l * a_dim1];
900 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
901 a[i + 1 + (l + 1) * a_dim1];
902 }
903 if (uisec < isec)
904 {
905 t1[l - ll + 1 + (isec << 8) - 257] =
906 a[ii + isec - 1 + l * a_dim1];
907 t1[l - ll + 2 + (isec << 8) - 257] =
908 a[ii + isec - 1 + (l + 1) * a_dim1];
909 }
910 }
911 if (ulsec < lsec)
912 {
913 i4 = ii + isec - 1;
914 for (i = ii; i<= i4; ++i)
915 {
916 t1[lsec + ((i - ii + 1) << 8) - 257] =
917 a[i + (ll + lsec - 1) * a_dim1];
918 }
919 }
920
921 uisec = isec - isec % 4;
922 i4 = jj + ujsec - 1;
923 for (j = jj; j <= i4; j += 4)
924 {
925 i5 = ii + uisec - 1;
926 for (i = ii; i <= i5; i += 4)
927 {
928 f11 = c[i + j * c_dim1];
929 f21 = c[i + 1 + j * c_dim1];
930 f12 = c[i + (j + 1) * c_dim1];
931 f22 = c[i + 1 + (j + 1) * c_dim1];
932 f13 = c[i + (j + 2) * c_dim1];
933 f23 = c[i + 1 + (j + 2) * c_dim1];
934 f14 = c[i + (j + 3) * c_dim1];
935 f24 = c[i + 1 + (j + 3) * c_dim1];
936 f31 = c[i + 2 + j * c_dim1];
937 f41 = c[i + 3 + j * c_dim1];
938 f32 = c[i + 2 + (j + 1) * c_dim1];
939 f42 = c[i + 3 + (j + 1) * c_dim1];
940 f33 = c[i + 2 + (j + 2) * c_dim1];
941 f43 = c[i + 3 + (j + 2) * c_dim1];
942 f34 = c[i + 2 + (j + 3) * c_dim1];
943 f44 = c[i + 3 + (j + 3) * c_dim1];
944 i6 = ll + lsec - 1;
945 for (l = ll; l <= i6; ++l)
946 {
947 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
948 * b[l + j * b_dim1];
949 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
950 * b[l + j * b_dim1];
951 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
952 * b[l + (j + 1) * b_dim1];
953 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
954 * b[l + (j + 1) * b_dim1];
955 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
956 * b[l + (j + 2) * b_dim1];
957 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
958 * b[l + (j + 2) * b_dim1];
959 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
960 * b[l + (j + 3) * b_dim1];
961 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
962 * b[l + (j + 3) * b_dim1];
963 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
964 * b[l + j * b_dim1];
965 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
966 * b[l + j * b_dim1];
967 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
968 * b[l + (j + 1) * b_dim1];
969 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
970 * b[l + (j + 1) * b_dim1];
971 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
972 * b[l + (j + 2) * b_dim1];
973 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
974 * b[l + (j + 2) * b_dim1];
975 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
976 * b[l + (j + 3) * b_dim1];
977 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
978 * b[l + (j + 3) * b_dim1];
979 }
980 c[i + j * c_dim1] = f11;
981 c[i + 1 + j * c_dim1] = f21;
982 c[i + (j + 1) * c_dim1] = f12;
983 c[i + 1 + (j + 1) * c_dim1] = f22;
984 c[i + (j + 2) * c_dim1] = f13;
985 c[i + 1 + (j + 2) * c_dim1] = f23;
986 c[i + (j + 3) * c_dim1] = f14;
987 c[i + 1 + (j + 3) * c_dim1] = f24;
988 c[i + 2 + j * c_dim1] = f31;
989 c[i + 3 + j * c_dim1] = f41;
990 c[i + 2 + (j + 1) * c_dim1] = f32;
991 c[i + 3 + (j + 1) * c_dim1] = f42;
992 c[i + 2 + (j + 2) * c_dim1] = f33;
993 c[i + 3 + (j + 2) * c_dim1] = f43;
994 c[i + 2 + (j + 3) * c_dim1] = f34;
995 c[i + 3 + (j + 3) * c_dim1] = f44;
996 }
997 if (uisec < isec)
998 {
999 i5 = ii + isec - 1;
1000 for (i = ii + uisec; i <= i5; ++i)
1001 {
1002 f11 = c[i + j * c_dim1];
1003 f12 = c[i + (j + 1) * c_dim1];
1004 f13 = c[i + (j + 2) * c_dim1];
1005 f14 = c[i + (j + 3) * c_dim1];
1006 i6 = ll + lsec - 1;
1007 for (l = ll; l <= i6; ++l)
1008 {
1009 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010 257] * b[l + j * b_dim1];
1011 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012 257] * b[l + (j + 1) * b_dim1];
1013 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014 257] * b[l + (j + 2) * b_dim1];
1015 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016 257] * b[l + (j + 3) * b_dim1];
1017 }
1018 c[i + j * c_dim1] = f11;
1019 c[i + (j + 1) * c_dim1] = f12;
1020 c[i + (j + 2) * c_dim1] = f13;
1021 c[i + (j + 3) * c_dim1] = f14;
1022 }
1023 }
1024 }
1025 if (ujsec < jsec)
1026 {
1027 i4 = jj + jsec - 1;
1028 for (j = jj + ujsec; j <= i4; ++j)
1029 {
1030 i5 = ii + uisec - 1;
1031 for (i = ii; i <= i5; i += 4)
1032 {
1033 f11 = c[i + j * c_dim1];
1034 f21 = c[i + 1 + j * c_dim1];
1035 f31 = c[i + 2 + j * c_dim1];
1036 f41 = c[i + 3 + j * c_dim1];
1037 i6 = ll + lsec - 1;
1038 for (l = ll; l <= i6; ++l)
1039 {
1040 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + j * b_dim1];
1042 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043 257] * b[l + j * b_dim1];
1044 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045 257] * b[l + j * b_dim1];
1046 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047 257] * b[l + j * b_dim1];
1048 }
1049 c[i + j * c_dim1] = f11;
1050 c[i + 1 + j * c_dim1] = f21;
1051 c[i + 2 + j * c_dim1] = f31;
1052 c[i + 3 + j * c_dim1] = f41;
1053 }
1054 i5 = ii + isec - 1;
1055 for (i = ii + uisec; i <= i5; ++i)
1056 {
1057 f11 = c[i + j * c_dim1];
1058 i6 = ll + lsec - 1;
1059 for (l = ll; l <= i6; ++l)
1060 {
1061 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062 257] * b[l + j * b_dim1];
1063 }
1064 c[i + j * c_dim1] = f11;
1065 }
1066 }
1067 }
1068 }
1069 }
1070 }
1071 return;
1072 }
1073 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1074 {
1075 if (GFC_DESCRIPTOR_RANK (a) != 1)
1076 {
1077 const GFC_REAL_10 *restrict abase_x;
1078 const GFC_REAL_10 *restrict bbase_y;
1079 GFC_REAL_10 *restrict dest_y;
1080 GFC_REAL_10 s;
1081
1082 for (y = 0; y < ycount; y++)
1083 {
1084 bbase_y = &bbase[y*bystride];
1085 dest_y = &dest[y*rystride];
1086 for (x = 0; x < xcount; x++)
1087 {
1088 abase_x = &abase[x*axstride];
1089 s = (GFC_REAL_10) 0;
1090 for (n = 0; n < count; n++)
1091 s += abase_x[n] * bbase_y[n];
1092 dest_y[x] = s;
1093 }
1094 }
1095 }
1096 else
1097 {
1098 const GFC_REAL_10 *restrict bbase_y;
1099 GFC_REAL_10 s;
1100
1101 for (y = 0; y < ycount; y++)
1102 {
1103 bbase_y = &bbase[y*bystride];
1104 s = (GFC_REAL_10) 0;
1105 for (n = 0; n < count; n++)
1106 s += abase[n*axstride] * bbase_y[n];
1107 dest[y*rystride] = s;
1108 }
1109 }
1110 }
1111 else if (axstride < aystride)
1112 {
1113 for (y = 0; y < ycount; y++)
1114 for (x = 0; x < xcount; x++)
1115 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
1116
1117 for (y = 0; y < ycount; y++)
1118 for (n = 0; n < count; n++)
1119 for (x = 0; x < xcount; x++)
1120 /* dest[x,y] += a[x,n] * b[n,y] */
1121 dest[x*rxstride + y*rystride] +=
1122 abase[x*axstride + n*aystride] *
1123 bbase[n*bxstride + y*bystride];
1124 }
1125 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1126 {
1127 const GFC_REAL_10 *restrict bbase_y;
1128 GFC_REAL_10 s;
1129
1130 for (y = 0; y < ycount; y++)
1131 {
1132 bbase_y = &bbase[y*bystride];
1133 s = (GFC_REAL_10) 0;
1134 for (n = 0; n < count; n++)
1135 s += abase[n*axstride] * bbase_y[n*bxstride];
1136 dest[y*rxstride] = s;
1137 }
1138 }
1139 else
1140 {
1141 const GFC_REAL_10 *restrict abase_x;
1142 const GFC_REAL_10 *restrict bbase_y;
1143 GFC_REAL_10 *restrict dest_y;
1144 GFC_REAL_10 s;
1145
1146 for (y = 0; y < ycount; y++)
1147 {
1148 bbase_y = &bbase[y*bystride];
1149 dest_y = &dest[y*rystride];
1150 for (x = 0; x < xcount; x++)
1151 {
1152 abase_x = &abase[x*axstride];
1153 s = (GFC_REAL_10) 0;
1154 for (n = 0; n < count; n++)
1155 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156 dest_y[x*rxstride] = s;
1157 }
1158 }
1159 }
1160}
1161#undef POW3
1162#undef min
1163#undef max
1164
1165#endif /* HAVE_AVX2 */
1166
1167#ifdef HAVE_AVX512F
1168static void
1169matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
1170 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
1171 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172static void
1173matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
1174 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
1175 int blas_limit, blas_call gemm)
1176{
1177 const GFC_REAL_10 * restrict abase;
1178 const GFC_REAL_10 * restrict bbase;
1179 GFC_REAL_10 * restrict dest;
1180
1181 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182 index_type x, y, n, count, xcount, ycount;
1183
1184 assert (GFC_DESCRIPTOR_RANK (a) == 2
1185 || GFC_DESCRIPTOR_RANK (b) == 2);
1186
1187/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1188
1189 Either A or B (but not both) can be rank 1:
1190
1191 o One-dimensional argument A is implicitly treated as a row matrix
1192 dimensioned [1,count], so xcount=1.
1193
1194 o One-dimensional argument B is implicitly treated as a column matrix
1195 dimensioned [count, 1], so ycount=1.
1196*/
1197
1198 if (retarray->base_addr == NULL)
1199 {
1200 if (GFC_DESCRIPTOR_RANK (a) == 1)
1201 {
1202 GFC_DIMENSION_SET(retarray->dim[0], 0,
1203 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1204 }
1205 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1206 {
1207 GFC_DIMENSION_SET(retarray->dim[0], 0,
1208 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1209 }
1210 else
1211 {
1212 GFC_DIMENSION_SET(retarray->dim[0], 0,
1213 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1214
1215 GFC_DIMENSION_SET(retarray->dim[1], 0,
1216 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217 GFC_DESCRIPTOR_EXTENT(retarray,0));
1218 }
1219
1220 retarray->base_addr
1221 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
1222 retarray->offset = 0;
1223 }
1224 else if (unlikely (compile_options.bounds_check))
1225 {
1226 index_type ret_extent, arg_extent;
1227
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1229 {
1230 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232 if (arg_extent != ret_extent)
1233 runtime_error ("Incorrect extent in return array in"
1234 " MATMUL intrinsic: is %ld, should be %ld",
1235 (long int) ret_extent, (long int) arg_extent);
1236 }
1237 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1238 {
1239 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241 if (arg_extent != ret_extent)
1242 runtime_error ("Incorrect extent in return array in"
1243 " MATMUL intrinsic: is %ld, should be %ld",
1244 (long int) ret_extent, (long int) arg_extent);
1245 }
1246 else
1247 {
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic for dimension 1:"
1253 " is %ld, should be %ld",
1254 (long int) ret_extent, (long int) arg_extent);
1255
1256 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258 if (arg_extent != ret_extent)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 2:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1263 }
1264 }
1265
1266
1267 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1268 {
1269 /* One-dimensional result may be addressed in the code below
1270 either as a row or a column matrix. We want both cases to
1271 work. */
1272 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1273 }
1274 else
1275 {
1276 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1278 }
1279
1280
1281 if (GFC_DESCRIPTOR_RANK (a) == 1)
1282 {
1283 /* Treat it as a a row matrix A[1,count]. */
1284 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285 aystride = 1;
1286
1287 xcount = 1;
1288 count = GFC_DESCRIPTOR_EXTENT(a,0);
1289 }
1290 else
1291 {
1292 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1294
1295 count = GFC_DESCRIPTOR_EXTENT(a,1);
1296 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1297 }
1298
1299 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1300 {
1301 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1303 }
1304
1305 if (GFC_DESCRIPTOR_RANK (b) == 1)
1306 {
1307 /* Treat it as a column matrix B[count,1] */
1308 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1309
1310 /* bystride should never be used for 1-dimensional b.
1311 in case it is we want it to cause a segfault, rather than
1312 an incorrect result. */
1313 bystride = 0xDEADBEEF;
1314 ycount = 1;
1315 }
1316 else
1317 {
1318 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1321 }
1322
1323 abase = a->base_addr;
1324 bbase = b->base_addr;
1325 dest = retarray->base_addr;
1326
1327 /* Now that everything is set up, we perform the multiplication
1328 itself. */
1329
1330#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331#define min(a,b) ((a) <= (b) ? (a) : (b))
1332#define max(a,b) ((a) >= (b) ? (a) : (b))
1333
1334 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335 && (bxstride == 1 || bystride == 1)
1336 && (((float) xcount) * ((float) ycount) * ((float) count)
1337 > POW3(blas_limit)))
1338 {
1339 const int m = xcount, n = ycount, k = count, ldc = rystride;
1340 const GFC_REAL_10 one = 1, zero = 0;
1341 const int lda = (axstride == 1) ? aystride : axstride,
1342 ldb = (bxstride == 1) ? bystride : bxstride;
1343
1344 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1345 {
1346 assert (gemm != NULL);
1347 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349 &ldc, 1, 1);
1350 return;
1351 }
1352 }
1353
1354 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1355 {
1356 /* This block of code implements a tuned matmul, derived from
1357 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1358
1359 Bo Kagstrom and Per Ling
1360 Department of Computing Science
1361 Umea University
1362 S-901 87 Umea, Sweden
1363
1364 from netlib.org, translated to C, and modified for matmul.m4. */
1365
1366 const GFC_REAL_10 *a, *b;
1367 GFC_REAL_10 *c;
1368 const index_type m = xcount, n = ycount, k = count;
1369
1370 /* System generated locals */
1371 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372 i1, i2, i3, i4, i5, i6;
1373
1374 /* Local variables */
1375 GFC_REAL_10 t1[65536], /* was [256][256] */
1376 f11, f12, f21, f22, f31, f32, f41, f42,
1377 f13, f14, f23, f24, f33, f34, f43, f44;
1378 index_type i, j, l, ii, jj, ll;
1379 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1380
1381 a = abase;
1382 b = bbase;
1383 c = retarray->base_addr;
1384
1385 /* Parameter adjustments */
1386 c_dim1 = rystride;
1387 c_offset = 1 + c_dim1;
1388 c -= c_offset;
1389 a_dim1 = aystride;
1390 a_offset = 1 + a_dim1;
1391 a -= a_offset;
1392 b_dim1 = bystride;
1393 b_offset = 1 + b_dim1;
1394 b -= b_offset;
1395
1396 /* Early exit if possible */
1397 if (m == 0 || n == 0 || k == 0)
1398 return;
1399
1400 /* Empty c first. */
1401 for (j=1; j<=n; j++)
1402 for (i=1; i<=m; i++)
1403 c[i + j * c_dim1] = (GFC_REAL_10)0;
1404
1405 /* Start turning the crank. */
1406 i1 = n;
1407 for (jj = 1; jj <= i1; jj += 512)
1408 {
1409 /* Computing MIN */
1410 i2 = 512;
1411 i3 = n - jj + 1;
1412 jsec = min(i2,i3);
1413 ujsec = jsec - jsec % 4;
1414 i2 = k;
1415 for (ll = 1; ll <= i2; ll += 256)
1416 {
1417 /* Computing MIN */
1418 i3 = 256;
1419 i4 = k - ll + 1;
1420 lsec = min(i3,i4);
1421 ulsec = lsec - lsec % 2;
1422
1423 i3 = m;
1424 for (ii = 1; ii <= i3; ii += 256)
1425 {
1426 /* Computing MIN */
1427 i4 = 256;
1428 i5 = m - ii + 1;
1429 isec = min(i4,i5);
1430 uisec = isec - isec % 2;
1431 i4 = ll + ulsec - 1;
1432 for (l = ll; l <= i4; l += 2)
1433 {
1434 i5 = ii + uisec - 1;
1435 for (i = ii; i <= i5; i += 2)
1436 {
1437 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438 a[i + l * a_dim1];
1439 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440 a[i + (l + 1) * a_dim1];
1441 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442 a[i + 1 + l * a_dim1];
1443 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444 a[i + 1 + (l + 1) * a_dim1];
1445 }
1446 if (uisec < isec)
1447 {
1448 t1[l - ll + 1 + (isec << 8) - 257] =
1449 a[ii + isec - 1 + l * a_dim1];
1450 t1[l - ll + 2 + (isec << 8) - 257] =
1451 a[ii + isec - 1 + (l + 1) * a_dim1];
1452 }
1453 }
1454 if (ulsec < lsec)
1455 {
1456 i4 = ii + isec - 1;
1457 for (i = ii; i<= i4; ++i)
1458 {
1459 t1[lsec + ((i - ii + 1) << 8) - 257] =
1460 a[i + (ll + lsec - 1) * a_dim1];
1461 }
1462 }
1463
1464 uisec = isec - isec % 4;
1465 i4 = jj + ujsec - 1;
1466 for (j = jj; j <= i4; j += 4)
1467 {
1468 i5 = ii + uisec - 1;
1469 for (i = ii; i <= i5; i += 4)
1470 {
1471 f11 = c[i + j * c_dim1];
1472 f21 = c[i + 1 + j * c_dim1];
1473 f12 = c[i + (j + 1) * c_dim1];
1474 f22 = c[i + 1 + (j + 1) * c_dim1];
1475 f13 = c[i + (j + 2) * c_dim1];
1476 f23 = c[i + 1 + (j + 2) * c_dim1];
1477 f14 = c[i + (j + 3) * c_dim1];
1478 f24 = c[i + 1 + (j + 3) * c_dim1];
1479 f31 = c[i + 2 + j * c_dim1];
1480 f41 = c[i + 3 + j * c_dim1];
1481 f32 = c[i + 2 + (j + 1) * c_dim1];
1482 f42 = c[i + 3 + (j + 1) * c_dim1];
1483 f33 = c[i + 2 + (j + 2) * c_dim1];
1484 f43 = c[i + 3 + (j + 2) * c_dim1];
1485 f34 = c[i + 2 + (j + 3) * c_dim1];
1486 f44 = c[i + 3 + (j + 3) * c_dim1];
1487 i6 = ll + lsec - 1;
1488 for (l = ll; l <= i6; ++l)
1489 {
1490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491 * b[l + j * b_dim1];
1492 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493 * b[l + j * b_dim1];
1494 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495 * b[l + (j + 1) * b_dim1];
1496 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497 * b[l + (j + 1) * b_dim1];
1498 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499 * b[l + (j + 2) * b_dim1];
1500 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501 * b[l + (j + 2) * b_dim1];
1502 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503 * b[l + (j + 3) * b_dim1];
1504 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505 * b[l + (j + 3) * b_dim1];
1506 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507 * b[l + j * b_dim1];
1508 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509 * b[l + j * b_dim1];
1510 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511 * b[l + (j + 1) * b_dim1];
1512 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513 * b[l + (j + 1) * b_dim1];
1514 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515 * b[l + (j + 2) * b_dim1];
1516 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517 * b[l + (j + 2) * b_dim1];
1518 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519 * b[l + (j + 3) * b_dim1];
1520 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521 * b[l + (j + 3) * b_dim1];
1522 }
1523 c[i + j * c_dim1] = f11;
1524 c[i + 1 + j * c_dim1] = f21;
1525 c[i + (j + 1) * c_dim1] = f12;
1526 c[i + 1 + (j + 1) * c_dim1] = f22;
1527 c[i + (j + 2) * c_dim1] = f13;
1528 c[i + 1 + (j + 2) * c_dim1] = f23;
1529 c[i + (j + 3) * c_dim1] = f14;
1530 c[i + 1 + (j + 3) * c_dim1] = f24;
1531 c[i + 2 + j * c_dim1] = f31;
1532 c[i + 3 + j * c_dim1] = f41;
1533 c[i + 2 + (j + 1) * c_dim1] = f32;
1534 c[i + 3 + (j + 1) * c_dim1] = f42;
1535 c[i + 2 + (j + 2) * c_dim1] = f33;
1536 c[i + 3 + (j + 2) * c_dim1] = f43;
1537 c[i + 2 + (j + 3) * c_dim1] = f34;
1538 c[i + 3 + (j + 3) * c_dim1] = f44;
1539 }
1540 if (uisec < isec)
1541 {
1542 i5 = ii + isec - 1;
1543 for (i = ii + uisec; i <= i5; ++i)
1544 {
1545 f11 = c[i + j * c_dim1];
1546 f12 = c[i + (j + 1) * c_dim1];
1547 f13 = c[i + (j + 2) * c_dim1];
1548 f14 = c[i + (j + 3) * c_dim1];
1549 i6 = ll + lsec - 1;
1550 for (l = ll; l <= i6; ++l)
1551 {
1552 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553 257] * b[l + j * b_dim1];
1554 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555 257] * b[l + (j + 1) * b_dim1];
1556 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557 257] * b[l + (j + 2) * b_dim1];
1558 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559 257] * b[l + (j + 3) * b_dim1];
1560 }
1561 c[i + j * c_dim1] = f11;
1562 c[i + (j + 1) * c_dim1] = f12;
1563 c[i + (j + 2) * c_dim1] = f13;
1564 c[i + (j + 3) * c_dim1] = f14;
1565 }
1566 }
1567 }
1568 if (ujsec < jsec)
1569 {
1570 i4 = jj + jsec - 1;
1571 for (j = jj + ujsec; j <= i4; ++j)
1572 {
1573 i5 = ii + uisec - 1;
1574 for (i = ii; i <= i5; i += 4)
1575 {
1576 f11 = c[i + j * c_dim1];
1577 f21 = c[i + 1 + j * c_dim1];
1578 f31 = c[i + 2 + j * c_dim1];
1579 f41 = c[i + 3 + j * c_dim1];
1580 i6 = ll + lsec - 1;
1581 for (l = ll; l <= i6; ++l)
1582 {
1583 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584 257] * b[l + j * b_dim1];
1585 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586 257] * b[l + j * b_dim1];
1587 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588 257] * b[l + j * b_dim1];
1589 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590 257] * b[l + j * b_dim1];
1591 }
1592 c[i + j * c_dim1] = f11;
1593 c[i + 1 + j * c_dim1] = f21;
1594 c[i + 2 + j * c_dim1] = f31;
1595 c[i + 3 + j * c_dim1] = f41;
1596 }
1597 i5 = ii + isec - 1;
1598 for (i = ii + uisec; i <= i5; ++i)
1599 {
1600 f11 = c[i + j * c_dim1];
1601 i6 = ll + lsec - 1;
1602 for (l = ll; l <= i6; ++l)
1603 {
1604 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605 257] * b[l + j * b_dim1];
1606 }
1607 c[i + j * c_dim1] = f11;
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 return;
1615 }
1616 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1617 {
1618 if (GFC_DESCRIPTOR_RANK (a) != 1)
1619 {
1620 const GFC_REAL_10 *restrict abase_x;
1621 const GFC_REAL_10 *restrict bbase_y;
1622 GFC_REAL_10 *restrict dest_y;
1623 GFC_REAL_10 s;
1624
1625 for (y = 0; y < ycount; y++)
1626 {
1627 bbase_y = &bbase[y*bystride];
1628 dest_y = &dest[y*rystride];
1629 for (x = 0; x < xcount; x++)
1630 {
1631 abase_x = &abase[x*axstride];
1632 s = (GFC_REAL_10) 0;
1633 for (n = 0; n < count; n++)
1634 s += abase_x[n] * bbase_y[n];
1635 dest_y[x] = s;
1636 }
1637 }
1638 }
1639 else
1640 {
1641 const GFC_REAL_10 *restrict bbase_y;
1642 GFC_REAL_10 s;
1643
1644 for (y = 0; y < ycount; y++)
1645 {
1646 bbase_y = &bbase[y*bystride];
1647 s = (GFC_REAL_10) 0;
1648 for (n = 0; n < count; n++)
1649 s += abase[n*axstride] * bbase_y[n];
1650 dest[y*rystride] = s;
1651 }
1652 }
1653 }
1654 else if (axstride < aystride)
1655 {
1656 for (y = 0; y < ycount; y++)
1657 for (x = 0; x < xcount; x++)
1658 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
1659
1660 for (y = 0; y < ycount; y++)
1661 for (n = 0; n < count; n++)
1662 for (x = 0; x < xcount; x++)
1663 /* dest[x,y] += a[x,n] * b[n,y] */
1664 dest[x*rxstride + y*rystride] +=
1665 abase[x*axstride + n*aystride] *
1666 bbase[n*bxstride + y*bystride];
1667 }
1668 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1669 {
1670 const GFC_REAL_10 *restrict bbase_y;
1671 GFC_REAL_10 s;
1672
1673 for (y = 0; y < ycount; y++)
1674 {
1675 bbase_y = &bbase[y*bystride];
1676 s = (GFC_REAL_10) 0;
1677 for (n = 0; n < count; n++)
1678 s += abase[n*axstride] * bbase_y[n*bxstride];
1679 dest[y*rxstride] = s;
1680 }
1681 }
1682 else
1683 {
1684 const GFC_REAL_10 *restrict abase_x;
1685 const GFC_REAL_10 *restrict bbase_y;
1686 GFC_REAL_10 *restrict dest_y;
1687 GFC_REAL_10 s;
1688
1689 for (y = 0; y < ycount; y++)
1690 {
1691 bbase_y = &bbase[y*bystride];
1692 dest_y = &dest[y*rystride];
1693 for (x = 0; x < xcount; x++)
1694 {
1695 abase_x = &abase[x*axstride];
1696 s = (GFC_REAL_10) 0;
1697 for (n = 0; n < count; n++)
1698 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699 dest_y[x*rxstride] = s;
1700 }
1701 }
1702 }
1703}
1704#undef POW3
1705#undef min
1706#undef max
1707
1708#endif /* HAVE_AVX512F */
1709
1710/* Function to fall back to if there is no special processor-specific version. */
1711static void
1712matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
1713 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
1714 int blas_limit, blas_call gemm)
1715{
1716 const GFC_REAL_10 * restrict abase;
1717 const GFC_REAL_10 * restrict bbase;
1718 GFC_REAL_10 * restrict dest;
1719
1720 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721 index_type x, y, n, count, xcount, ycount;
1722
1723 assert (GFC_DESCRIPTOR_RANK (a) == 2
1724 || GFC_DESCRIPTOR_RANK (b) == 2);
1725
1726/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1727
1728 Either A or B (but not both) can be rank 1:
1729
1730 o One-dimensional argument A is implicitly treated as a row matrix
1731 dimensioned [1,count], so xcount=1.
1732
1733 o One-dimensional argument B is implicitly treated as a column matrix
1734 dimensioned [count, 1], so ycount=1.
1735*/
1736
1737 if (retarray->base_addr == NULL)
1738 {
1739 if (GFC_DESCRIPTOR_RANK (a) == 1)
1740 {
1741 GFC_DIMENSION_SET(retarray->dim[0], 0,
1742 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1743 }
1744 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1745 {
1746 GFC_DIMENSION_SET(retarray->dim[0], 0,
1747 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1748 }
1749 else
1750 {
1751 GFC_DIMENSION_SET(retarray->dim[0], 0,
1752 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1753
1754 GFC_DIMENSION_SET(retarray->dim[1], 0,
1755 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756 GFC_DESCRIPTOR_EXTENT(retarray,0));
1757 }
1758
1759 retarray->base_addr
1760 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
1761 retarray->offset = 0;
1762 }
1763 else if (unlikely (compile_options.bounds_check))
1764 {
1765 index_type ret_extent, arg_extent;
1766
1767 if (GFC_DESCRIPTOR_RANK (a) == 1)
1768 {
1769 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771 if (arg_extent != ret_extent)
1772 runtime_error ("Incorrect extent in return array in"
1773 " MATMUL intrinsic: is %ld, should be %ld",
1774 (long int) ret_extent, (long int) arg_extent);
1775 }
1776 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1777 {
1778 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780 if (arg_extent != ret_extent)
1781 runtime_error ("Incorrect extent in return array in"
1782 " MATMUL intrinsic: is %ld, should be %ld",
1783 (long int) ret_extent, (long int) arg_extent);
1784 }
1785 else
1786 {
1787 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789 if (arg_extent != ret_extent)
1790 runtime_error ("Incorrect extent in return array in"
1791 " MATMUL intrinsic for dimension 1:"
1792 " is %ld, should be %ld",
1793 (long int) ret_extent, (long int) arg_extent);
1794
1795 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797 if (arg_extent != ret_extent)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 2:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1802 }
1803 }
1804
1805
1806 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1807 {
1808 /* One-dimensional result may be addressed in the code below
1809 either as a row or a column matrix. We want both cases to
1810 work. */
1811 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1812 }
1813 else
1814 {
1815 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1817 }
1818
1819
1820 if (GFC_DESCRIPTOR_RANK (a) == 1)
1821 {
1822 /* Treat it as a a row matrix A[1,count]. */
1823 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824 aystride = 1;
1825
1826 xcount = 1;
1827 count = GFC_DESCRIPTOR_EXTENT(a,0);
1828 }
1829 else
1830 {
1831 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1833
1834 count = GFC_DESCRIPTOR_EXTENT(a,1);
1835 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1836 }
1837
1838 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1839 {
1840 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1842 }
1843
1844 if (GFC_DESCRIPTOR_RANK (b) == 1)
1845 {
1846 /* Treat it as a column matrix B[count,1] */
1847 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1848
1849 /* bystride should never be used for 1-dimensional b.
1850 in case it is we want it to cause a segfault, rather than
1851 an incorrect result. */
1852 bystride = 0xDEADBEEF;
1853 ycount = 1;
1854 }
1855 else
1856 {
1857 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1860 }
1861
1862 abase = a->base_addr;
1863 bbase = b->base_addr;
1864 dest = retarray->base_addr;
1865
1866 /* Now that everything is set up, we perform the multiplication
1867 itself. */
1868
1869#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870#define min(a,b) ((a) <= (b) ? (a) : (b))
1871#define max(a,b) ((a) >= (b) ? (a) : (b))
1872
1873 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874 && (bxstride == 1 || bystride == 1)
1875 && (((float) xcount) * ((float) ycount) * ((float) count)
1876 > POW3(blas_limit)))
1877 {
1878 const int m = xcount, n = ycount, k = count, ldc = rystride;
1879 const GFC_REAL_10 one = 1, zero = 0;
1880 const int lda = (axstride == 1) ? aystride : axstride,
1881 ldb = (bxstride == 1) ? bystride : bxstride;
1882
1883 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1884 {
1885 assert (gemm != NULL);
1886 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888 &ldc, 1, 1);
1889 return;
1890 }
1891 }
1892
1893 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1894 {
1895 /* This block of code implements a tuned matmul, derived from
1896 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1897
1898 Bo Kagstrom and Per Ling
1899 Department of Computing Science
1900 Umea University
1901 S-901 87 Umea, Sweden
1902
1903 from netlib.org, translated to C, and modified for matmul.m4. */
1904
1905 const GFC_REAL_10 *a, *b;
1906 GFC_REAL_10 *c;
1907 const index_type m = xcount, n = ycount, k = count;
1908
1909 /* System generated locals */
1910 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911 i1, i2, i3, i4, i5, i6;
1912
1913 /* Local variables */
1914 GFC_REAL_10 t1[65536], /* was [256][256] */
1915 f11, f12, f21, f22, f31, f32, f41, f42,
1916 f13, f14, f23, f24, f33, f34, f43, f44;
1917 index_type i, j, l, ii, jj, ll;
1918 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1919
1920 a = abase;
1921 b = bbase;
1922 c = retarray->base_addr;
1923
1924 /* Parameter adjustments */
1925 c_dim1 = rystride;
1926 c_offset = 1 + c_dim1;
1927 c -= c_offset;
1928 a_dim1 = aystride;
1929 a_offset = 1 + a_dim1;
1930 a -= a_offset;
1931 b_dim1 = bystride;
1932 b_offset = 1 + b_dim1;
1933 b -= b_offset;
1934
1935 /* Early exit if possible */
1936 if (m == 0 || n == 0 || k == 0)
1937 return;
1938
1939 /* Empty c first. */
1940 for (j=1; j<=n; j++)
1941 for (i=1; i<=m; i++)
1942 c[i + j * c_dim1] = (GFC_REAL_10)0;
1943
1944 /* Start turning the crank. */
1945 i1 = n;
1946 for (jj = 1; jj <= i1; jj += 512)
1947 {
1948 /* Computing MIN */
1949 i2 = 512;
1950 i3 = n - jj + 1;
1951 jsec = min(i2,i3);
1952 ujsec = jsec - jsec % 4;
1953 i2 = k;
1954 for (ll = 1; ll <= i2; ll += 256)
1955 {
1956 /* Computing MIN */
1957 i3 = 256;
1958 i4 = k - ll + 1;
1959 lsec = min(i3,i4);
1960 ulsec = lsec - lsec % 2;
1961
1962 i3 = m;
1963 for (ii = 1; ii <= i3; ii += 256)
1964 {
1965 /* Computing MIN */
1966 i4 = 256;
1967 i5 = m - ii + 1;
1968 isec = min(i4,i5);
1969 uisec = isec - isec % 2;
1970 i4 = ll + ulsec - 1;
1971 for (l = ll; l <= i4; l += 2)
1972 {
1973 i5 = ii + uisec - 1;
1974 for (i = ii; i <= i5; i += 2)
1975 {
1976 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977 a[i + l * a_dim1];
1978 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979 a[i + (l + 1) * a_dim1];
1980 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981 a[i + 1 + l * a_dim1];
1982 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983 a[i + 1 + (l + 1) * a_dim1];
1984 }
1985 if (uisec < isec)
1986 {
1987 t1[l - ll + 1 + (isec << 8) - 257] =
1988 a[ii + isec - 1 + l * a_dim1];
1989 t1[l - ll + 2 + (isec << 8) - 257] =
1990 a[ii + isec - 1 + (l + 1) * a_dim1];
1991 }
1992 }
1993 if (ulsec < lsec)
1994 {
1995 i4 = ii + isec - 1;
1996 for (i = ii; i<= i4; ++i)
1997 {
1998 t1[lsec + ((i - ii + 1) << 8) - 257] =
1999 a[i + (ll + lsec - 1) * a_dim1];
2000 }
2001 }
2002
2003 uisec = isec - isec % 4;
2004 i4 = jj + ujsec - 1;
2005 for (j = jj; j <= i4; j += 4)
2006 {
2007 i5 = ii + uisec - 1;
2008 for (i = ii; i <= i5; i += 4)
2009 {
2010 f11 = c[i + j * c_dim1];
2011 f21 = c[i + 1 + j * c_dim1];
2012 f12 = c[i + (j + 1) * c_dim1];
2013 f22 = c[i + 1 + (j + 1) * c_dim1];
2014 f13 = c[i + (j + 2) * c_dim1];
2015 f23 = c[i + 1 + (j + 2) * c_dim1];
2016 f14 = c[i + (j + 3) * c_dim1];
2017 f24 = c[i + 1 + (j + 3) * c_dim1];
2018 f31 = c[i + 2 + j * c_dim1];
2019 f41 = c[i + 3 + j * c_dim1];
2020 f32 = c[i + 2 + (j + 1) * c_dim1];
2021 f42 = c[i + 3 + (j + 1) * c_dim1];
2022 f33 = c[i + 2 + (j + 2) * c_dim1];
2023 f43 = c[i + 3 + (j + 2) * c_dim1];
2024 f34 = c[i + 2 + (j + 3) * c_dim1];
2025 f44 = c[i + 3 + (j + 3) * c_dim1];
2026 i6 = ll + lsec - 1;
2027 for (l = ll; l <= i6; ++l)
2028 {
2029 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030 * b[l + j * b_dim1];
2031 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032 * b[l + j * b_dim1];
2033 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034 * b[l + (j + 1) * b_dim1];
2035 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036 * b[l + (j + 1) * b_dim1];
2037 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038 * b[l + (j + 2) * b_dim1];
2039 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040 * b[l + (j + 2) * b_dim1];
2041 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042 * b[l + (j + 3) * b_dim1];
2043 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044 * b[l + (j + 3) * b_dim1];
2045 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046 * b[l + j * b_dim1];
2047 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048 * b[l + j * b_dim1];
2049 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050 * b[l + (j + 1) * b_dim1];
2051 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052 * b[l + (j + 1) * b_dim1];
2053 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054 * b[l + (j + 2) * b_dim1];
2055 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056 * b[l + (j + 2) * b_dim1];
2057 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058 * b[l + (j + 3) * b_dim1];
2059 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060 * b[l + (j + 3) * b_dim1];
2061 }
2062 c[i + j * c_dim1] = f11;
2063 c[i + 1 + j * c_dim1] = f21;
2064 c[i + (j + 1) * c_dim1] = f12;
2065 c[i + 1 + (j + 1) * c_dim1] = f22;
2066 c[i + (j + 2) * c_dim1] = f13;
2067 c[i + 1 + (j + 2) * c_dim1] = f23;
2068 c[i + (j + 3) * c_dim1] = f14;
2069 c[i + 1 + (j + 3) * c_dim1] = f24;
2070 c[i + 2 + j * c_dim1] = f31;
2071 c[i + 3 + j * c_dim1] = f41;
2072 c[i + 2 + (j + 1) * c_dim1] = f32;
2073 c[i + 3 + (j + 1) * c_dim1] = f42;
2074 c[i + 2 + (j + 2) * c_dim1] = f33;
2075 c[i + 3 + (j + 2) * c_dim1] = f43;
2076 c[i + 2 + (j + 3) * c_dim1] = f34;
2077 c[i + 3 + (j + 3) * c_dim1] = f44;
2078 }
2079 if (uisec < isec)
2080 {
2081 i5 = ii + isec - 1;
2082 for (i = ii + uisec; i <= i5; ++i)
2083 {
2084 f11 = c[i + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f13 = c[i + (j + 2) * c_dim1];
2087 f14 = c[i + (j + 3) * c_dim1];
2088 i6 = ll + lsec - 1;
2089 for (l = ll; l <= i6; ++l)
2090 {
2091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092 257] * b[l + j * b_dim1];
2093 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094 257] * b[l + (j + 1) * b_dim1];
2095 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096 257] * b[l + (j + 2) * b_dim1];
2097 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098 257] * b[l + (j + 3) * b_dim1];
2099 }
2100 c[i + j * c_dim1] = f11;
2101 c[i + (j + 1) * c_dim1] = f12;
2102 c[i + (j + 2) * c_dim1] = f13;
2103 c[i + (j + 3) * c_dim1] = f14;
2104 }
2105 }
2106 }
2107 if (ujsec < jsec)
2108 {
2109 i4 = jj + jsec - 1;
2110 for (j = jj + ujsec; j <= i4; ++j)
2111 {
2112 i5 = ii + uisec - 1;
2113 for (i = ii; i <= i5; i += 4)
2114 {
2115 f11 = c[i + j * c_dim1];
2116 f21 = c[i + 1 + j * c_dim1];
2117 f31 = c[i + 2 + j * c_dim1];
2118 f41 = c[i + 3 + j * c_dim1];
2119 i6 = ll + lsec - 1;
2120 for (l = ll; l <= i6; ++l)
2121 {
2122 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123 257] * b[l + j * b_dim1];
2124 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125 257] * b[l + j * b_dim1];
2126 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127 257] * b[l + j * b_dim1];
2128 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129 257] * b[l + j * b_dim1];
2130 }
2131 c[i + j * c_dim1] = f11;
2132 c[i + 1 + j * c_dim1] = f21;
2133 c[i + 2 + j * c_dim1] = f31;
2134 c[i + 3 + j * c_dim1] = f41;
2135 }
2136 i5 = ii + isec - 1;
2137 for (i = ii + uisec; i <= i5; ++i)
2138 {
2139 f11 = c[i + j * c_dim1];
2140 i6 = ll + lsec - 1;
2141 for (l = ll; l <= i6; ++l)
2142 {
2143 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144 257] * b[l + j * b_dim1];
2145 }
2146 c[i + j * c_dim1] = f11;
2147 }
2148 }
2149 }
2150 }
2151 }
2152 }
2153 return;
2154 }
2155 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2156 {
2157 if (GFC_DESCRIPTOR_RANK (a) != 1)
2158 {
2159 const GFC_REAL_10 *restrict abase_x;
2160 const GFC_REAL_10 *restrict bbase_y;
2161 GFC_REAL_10 *restrict dest_y;
2162 GFC_REAL_10 s;
2163
2164 for (y = 0; y < ycount; y++)
2165 {
2166 bbase_y = &bbase[y*bystride];
2167 dest_y = &dest[y*rystride];
2168 for (x = 0; x < xcount; x++)
2169 {
2170 abase_x = &abase[x*axstride];
2171 s = (GFC_REAL_10) 0;
2172 for (n = 0; n < count; n++)
2173 s += abase_x[n] * bbase_y[n];
2174 dest_y[x] = s;
2175 }
2176 }
2177 }
2178 else
2179 {
2180 const GFC_REAL_10 *restrict bbase_y;
2181 GFC_REAL_10 s;
2182
2183 for (y = 0; y < ycount; y++)
2184 {
2185 bbase_y = &bbase[y*bystride];
2186 s = (GFC_REAL_10) 0;
2187 for (n = 0; n < count; n++)
2188 s += abase[n*axstride] * bbase_y[n];
2189 dest[y*rystride] = s;
2190 }
2191 }
2192 }
2193 else if (axstride < aystride)
2194 {
2195 for (y = 0; y < ycount; y++)
2196 for (x = 0; x < xcount; x++)
2197 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
2198
2199 for (y = 0; y < ycount; y++)
2200 for (n = 0; n < count; n++)
2201 for (x = 0; x < xcount; x++)
2202 /* dest[x,y] += a[x,n] * b[n,y] */
2203 dest[x*rxstride + y*rystride] +=
2204 abase[x*axstride + n*aystride] *
2205 bbase[n*bxstride + y*bystride];
2206 }
2207 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2208 {
2209 const GFC_REAL_10 *restrict bbase_y;
2210 GFC_REAL_10 s;
2211
2212 for (y = 0; y < ycount; y++)
2213 {
2214 bbase_y = &bbase[y*bystride];
2215 s = (GFC_REAL_10) 0;
2216 for (n = 0; n < count; n++)
2217 s += abase[n*axstride] * bbase_y[n*bxstride];
2218 dest[y*rxstride] = s;
2219 }
2220 }
2221 else
2222 {
2223 const GFC_REAL_10 *restrict abase_x;
2224 const GFC_REAL_10 *restrict bbase_y;
2225 GFC_REAL_10 *restrict dest_y;
2226 GFC_REAL_10 s;
2227
2228 for (y = 0; y < ycount; y++)
2229 {
2230 bbase_y = &bbase[y*bystride];
2231 dest_y = &dest[y*rystride];
2232 for (x = 0; x < xcount; x++)
2233 {
2234 abase_x = &abase[x*axstride];
2235 s = (GFC_REAL_10) 0;
2236 for (n = 0; n < count; n++)
2237 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238 dest_y[x*rxstride] = s;
2239 }
2240 }
2241 }
2242}
2243#undef POW3
2244#undef min
2245#undef max
2246
2247
2248/* Compiling main function, with selection code for the processor. */
2249
2250/* Currently, this is i386 only. Adjust for other architectures. */
2251
2252#include <config/i386/cpuinfo.h>
2253void matmul_r10 (gfc_array_r10 * const restrict retarray,
2254 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
2255 int blas_limit, blas_call gemm)
2256{
2257 static void (*matmul_p) (gfc_array_r10 * const restrict retarray,
2258 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
2259 int blas_limit, blas_call gemm) = NULL;
2260
2261 if (matmul_p == NULL)
2262 {
2263 matmul_p = matmul_r10_vanilla;
2264 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2265 {
2266 /* Run down the available processors in order of preference. */
2267#ifdef HAVE_AVX512F
2268 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2269 {
2270 matmul_p = matmul_r10_avx512f;
2271 goto tailcall;
2272 }
2273
2274#endif /* HAVE_AVX512F */
2275
2276#ifdef HAVE_AVX2
6d03bdcc
TK
2277 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2278 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
31cfd832
TK
2279 {
2280 matmul_p = matmul_r10_avx2;
2281 goto tailcall;
2282 }
2283
2284#endif
2285
2286#ifdef HAVE_AVX
2287 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2288 {
2289 matmul_p = matmul_r10_avx;
2290 goto tailcall;
2291 }
2292#endif /* HAVE_AVX */
2293 }
2294 }
2295
2296tailcall:
2297 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2298}
2299
2300#else /* Just the vanilla function. */
2301
644cb69f 2302void
85206901 2303matmul_r10 (gfc_array_r10 * const restrict retarray,
5a0aad31
FXC
2304 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
2305 int blas_limit, blas_call gemm)
644cb69f 2306{
85206901
JB
2307 const GFC_REAL_10 * restrict abase;
2308 const GFC_REAL_10 * restrict bbase;
2309 GFC_REAL_10 * restrict dest;
644cb69f
FXC
2310
2311 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2312 index_type x, y, n, count, xcount, ycount;
2313
2314 assert (GFC_DESCRIPTOR_RANK (a) == 2
2315 || GFC_DESCRIPTOR_RANK (b) == 2);
2316
2317/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2318
2319 Either A or B (but not both) can be rank 1:
2320
2321 o One-dimensional argument A is implicitly treated as a row matrix
2322 dimensioned [1,count], so xcount=1.
2323
2324 o One-dimensional argument B is implicitly treated as a column matrix
2325 dimensioned [count, 1], so ycount=1.
5d70ab07 2326*/
644cb69f 2327
21d1335b 2328 if (retarray->base_addr == NULL)
644cb69f
FXC
2329 {
2330 if (GFC_DESCRIPTOR_RANK (a) == 1)
2331 {
dfb55fdc
TK
2332 GFC_DIMENSION_SET(retarray->dim[0], 0,
2333 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
644cb69f
FXC
2334 }
2335 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2336 {
dfb55fdc
TK
2337 GFC_DIMENSION_SET(retarray->dim[0], 0,
2338 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
644cb69f
FXC
2339 }
2340 else
2341 {
dfb55fdc
TK
2342 GFC_DIMENSION_SET(retarray->dim[0], 0,
2343 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
644cb69f 2344
dfb55fdc
TK
2345 GFC_DIMENSION_SET(retarray->dim[1], 0,
2346 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2347 GFC_DESCRIPTOR_EXTENT(retarray,0));
644cb69f
FXC
2348 }
2349
21d1335b 2350 retarray->base_addr
92e6f3a4 2351 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
644cb69f
FXC
2352 retarray->offset = 0;
2353 }
5d70ab07
JD
2354 else if (unlikely (compile_options.bounds_check))
2355 {
2356 index_type ret_extent, arg_extent;
2357
2358 if (GFC_DESCRIPTOR_RANK (a) == 1)
2359 {
2360 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2361 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2362 if (arg_extent != ret_extent)
2363 runtime_error ("Incorrect extent in return array in"
2364 " MATMUL intrinsic: is %ld, should be %ld",
2365 (long int) ret_extent, (long int) arg_extent);
2366 }
2367 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2368 {
2369 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2370 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2371 if (arg_extent != ret_extent)
2372 runtime_error ("Incorrect extent in return array in"
2373 " MATMUL intrinsic: is %ld, should be %ld",
2374 (long int) ret_extent, (long int) arg_extent);
2375 }
2376 else
2377 {
2378 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2379 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2380 if (arg_extent != ret_extent)
2381 runtime_error ("Incorrect extent in return array in"
2382 " MATMUL intrinsic for dimension 1:"
2383 " is %ld, should be %ld",
2384 (long int) ret_extent, (long int) arg_extent);
2385
2386 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2387 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2388 if (arg_extent != ret_extent)
2389 runtime_error ("Incorrect extent in return array in"
2390 " MATMUL intrinsic for dimension 2:"
2391 " is %ld, should be %ld",
2392 (long int) ret_extent, (long int) arg_extent);
2393 }
2394 }
644cb69f 2395
644cb69f
FXC
2396
2397 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2398 {
2399 /* One-dimensional result may be addressed in the code below
2400 either as a row or a column matrix. We want both cases to
2401 work. */
dfb55fdc 2402 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
644cb69f
FXC
2403 }
2404 else
2405 {
dfb55fdc
TK
2406 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2407 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
644cb69f
FXC
2408 }
2409
2410
2411 if (GFC_DESCRIPTOR_RANK (a) == 1)
2412 {
2413 /* Treat it as a a row matrix A[1,count]. */
dfb55fdc 2414 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
644cb69f
FXC
2415 aystride = 1;
2416
2417 xcount = 1;
dfb55fdc 2418 count = GFC_DESCRIPTOR_EXTENT(a,0);
644cb69f
FXC
2419 }
2420 else
2421 {
dfb55fdc
TK
2422 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2423 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
644cb69f 2424
dfb55fdc
TK
2425 count = GFC_DESCRIPTOR_EXTENT(a,1);
2426 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
644cb69f
FXC
2427 }
2428
dfb55fdc 2429 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4 2430 {
dfb55fdc 2431 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4
TK
2432 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2433 }
644cb69f
FXC
2434
2435 if (GFC_DESCRIPTOR_RANK (b) == 1)
2436 {
2437 /* Treat it as a column matrix B[count,1] */
dfb55fdc 2438 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
644cb69f
FXC
2439
2440 /* bystride should never be used for 1-dimensional b.
2441 in case it is we want it to cause a segfault, rather than
2442 an incorrect result. */
2443 bystride = 0xDEADBEEF;
2444 ycount = 1;
2445 }
2446 else
2447 {
dfb55fdc
TK
2448 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2449 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2450 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
644cb69f
FXC
2451 }
2452
21d1335b
TB
2453 abase = a->base_addr;
2454 bbase = b->base_addr;
2455 dest = retarray->base_addr;
644cb69f 2456
5d70ab07 2457 /* Now that everything is set up, we perform the multiplication
5a0aad31
FXC
2458 itself. */
2459
2460#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07
JD
2461#define min(a,b) ((a) <= (b) ? (a) : (b))
2462#define max(a,b) ((a) >= (b) ? (a) : (b))
5a0aad31
FXC
2463
2464 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2465 && (bxstride == 1 || bystride == 1)
2466 && (((float) xcount) * ((float) ycount) * ((float) count)
2467 > POW3(blas_limit)))
644cb69f 2468 {
5d70ab07
JD
2469 const int m = xcount, n = ycount, k = count, ldc = rystride;
2470 const GFC_REAL_10 one = 1, zero = 0;
2471 const int lda = (axstride == 1) ? aystride : axstride,
2472 ldb = (bxstride == 1) ? bystride : bxstride;
644cb69f 2473
5d70ab07 2474 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
644cb69f 2475 {
5d70ab07
JD
2476 assert (gemm != NULL);
2477 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2478 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2479 &ldc, 1, 1);
2480 return;
644cb69f 2481 }
5d70ab07 2482 }
644cb69f 2483
5d70ab07
JD
2484 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2485 {
2486 /* This block of code implements a tuned matmul, derived from
2487 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2488
2489 Bo Kagstrom and Per Ling
2490 Department of Computing Science
2491 Umea University
2492 S-901 87 Umea, Sweden
2493
2494 from netlib.org, translated to C, and modified for matmul.m4. */
2495
2496 const GFC_REAL_10 *a, *b;
2497 GFC_REAL_10 *c;
2498 const index_type m = xcount, n = ycount, k = count;
2499
2500 /* System generated locals */
2501 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2502 i1, i2, i3, i4, i5, i6;
2503
2504 /* Local variables */
2505 GFC_REAL_10 t1[65536], /* was [256][256] */
2506 f11, f12, f21, f22, f31, f32, f41, f42,
2507 f13, f14, f23, f24, f33, f34, f43, f44;
2508 index_type i, j, l, ii, jj, ll;
2509 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2510
2511 a = abase;
2512 b = bbase;
2513 c = retarray->base_addr;
2514
2515 /* Parameter adjustments */
2516 c_dim1 = rystride;
2517 c_offset = 1 + c_dim1;
2518 c -= c_offset;
2519 a_dim1 = aystride;
2520 a_offset = 1 + a_dim1;
2521 a -= a_offset;
2522 b_dim1 = bystride;
2523 b_offset = 1 + b_dim1;
2524 b -= b_offset;
2525
2526 /* Early exit if possible */
2527 if (m == 0 || n == 0 || k == 0)
2528 return;
2529
2530 /* Empty c first. */
2531 for (j=1; j<=n; j++)
2532 for (i=1; i<=m; i++)
2533 c[i + j * c_dim1] = (GFC_REAL_10)0;
2534
2535 /* Start turning the crank. */
2536 i1 = n;
2537 for (jj = 1; jj <= i1; jj += 512)
644cb69f 2538 {
5d70ab07
JD
2539 /* Computing MIN */
2540 i2 = 512;
2541 i3 = n - jj + 1;
2542 jsec = min(i2,i3);
2543 ujsec = jsec - jsec % 4;
2544 i2 = k;
2545 for (ll = 1; ll <= i2; ll += 256)
644cb69f 2546 {
5d70ab07
JD
2547 /* Computing MIN */
2548 i3 = 256;
2549 i4 = k - ll + 1;
2550 lsec = min(i3,i4);
2551 ulsec = lsec - lsec % 2;
2552
2553 i3 = m;
2554 for (ii = 1; ii <= i3; ii += 256)
644cb69f 2555 {
5d70ab07
JD
2556 /* Computing MIN */
2557 i4 = 256;
2558 i5 = m - ii + 1;
2559 isec = min(i4,i5);
2560 uisec = isec - isec % 2;
2561 i4 = ll + ulsec - 1;
2562 for (l = ll; l <= i4; l += 2)
2563 {
2564 i5 = ii + uisec - 1;
2565 for (i = ii; i <= i5; i += 2)
2566 {
2567 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2568 a[i + l * a_dim1];
2569 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2570 a[i + (l + 1) * a_dim1];
2571 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2572 a[i + 1 + l * a_dim1];
2573 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2574 a[i + 1 + (l + 1) * a_dim1];
2575 }
2576 if (uisec < isec)
2577 {
2578 t1[l - ll + 1 + (isec << 8) - 257] =
2579 a[ii + isec - 1 + l * a_dim1];
2580 t1[l - ll + 2 + (isec << 8) - 257] =
2581 a[ii + isec - 1 + (l + 1) * a_dim1];
2582 }
2583 }
2584 if (ulsec < lsec)
2585 {
2586 i4 = ii + isec - 1;
2587 for (i = ii; i<= i4; ++i)
2588 {
2589 t1[lsec + ((i - ii + 1) << 8) - 257] =
2590 a[i + (ll + lsec - 1) * a_dim1];
2591 }
2592 }
2593
2594 uisec = isec - isec % 4;
2595 i4 = jj + ujsec - 1;
2596 for (j = jj; j <= i4; j += 4)
2597 {
2598 i5 = ii + uisec - 1;
2599 for (i = ii; i <= i5; i += 4)
2600 {
2601 f11 = c[i + j * c_dim1];
2602 f21 = c[i + 1 + j * c_dim1];
2603 f12 = c[i + (j + 1) * c_dim1];
2604 f22 = c[i + 1 + (j + 1) * c_dim1];
2605 f13 = c[i + (j + 2) * c_dim1];
2606 f23 = c[i + 1 + (j + 2) * c_dim1];
2607 f14 = c[i + (j + 3) * c_dim1];
2608 f24 = c[i + 1 + (j + 3) * c_dim1];
2609 f31 = c[i + 2 + j * c_dim1];
2610 f41 = c[i + 3 + j * c_dim1];
2611 f32 = c[i + 2 + (j + 1) * c_dim1];
2612 f42 = c[i + 3 + (j + 1) * c_dim1];
2613 f33 = c[i + 2 + (j + 2) * c_dim1];
2614 f43 = c[i + 3 + (j + 2) * c_dim1];
2615 f34 = c[i + 2 + (j + 3) * c_dim1];
2616 f44 = c[i + 3 + (j + 3) * c_dim1];
2617 i6 = ll + lsec - 1;
2618 for (l = ll; l <= i6; ++l)
2619 {
2620 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2621 * b[l + j * b_dim1];
2622 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2623 * b[l + j * b_dim1];
2624 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2625 * b[l + (j + 1) * b_dim1];
2626 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2627 * b[l + (j + 1) * b_dim1];
2628 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2629 * b[l + (j + 2) * b_dim1];
2630 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2631 * b[l + (j + 2) * b_dim1];
2632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2633 * b[l + (j + 3) * b_dim1];
2634 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2635 * b[l + (j + 3) * b_dim1];
2636 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2637 * b[l + j * b_dim1];
2638 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2639 * b[l + j * b_dim1];
2640 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2641 * b[l + (j + 1) * b_dim1];
2642 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2643 * b[l + (j + 1) * b_dim1];
2644 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2645 * b[l + (j + 2) * b_dim1];
2646 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2647 * b[l + (j + 2) * b_dim1];
2648 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2649 * b[l + (j + 3) * b_dim1];
2650 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2651 * b[l + (j + 3) * b_dim1];
2652 }
2653 c[i + j * c_dim1] = f11;
2654 c[i + 1 + j * c_dim1] = f21;
2655 c[i + (j + 1) * c_dim1] = f12;
2656 c[i + 1 + (j + 1) * c_dim1] = f22;
2657 c[i + (j + 2) * c_dim1] = f13;
2658 c[i + 1 + (j + 2) * c_dim1] = f23;
2659 c[i + (j + 3) * c_dim1] = f14;
2660 c[i + 1 + (j + 3) * c_dim1] = f24;
2661 c[i + 2 + j * c_dim1] = f31;
2662 c[i + 3 + j * c_dim1] = f41;
2663 c[i + 2 + (j + 1) * c_dim1] = f32;
2664 c[i + 3 + (j + 1) * c_dim1] = f42;
2665 c[i + 2 + (j + 2) * c_dim1] = f33;
2666 c[i + 3 + (j + 2) * c_dim1] = f43;
2667 c[i + 2 + (j + 3) * c_dim1] = f34;
2668 c[i + 3 + (j + 3) * c_dim1] = f44;
2669 }
2670 if (uisec < isec)
2671 {
2672 i5 = ii + isec - 1;
2673 for (i = ii + uisec; i <= i5; ++i)
2674 {
2675 f11 = c[i + j * c_dim1];
2676 f12 = c[i + (j + 1) * c_dim1];
2677 f13 = c[i + (j + 2) * c_dim1];
2678 f14 = c[i + (j + 3) * c_dim1];
2679 i6 = ll + lsec - 1;
2680 for (l = ll; l <= i6; ++l)
2681 {
2682 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2683 257] * b[l + j * b_dim1];
2684 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685 257] * b[l + (j + 1) * b_dim1];
2686 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687 257] * b[l + (j + 2) * b_dim1];
2688 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + (j + 3) * b_dim1];
2690 }
2691 c[i + j * c_dim1] = f11;
2692 c[i + (j + 1) * c_dim1] = f12;
2693 c[i + (j + 2) * c_dim1] = f13;
2694 c[i + (j + 3) * c_dim1] = f14;
2695 }
2696 }
2697 }
2698 if (ujsec < jsec)
2699 {
2700 i4 = jj + jsec - 1;
2701 for (j = jj + ujsec; j <= i4; ++j)
2702 {
2703 i5 = ii + uisec - 1;
2704 for (i = ii; i <= i5; i += 4)
2705 {
2706 f11 = c[i + j * c_dim1];
2707 f21 = c[i + 1 + j * c_dim1];
2708 f31 = c[i + 2 + j * c_dim1];
2709 f41 = c[i + 3 + j * c_dim1];
2710 i6 = ll + lsec - 1;
2711 for (l = ll; l <= i6; ++l)
2712 {
2713 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2714 257] * b[l + j * b_dim1];
2715 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2716 257] * b[l + j * b_dim1];
2717 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2718 257] * b[l + j * b_dim1];
2719 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2720 257] * b[l + j * b_dim1];
2721 }
2722 c[i + j * c_dim1] = f11;
2723 c[i + 1 + j * c_dim1] = f21;
2724 c[i + 2 + j * c_dim1] = f31;
2725 c[i + 3 + j * c_dim1] = f41;
2726 }
2727 i5 = ii + isec - 1;
2728 for (i = ii + uisec; i <= i5; ++i)
2729 {
2730 f11 = c[i + j * c_dim1];
2731 i6 = ll + lsec - 1;
2732 for (l = ll; l <= i6; ++l)
2733 {
2734 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2735 257] * b[l + j * b_dim1];
2736 }
2737 c[i + j * c_dim1] = f11;
2738 }
2739 }
2740 }
644cb69f
FXC
2741 }
2742 }
2743 }
5d70ab07 2744 return;
644cb69f 2745 }
1524f80b
RS
2746 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2747 {
a4a11197
PT
2748 if (GFC_DESCRIPTOR_RANK (a) != 1)
2749 {
2750 const GFC_REAL_10 *restrict abase_x;
2751 const GFC_REAL_10 *restrict bbase_y;
2752 GFC_REAL_10 *restrict dest_y;
2753 GFC_REAL_10 s;
1524f80b 2754
a4a11197
PT
2755 for (y = 0; y < ycount; y++)
2756 {
2757 bbase_y = &bbase[y*bystride];
2758 dest_y = &dest[y*rystride];
2759 for (x = 0; x < xcount; x++)
2760 {
2761 abase_x = &abase[x*axstride];
2762 s = (GFC_REAL_10) 0;
2763 for (n = 0; n < count; n++)
2764 s += abase_x[n] * bbase_y[n];
2765 dest_y[x] = s;
2766 }
2767 }
2768 }
2769 else
1524f80b 2770 {
a4a11197
PT
2771 const GFC_REAL_10 *restrict bbase_y;
2772 GFC_REAL_10 s;
2773
2774 for (y = 0; y < ycount; y++)
1524f80b 2775 {
a4a11197 2776 bbase_y = &bbase[y*bystride];
1524f80b
RS
2777 s = (GFC_REAL_10) 0;
2778 for (n = 0; n < count; n++)
a4a11197
PT
2779 s += abase[n*axstride] * bbase_y[n];
2780 dest[y*rystride] = s;
1524f80b
RS
2781 }
2782 }
2783 }
2784 else if (axstride < aystride)
644cb69f
FXC
2785 {
2786 for (y = 0; y < ycount; y++)
2787 for (x = 0; x < xcount; x++)
2788 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
2789
2790 for (y = 0; y < ycount; y++)
2791 for (n = 0; n < count; n++)
2792 for (x = 0; x < xcount; x++)
2793 /* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07
JD
2794 dest[x*rxstride + y*rystride] +=
2795 abase[x*axstride + n*aystride] *
2796 bbase[n*bxstride + y*bystride];
644cb69f 2797 }
f0e871d6
PT
2798 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2799 {
2800 const GFC_REAL_10 *restrict bbase_y;
2801 GFC_REAL_10 s;
2802
2803 for (y = 0; y < ycount; y++)
2804 {
2805 bbase_y = &bbase[y*bystride];
2806 s = (GFC_REAL_10) 0;
2807 for (n = 0; n < count; n++)
2808 s += abase[n*axstride] * bbase_y[n*bxstride];
2809 dest[y*rxstride] = s;
2810 }
2811 }
1524f80b
RS
2812 else
2813 {
2814 const GFC_REAL_10 *restrict abase_x;
2815 const GFC_REAL_10 *restrict bbase_y;
2816 GFC_REAL_10 *restrict dest_y;
2817 GFC_REAL_10 s;
2818
2819 for (y = 0; y < ycount; y++)
2820 {
2821 bbase_y = &bbase[y*bystride];
2822 dest_y = &dest[y*rystride];
2823 for (x = 0; x < xcount; x++)
2824 {
2825 abase_x = &abase[x*axstride];
2826 s = (GFC_REAL_10) 0;
2827 for (n = 0; n < count; n++)
2828 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2829 dest_y[x*rxstride] = s;
2830 }
2831 }
2832 }
644cb69f 2833}
31cfd832
TK
2834#undef POW3
2835#undef min
2836#undef max
2837
644cb69f 2838#endif
31cfd832
TK
2839#endif
2840