]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmul_r16.c
gthr-win32.h: Define NOGDI before windows.h include to prevent w32api CC_NONE macro...
[thirdparty/gcc.git] / libgfortran / generated / matmul_r16.c
CommitLineData
644cb69f 1/* Implementation of the MATMUL intrinsic
cbe34bb5 2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
644cb69f
FXC
3 Contributed by Paul Brook <paul@nowt.org>
4
21d1335b 5This file is part of the GNU Fortran runtime library (libgfortran).
644cb69f
FXC
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
748086b7 10version 3 of the License, or (at your option) any later version.
644cb69f
FXC
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
748086b7
JJ
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
644cb69f 25
36ae8a61 26#include "libgfortran.h"
644cb69f
FXC
27#include <string.h>
28#include <assert.h>
36ae8a61 29
644cb69f
FXC
30
31#if defined (HAVE_GFC_REAL_16)
32
5a0aad31 33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07 34 passed to us by the front-end, in which case we call it for large
5a0aad31
FXC
35 matrices. */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
39 const int *, const GFC_REAL_16 *, const int *,
40 const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
41 int, int);
42
1524f80b
RS
43/* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
644cb69f
FXC
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
1524f80b
RS
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
644cb69f 60 DO I=1,M
1524f80b
RS
61 S = 0
62 DO K=1,COUNT
5a0aad31 63 S = S+A(I,K)*B(K,J)
1524f80b
RS
64 C(I,J) = S
65 ENDIF
644cb69f
FXC
66*/
67
5a0aad31
FXC
68/* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
85206901 72extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
5a0aad31
FXC
73 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
644cb69f
FXC
75export_proto(matmul_r16);
76
31cfd832
TK
77#if defined(HAVE_AVX) && defined(HAVE_AVX2)
78/* REAL types generate identical code for AVX and AVX2. Only generate
79 an AVX2 function if we are dealing with integer. */
80#undef HAVE_AVX2
81#endif
82
83
84/* Put exhaustive list of possible architectures here here, ORed together. */
85
86#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
87
88#ifdef HAVE_AVX
89static void
90matmul_r16_avx (gfc_array_r16 * const restrict retarray,
91 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
92 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
93static void
94matmul_r16_avx (gfc_array_r16 * const restrict retarray,
95 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
96 int blas_limit, blas_call gemm)
97{
98 const GFC_REAL_16 * restrict abase;
99 const GFC_REAL_16 * restrict bbase;
100 GFC_REAL_16 * restrict dest;
101
102 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
103 index_type x, y, n, count, xcount, ycount;
104
105 assert (GFC_DESCRIPTOR_RANK (a) == 2
106 || GFC_DESCRIPTOR_RANK (b) == 2);
107
108/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
109
110 Either A or B (but not both) can be rank 1:
111
112 o One-dimensional argument A is implicitly treated as a row matrix
113 dimensioned [1,count], so xcount=1.
114
115 o One-dimensional argument B is implicitly treated as a column matrix
116 dimensioned [count, 1], so ycount=1.
117*/
118
119 if (retarray->base_addr == NULL)
120 {
121 if (GFC_DESCRIPTOR_RANK (a) == 1)
122 {
123 GFC_DIMENSION_SET(retarray->dim[0], 0,
124 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
125 }
126 else if (GFC_DESCRIPTOR_RANK (b) == 1)
127 {
128 GFC_DIMENSION_SET(retarray->dim[0], 0,
129 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
130 }
131 else
132 {
133 GFC_DIMENSION_SET(retarray->dim[0], 0,
134 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
135
136 GFC_DIMENSION_SET(retarray->dim[1], 0,
137 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
138 GFC_DESCRIPTOR_EXTENT(retarray,0));
139 }
140
141 retarray->base_addr
142 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
143 retarray->offset = 0;
144 }
145 else if (unlikely (compile_options.bounds_check))
146 {
147 index_type ret_extent, arg_extent;
148
149 if (GFC_DESCRIPTOR_RANK (a) == 1)
150 {
151 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
152 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
153 if (arg_extent != ret_extent)
154 runtime_error ("Incorrect extent in return array in"
155 " MATMUL intrinsic: is %ld, should be %ld",
156 (long int) ret_extent, (long int) arg_extent);
157 }
158 else if (GFC_DESCRIPTOR_RANK (b) == 1)
159 {
160 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
161 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
162 if (arg_extent != ret_extent)
163 runtime_error ("Incorrect extent in return array in"
164 " MATMUL intrinsic: is %ld, should be %ld",
165 (long int) ret_extent, (long int) arg_extent);
166 }
167 else
168 {
169 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
170 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
171 if (arg_extent != ret_extent)
172 runtime_error ("Incorrect extent in return array in"
173 " MATMUL intrinsic for dimension 1:"
174 " is %ld, should be %ld",
175 (long int) ret_extent, (long int) arg_extent);
176
177 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
178 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
179 if (arg_extent != ret_extent)
180 runtime_error ("Incorrect extent in return array in"
181 " MATMUL intrinsic for dimension 2:"
182 " is %ld, should be %ld",
183 (long int) ret_extent, (long int) arg_extent);
184 }
185 }
186
187
188 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
189 {
190 /* One-dimensional result may be addressed in the code below
191 either as a row or a column matrix. We want both cases to
192 work. */
193 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
194 }
195 else
196 {
197 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
198 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
199 }
200
201
202 if (GFC_DESCRIPTOR_RANK (a) == 1)
203 {
204 /* Treat it as a a row matrix A[1,count]. */
205 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
206 aystride = 1;
207
208 xcount = 1;
209 count = GFC_DESCRIPTOR_EXTENT(a,0);
210 }
211 else
212 {
213 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
214 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
215
216 count = GFC_DESCRIPTOR_EXTENT(a,1);
217 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
218 }
219
220 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
221 {
222 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
223 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
224 }
225
226 if (GFC_DESCRIPTOR_RANK (b) == 1)
227 {
228 /* Treat it as a column matrix B[count,1] */
229 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
230
231 /* bystride should never be used for 1-dimensional b.
232 in case it is we want it to cause a segfault, rather than
233 an incorrect result. */
234 bystride = 0xDEADBEEF;
235 ycount = 1;
236 }
237 else
238 {
239 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
240 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
241 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
242 }
243
244 abase = a->base_addr;
245 bbase = b->base_addr;
246 dest = retarray->base_addr;
247
248 /* Now that everything is set up, we perform the multiplication
249 itself. */
250
251#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
252#define min(a,b) ((a) <= (b) ? (a) : (b))
253#define max(a,b) ((a) >= (b) ? (a) : (b))
254
255 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
256 && (bxstride == 1 || bystride == 1)
257 && (((float) xcount) * ((float) ycount) * ((float) count)
258 > POW3(blas_limit)))
259 {
260 const int m = xcount, n = ycount, k = count, ldc = rystride;
261 const GFC_REAL_16 one = 1, zero = 0;
262 const int lda = (axstride == 1) ? aystride : axstride,
263 ldb = (bxstride == 1) ? bystride : bxstride;
264
265 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
266 {
267 assert (gemm != NULL);
268 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
269 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
270 &ldc, 1, 1);
271 return;
272 }
273 }
274
275 if (rxstride == 1 && axstride == 1 && bxstride == 1)
276 {
277 /* This block of code implements a tuned matmul, derived from
278 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
279
280 Bo Kagstrom and Per Ling
281 Department of Computing Science
282 Umea University
283 S-901 87 Umea, Sweden
284
285 from netlib.org, translated to C, and modified for matmul.m4. */
286
287 const GFC_REAL_16 *a, *b;
288 GFC_REAL_16 *c;
289 const index_type m = xcount, n = ycount, k = count;
290
291 /* System generated locals */
292 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
293 i1, i2, i3, i4, i5, i6;
294
295 /* Local variables */
296 GFC_REAL_16 t1[65536], /* was [256][256] */
297 f11, f12, f21, f22, f31, f32, f41, f42,
298 f13, f14, f23, f24, f33, f34, f43, f44;
299 index_type i, j, l, ii, jj, ll;
300 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
301
302 a = abase;
303 b = bbase;
304 c = retarray->base_addr;
305
306 /* Parameter adjustments */
307 c_dim1 = rystride;
308 c_offset = 1 + c_dim1;
309 c -= c_offset;
310 a_dim1 = aystride;
311 a_offset = 1 + a_dim1;
312 a -= a_offset;
313 b_dim1 = bystride;
314 b_offset = 1 + b_dim1;
315 b -= b_offset;
316
317 /* Early exit if possible */
318 if (m == 0 || n == 0 || k == 0)
319 return;
320
321 /* Empty c first. */
322 for (j=1; j<=n; j++)
323 for (i=1; i<=m; i++)
324 c[i + j * c_dim1] = (GFC_REAL_16)0;
325
326 /* Start turning the crank. */
327 i1 = n;
328 for (jj = 1; jj <= i1; jj += 512)
329 {
330 /* Computing MIN */
331 i2 = 512;
332 i3 = n - jj + 1;
333 jsec = min(i2,i3);
334 ujsec = jsec - jsec % 4;
335 i2 = k;
336 for (ll = 1; ll <= i2; ll += 256)
337 {
338 /* Computing MIN */
339 i3 = 256;
340 i4 = k - ll + 1;
341 lsec = min(i3,i4);
342 ulsec = lsec - lsec % 2;
343
344 i3 = m;
345 for (ii = 1; ii <= i3; ii += 256)
346 {
347 /* Computing MIN */
348 i4 = 256;
349 i5 = m - ii + 1;
350 isec = min(i4,i5);
351 uisec = isec - isec % 2;
352 i4 = ll + ulsec - 1;
353 for (l = ll; l <= i4; l += 2)
354 {
355 i5 = ii + uisec - 1;
356 for (i = ii; i <= i5; i += 2)
357 {
358 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
359 a[i + l * a_dim1];
360 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
361 a[i + (l + 1) * a_dim1];
362 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
363 a[i + 1 + l * a_dim1];
364 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
365 a[i + 1 + (l + 1) * a_dim1];
366 }
367 if (uisec < isec)
368 {
369 t1[l - ll + 1 + (isec << 8) - 257] =
370 a[ii + isec - 1 + l * a_dim1];
371 t1[l - ll + 2 + (isec << 8) - 257] =
372 a[ii + isec - 1 + (l + 1) * a_dim1];
373 }
374 }
375 if (ulsec < lsec)
376 {
377 i4 = ii + isec - 1;
378 for (i = ii; i<= i4; ++i)
379 {
380 t1[lsec + ((i - ii + 1) << 8) - 257] =
381 a[i + (ll + lsec - 1) * a_dim1];
382 }
383 }
384
385 uisec = isec - isec % 4;
386 i4 = jj + ujsec - 1;
387 for (j = jj; j <= i4; j += 4)
388 {
389 i5 = ii + uisec - 1;
390 for (i = ii; i <= i5; i += 4)
391 {
392 f11 = c[i + j * c_dim1];
393 f21 = c[i + 1 + j * c_dim1];
394 f12 = c[i + (j + 1) * c_dim1];
395 f22 = c[i + 1 + (j + 1) * c_dim1];
396 f13 = c[i + (j + 2) * c_dim1];
397 f23 = c[i + 1 + (j + 2) * c_dim1];
398 f14 = c[i + (j + 3) * c_dim1];
399 f24 = c[i + 1 + (j + 3) * c_dim1];
400 f31 = c[i + 2 + j * c_dim1];
401 f41 = c[i + 3 + j * c_dim1];
402 f32 = c[i + 2 + (j + 1) * c_dim1];
403 f42 = c[i + 3 + (j + 1) * c_dim1];
404 f33 = c[i + 2 + (j + 2) * c_dim1];
405 f43 = c[i + 3 + (j + 2) * c_dim1];
406 f34 = c[i + 2 + (j + 3) * c_dim1];
407 f44 = c[i + 3 + (j + 3) * c_dim1];
408 i6 = ll + lsec - 1;
409 for (l = ll; l <= i6; ++l)
410 {
411 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
412 * b[l + j * b_dim1];
413 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
414 * b[l + j * b_dim1];
415 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
416 * b[l + (j + 1) * b_dim1];
417 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
418 * b[l + (j + 1) * b_dim1];
419 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
420 * b[l + (j + 2) * b_dim1];
421 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
422 * b[l + (j + 2) * b_dim1];
423 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
424 * b[l + (j + 3) * b_dim1];
425 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
426 * b[l + (j + 3) * b_dim1];
427 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
428 * b[l + j * b_dim1];
429 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
430 * b[l + j * b_dim1];
431 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
432 * b[l + (j + 1) * b_dim1];
433 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
434 * b[l + (j + 1) * b_dim1];
435 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
436 * b[l + (j + 2) * b_dim1];
437 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
438 * b[l + (j + 2) * b_dim1];
439 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
440 * b[l + (j + 3) * b_dim1];
441 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
442 * b[l + (j + 3) * b_dim1];
443 }
444 c[i + j * c_dim1] = f11;
445 c[i + 1 + j * c_dim1] = f21;
446 c[i + (j + 1) * c_dim1] = f12;
447 c[i + 1 + (j + 1) * c_dim1] = f22;
448 c[i + (j + 2) * c_dim1] = f13;
449 c[i + 1 + (j + 2) * c_dim1] = f23;
450 c[i + (j + 3) * c_dim1] = f14;
451 c[i + 1 + (j + 3) * c_dim1] = f24;
452 c[i + 2 + j * c_dim1] = f31;
453 c[i + 3 + j * c_dim1] = f41;
454 c[i + 2 + (j + 1) * c_dim1] = f32;
455 c[i + 3 + (j + 1) * c_dim1] = f42;
456 c[i + 2 + (j + 2) * c_dim1] = f33;
457 c[i + 3 + (j + 2) * c_dim1] = f43;
458 c[i + 2 + (j + 3) * c_dim1] = f34;
459 c[i + 3 + (j + 3) * c_dim1] = f44;
460 }
461 if (uisec < isec)
462 {
463 i5 = ii + isec - 1;
464 for (i = ii + uisec; i <= i5; ++i)
465 {
466 f11 = c[i + j * c_dim1];
467 f12 = c[i + (j + 1) * c_dim1];
468 f13 = c[i + (j + 2) * c_dim1];
469 f14 = c[i + (j + 3) * c_dim1];
470 i6 = ll + lsec - 1;
471 for (l = ll; l <= i6; ++l)
472 {
473 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
474 257] * b[l + j * b_dim1];
475 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + (j + 1) * b_dim1];
477 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
478 257] * b[l + (j + 2) * b_dim1];
479 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + (j + 3) * b_dim1];
481 }
482 c[i + j * c_dim1] = f11;
483 c[i + (j + 1) * c_dim1] = f12;
484 c[i + (j + 2) * c_dim1] = f13;
485 c[i + (j + 3) * c_dim1] = f14;
486 }
487 }
488 }
489 if (ujsec < jsec)
490 {
491 i4 = jj + jsec - 1;
492 for (j = jj + ujsec; j <= i4; ++j)
493 {
494 i5 = ii + uisec - 1;
495 for (i = ii; i <= i5; i += 4)
496 {
497 f11 = c[i + j * c_dim1];
498 f21 = c[i + 1 + j * c_dim1];
499 f31 = c[i + 2 + j * c_dim1];
500 f41 = c[i + 3 + j * c_dim1];
501 i6 = ll + lsec - 1;
502 for (l = ll; l <= i6; ++l)
503 {
504 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
505 257] * b[l + j * b_dim1];
506 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
507 257] * b[l + j * b_dim1];
508 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
509 257] * b[l + j * b_dim1];
510 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
511 257] * b[l + j * b_dim1];
512 }
513 c[i + j * c_dim1] = f11;
514 c[i + 1 + j * c_dim1] = f21;
515 c[i + 2 + j * c_dim1] = f31;
516 c[i + 3 + j * c_dim1] = f41;
517 }
518 i5 = ii + isec - 1;
519 for (i = ii + uisec; i <= i5; ++i)
520 {
521 f11 = c[i + j * c_dim1];
522 i6 = ll + lsec - 1;
523 for (l = ll; l <= i6; ++l)
524 {
525 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
526 257] * b[l + j * b_dim1];
527 }
528 c[i + j * c_dim1] = f11;
529 }
530 }
531 }
532 }
533 }
534 }
535 return;
536 }
537 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
538 {
539 if (GFC_DESCRIPTOR_RANK (a) != 1)
540 {
541 const GFC_REAL_16 *restrict abase_x;
542 const GFC_REAL_16 *restrict bbase_y;
543 GFC_REAL_16 *restrict dest_y;
544 GFC_REAL_16 s;
545
546 for (y = 0; y < ycount; y++)
547 {
548 bbase_y = &bbase[y*bystride];
549 dest_y = &dest[y*rystride];
550 for (x = 0; x < xcount; x++)
551 {
552 abase_x = &abase[x*axstride];
553 s = (GFC_REAL_16) 0;
554 for (n = 0; n < count; n++)
555 s += abase_x[n] * bbase_y[n];
556 dest_y[x] = s;
557 }
558 }
559 }
560 else
561 {
562 const GFC_REAL_16 *restrict bbase_y;
563 GFC_REAL_16 s;
564
565 for (y = 0; y < ycount; y++)
566 {
567 bbase_y = &bbase[y*bystride];
568 s = (GFC_REAL_16) 0;
569 for (n = 0; n < count; n++)
570 s += abase[n*axstride] * bbase_y[n];
571 dest[y*rystride] = s;
572 }
573 }
574 }
575 else if (axstride < aystride)
576 {
577 for (y = 0; y < ycount; y++)
578 for (x = 0; x < xcount; x++)
579 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
580
581 for (y = 0; y < ycount; y++)
582 for (n = 0; n < count; n++)
583 for (x = 0; x < xcount; x++)
584 /* dest[x,y] += a[x,n] * b[n,y] */
585 dest[x*rxstride + y*rystride] +=
586 abase[x*axstride + n*aystride] *
587 bbase[n*bxstride + y*bystride];
588 }
589 else if (GFC_DESCRIPTOR_RANK (a) == 1)
590 {
591 const GFC_REAL_16 *restrict bbase_y;
592 GFC_REAL_16 s;
593
594 for (y = 0; y < ycount; y++)
595 {
596 bbase_y = &bbase[y*bystride];
597 s = (GFC_REAL_16) 0;
598 for (n = 0; n < count; n++)
599 s += abase[n*axstride] * bbase_y[n*bxstride];
600 dest[y*rxstride] = s;
601 }
602 }
603 else
604 {
605 const GFC_REAL_16 *restrict abase_x;
606 const GFC_REAL_16 *restrict bbase_y;
607 GFC_REAL_16 *restrict dest_y;
608 GFC_REAL_16 s;
609
610 for (y = 0; y < ycount; y++)
611 {
612 bbase_y = &bbase[y*bystride];
613 dest_y = &dest[y*rystride];
614 for (x = 0; x < xcount; x++)
615 {
616 abase_x = &abase[x*axstride];
617 s = (GFC_REAL_16) 0;
618 for (n = 0; n < count; n++)
619 s += abase_x[n*aystride] * bbase_y[n*bxstride];
620 dest_y[x*rxstride] = s;
621 }
622 }
623 }
624}
625#undef POW3
626#undef min
627#undef max
628
629#endif /* HAVE_AVX */
630
631#ifdef HAVE_AVX2
632static void
633matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
634 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
635 int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
636static void
637matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
638 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
639 int blas_limit, blas_call gemm)
640{
641 const GFC_REAL_16 * restrict abase;
642 const GFC_REAL_16 * restrict bbase;
643 GFC_REAL_16 * restrict dest;
644
645 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
646 index_type x, y, n, count, xcount, ycount;
647
648 assert (GFC_DESCRIPTOR_RANK (a) == 2
649 || GFC_DESCRIPTOR_RANK (b) == 2);
650
651/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
652
653 Either A or B (but not both) can be rank 1:
654
655 o One-dimensional argument A is implicitly treated as a row matrix
656 dimensioned [1,count], so xcount=1.
657
658 o One-dimensional argument B is implicitly treated as a column matrix
659 dimensioned [count, 1], so ycount=1.
660*/
661
662 if (retarray->base_addr == NULL)
663 {
664 if (GFC_DESCRIPTOR_RANK (a) == 1)
665 {
666 GFC_DIMENSION_SET(retarray->dim[0], 0,
667 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
668 }
669 else if (GFC_DESCRIPTOR_RANK (b) == 1)
670 {
671 GFC_DIMENSION_SET(retarray->dim[0], 0,
672 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
673 }
674 else
675 {
676 GFC_DIMENSION_SET(retarray->dim[0], 0,
677 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
678
679 GFC_DIMENSION_SET(retarray->dim[1], 0,
680 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
681 GFC_DESCRIPTOR_EXTENT(retarray,0));
682 }
683
684 retarray->base_addr
685 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
686 retarray->offset = 0;
687 }
688 else if (unlikely (compile_options.bounds_check))
689 {
690 index_type ret_extent, arg_extent;
691
692 if (GFC_DESCRIPTOR_RANK (a) == 1)
693 {
694 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
695 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
696 if (arg_extent != ret_extent)
697 runtime_error ("Incorrect extent in return array in"
698 " MATMUL intrinsic: is %ld, should be %ld",
699 (long int) ret_extent, (long int) arg_extent);
700 }
701 else if (GFC_DESCRIPTOR_RANK (b) == 1)
702 {
703 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
705 if (arg_extent != ret_extent)
706 runtime_error ("Incorrect extent in return array in"
707 " MATMUL intrinsic: is %ld, should be %ld",
708 (long int) ret_extent, (long int) arg_extent);
709 }
710 else
711 {
712 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
713 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
714 if (arg_extent != ret_extent)
715 runtime_error ("Incorrect extent in return array in"
716 " MATMUL intrinsic for dimension 1:"
717 " is %ld, should be %ld",
718 (long int) ret_extent, (long int) arg_extent);
719
720 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
721 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
722 if (arg_extent != ret_extent)
723 runtime_error ("Incorrect extent in return array in"
724 " MATMUL intrinsic for dimension 2:"
725 " is %ld, should be %ld",
726 (long int) ret_extent, (long int) arg_extent);
727 }
728 }
729
730
731 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
732 {
733 /* One-dimensional result may be addressed in the code below
734 either as a row or a column matrix. We want both cases to
735 work. */
736 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
737 }
738 else
739 {
740 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
741 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
742 }
743
744
745 if (GFC_DESCRIPTOR_RANK (a) == 1)
746 {
747 /* Treat it as a a row matrix A[1,count]. */
748 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
749 aystride = 1;
750
751 xcount = 1;
752 count = GFC_DESCRIPTOR_EXTENT(a,0);
753 }
754 else
755 {
756 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
757 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
758
759 count = GFC_DESCRIPTOR_EXTENT(a,1);
760 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
761 }
762
763 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
764 {
765 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
766 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
767 }
768
769 if (GFC_DESCRIPTOR_RANK (b) == 1)
770 {
771 /* Treat it as a column matrix B[count,1] */
772 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
773
774 /* bystride should never be used for 1-dimensional b.
775 in case it is we want it to cause a segfault, rather than
776 an incorrect result. */
777 bystride = 0xDEADBEEF;
778 ycount = 1;
779 }
780 else
781 {
782 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
783 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
784 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
785 }
786
787 abase = a->base_addr;
788 bbase = b->base_addr;
789 dest = retarray->base_addr;
790
791 /* Now that everything is set up, we perform the multiplication
792 itself. */
793
794#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
795#define min(a,b) ((a) <= (b) ? (a) : (b))
796#define max(a,b) ((a) >= (b) ? (a) : (b))
797
798 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
799 && (bxstride == 1 || bystride == 1)
800 && (((float) xcount) * ((float) ycount) * ((float) count)
801 > POW3(blas_limit)))
802 {
803 const int m = xcount, n = ycount, k = count, ldc = rystride;
804 const GFC_REAL_16 one = 1, zero = 0;
805 const int lda = (axstride == 1) ? aystride : axstride,
806 ldb = (bxstride == 1) ? bystride : bxstride;
807
808 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
809 {
810 assert (gemm != NULL);
811 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
812 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
813 &ldc, 1, 1);
814 return;
815 }
816 }
817
818 if (rxstride == 1 && axstride == 1 && bxstride == 1)
819 {
820 /* This block of code implements a tuned matmul, derived from
821 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
822
823 Bo Kagstrom and Per Ling
824 Department of Computing Science
825 Umea University
826 S-901 87 Umea, Sweden
827
828 from netlib.org, translated to C, and modified for matmul.m4. */
829
830 const GFC_REAL_16 *a, *b;
831 GFC_REAL_16 *c;
832 const index_type m = xcount, n = ycount, k = count;
833
834 /* System generated locals */
835 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
836 i1, i2, i3, i4, i5, i6;
837
838 /* Local variables */
839 GFC_REAL_16 t1[65536], /* was [256][256] */
840 f11, f12, f21, f22, f31, f32, f41, f42,
841 f13, f14, f23, f24, f33, f34, f43, f44;
842 index_type i, j, l, ii, jj, ll;
843 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
844
845 a = abase;
846 b = bbase;
847 c = retarray->base_addr;
848
849 /* Parameter adjustments */
850 c_dim1 = rystride;
851 c_offset = 1 + c_dim1;
852 c -= c_offset;
853 a_dim1 = aystride;
854 a_offset = 1 + a_dim1;
855 a -= a_offset;
856 b_dim1 = bystride;
857 b_offset = 1 + b_dim1;
858 b -= b_offset;
859
860 /* Early exit if possible */
861 if (m == 0 || n == 0 || k == 0)
862 return;
863
864 /* Empty c first. */
865 for (j=1; j<=n; j++)
866 for (i=1; i<=m; i++)
867 c[i + j * c_dim1] = (GFC_REAL_16)0;
868
869 /* Start turning the crank. */
870 i1 = n;
871 for (jj = 1; jj <= i1; jj += 512)
872 {
873 /* Computing MIN */
874 i2 = 512;
875 i3 = n - jj + 1;
876 jsec = min(i2,i3);
877 ujsec = jsec - jsec % 4;
878 i2 = k;
879 for (ll = 1; ll <= i2; ll += 256)
880 {
881 /* Computing MIN */
882 i3 = 256;
883 i4 = k - ll + 1;
884 lsec = min(i3,i4);
885 ulsec = lsec - lsec % 2;
886
887 i3 = m;
888 for (ii = 1; ii <= i3; ii += 256)
889 {
890 /* Computing MIN */
891 i4 = 256;
892 i5 = m - ii + 1;
893 isec = min(i4,i5);
894 uisec = isec - isec % 2;
895 i4 = ll + ulsec - 1;
896 for (l = ll; l <= i4; l += 2)
897 {
898 i5 = ii + uisec - 1;
899 for (i = ii; i <= i5; i += 2)
900 {
901 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
902 a[i + l * a_dim1];
903 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
904 a[i + (l + 1) * a_dim1];
905 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
906 a[i + 1 + l * a_dim1];
907 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
908 a[i + 1 + (l + 1) * a_dim1];
909 }
910 if (uisec < isec)
911 {
912 t1[l - ll + 1 + (isec << 8) - 257] =
913 a[ii + isec - 1 + l * a_dim1];
914 t1[l - ll + 2 + (isec << 8) - 257] =
915 a[ii + isec - 1 + (l + 1) * a_dim1];
916 }
917 }
918 if (ulsec < lsec)
919 {
920 i4 = ii + isec - 1;
921 for (i = ii; i<= i4; ++i)
922 {
923 t1[lsec + ((i - ii + 1) << 8) - 257] =
924 a[i + (ll + lsec - 1) * a_dim1];
925 }
926 }
927
928 uisec = isec - isec % 4;
929 i4 = jj + ujsec - 1;
930 for (j = jj; j <= i4; j += 4)
931 {
932 i5 = ii + uisec - 1;
933 for (i = ii; i <= i5; i += 4)
934 {
935 f11 = c[i + j * c_dim1];
936 f21 = c[i + 1 + j * c_dim1];
937 f12 = c[i + (j + 1) * c_dim1];
938 f22 = c[i + 1 + (j + 1) * c_dim1];
939 f13 = c[i + (j + 2) * c_dim1];
940 f23 = c[i + 1 + (j + 2) * c_dim1];
941 f14 = c[i + (j + 3) * c_dim1];
942 f24 = c[i + 1 + (j + 3) * c_dim1];
943 f31 = c[i + 2 + j * c_dim1];
944 f41 = c[i + 3 + j * c_dim1];
945 f32 = c[i + 2 + (j + 1) * c_dim1];
946 f42 = c[i + 3 + (j + 1) * c_dim1];
947 f33 = c[i + 2 + (j + 2) * c_dim1];
948 f43 = c[i + 3 + (j + 2) * c_dim1];
949 f34 = c[i + 2 + (j + 3) * c_dim1];
950 f44 = c[i + 3 + (j + 3) * c_dim1];
951 i6 = ll + lsec - 1;
952 for (l = ll; l <= i6; ++l)
953 {
954 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
955 * b[l + j * b_dim1];
956 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
957 * b[l + j * b_dim1];
958 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
959 * b[l + (j + 1) * b_dim1];
960 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
961 * b[l + (j + 1) * b_dim1];
962 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + (j + 2) * b_dim1];
964 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + (j + 2) * b_dim1];
966 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
967 * b[l + (j + 3) * b_dim1];
968 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
969 * b[l + (j + 3) * b_dim1];
970 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
971 * b[l + j * b_dim1];
972 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
973 * b[l + j * b_dim1];
974 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
975 * b[l + (j + 1) * b_dim1];
976 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
977 * b[l + (j + 1) * b_dim1];
978 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + (j + 2) * b_dim1];
980 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + (j + 2) * b_dim1];
982 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
983 * b[l + (j + 3) * b_dim1];
984 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
985 * b[l + (j + 3) * b_dim1];
986 }
987 c[i + j * c_dim1] = f11;
988 c[i + 1 + j * c_dim1] = f21;
989 c[i + (j + 1) * c_dim1] = f12;
990 c[i + 1 + (j + 1) * c_dim1] = f22;
991 c[i + (j + 2) * c_dim1] = f13;
992 c[i + 1 + (j + 2) * c_dim1] = f23;
993 c[i + (j + 3) * c_dim1] = f14;
994 c[i + 1 + (j + 3) * c_dim1] = f24;
995 c[i + 2 + j * c_dim1] = f31;
996 c[i + 3 + j * c_dim1] = f41;
997 c[i + 2 + (j + 1) * c_dim1] = f32;
998 c[i + 3 + (j + 1) * c_dim1] = f42;
999 c[i + 2 + (j + 2) * c_dim1] = f33;
1000 c[i + 3 + (j + 2) * c_dim1] = f43;
1001 c[i + 2 + (j + 3) * c_dim1] = f34;
1002 c[i + 3 + (j + 3) * c_dim1] = f44;
1003 }
1004 if (uisec < isec)
1005 {
1006 i5 = ii + isec - 1;
1007 for (i = ii + uisec; i <= i5; ++i)
1008 {
1009 f11 = c[i + j * c_dim1];
1010 f12 = c[i + (j + 1) * c_dim1];
1011 f13 = c[i + (j + 2) * c_dim1];
1012 f14 = c[i + (j + 3) * c_dim1];
1013 i6 = ll + lsec - 1;
1014 for (l = ll; l <= i6; ++l)
1015 {
1016 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1017 257] * b[l + j * b_dim1];
1018 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1019 257] * b[l + (j + 1) * b_dim1];
1020 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1021 257] * b[l + (j + 2) * b_dim1];
1022 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1023 257] * b[l + (j + 3) * b_dim1];
1024 }
1025 c[i + j * c_dim1] = f11;
1026 c[i + (j + 1) * c_dim1] = f12;
1027 c[i + (j + 2) * c_dim1] = f13;
1028 c[i + (j + 3) * c_dim1] = f14;
1029 }
1030 }
1031 }
1032 if (ujsec < jsec)
1033 {
1034 i4 = jj + jsec - 1;
1035 for (j = jj + ujsec; j <= i4; ++j)
1036 {
1037 i5 = ii + uisec - 1;
1038 for (i = ii; i <= i5; i += 4)
1039 {
1040 f11 = c[i + j * c_dim1];
1041 f21 = c[i + 1 + j * c_dim1];
1042 f31 = c[i + 2 + j * c_dim1];
1043 f41 = c[i + 3 + j * c_dim1];
1044 i6 = ll + lsec - 1;
1045 for (l = ll; l <= i6; ++l)
1046 {
1047 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1048 257] * b[l + j * b_dim1];
1049 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1050 257] * b[l + j * b_dim1];
1051 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1052 257] * b[l + j * b_dim1];
1053 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1054 257] * b[l + j * b_dim1];
1055 }
1056 c[i + j * c_dim1] = f11;
1057 c[i + 1 + j * c_dim1] = f21;
1058 c[i + 2 + j * c_dim1] = f31;
1059 c[i + 3 + j * c_dim1] = f41;
1060 }
1061 i5 = ii + isec - 1;
1062 for (i = ii + uisec; i <= i5; ++i)
1063 {
1064 f11 = c[i + j * c_dim1];
1065 i6 = ll + lsec - 1;
1066 for (l = ll; l <= i6; ++l)
1067 {
1068 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1069 257] * b[l + j * b_dim1];
1070 }
1071 c[i + j * c_dim1] = f11;
1072 }
1073 }
1074 }
1075 }
1076 }
1077 }
1078 return;
1079 }
1080 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1081 {
1082 if (GFC_DESCRIPTOR_RANK (a) != 1)
1083 {
1084 const GFC_REAL_16 *restrict abase_x;
1085 const GFC_REAL_16 *restrict bbase_y;
1086 GFC_REAL_16 *restrict dest_y;
1087 GFC_REAL_16 s;
1088
1089 for (y = 0; y < ycount; y++)
1090 {
1091 bbase_y = &bbase[y*bystride];
1092 dest_y = &dest[y*rystride];
1093 for (x = 0; x < xcount; x++)
1094 {
1095 abase_x = &abase[x*axstride];
1096 s = (GFC_REAL_16) 0;
1097 for (n = 0; n < count; n++)
1098 s += abase_x[n] * bbase_y[n];
1099 dest_y[x] = s;
1100 }
1101 }
1102 }
1103 else
1104 {
1105 const GFC_REAL_16 *restrict bbase_y;
1106 GFC_REAL_16 s;
1107
1108 for (y = 0; y < ycount; y++)
1109 {
1110 bbase_y = &bbase[y*bystride];
1111 s = (GFC_REAL_16) 0;
1112 for (n = 0; n < count; n++)
1113 s += abase[n*axstride] * bbase_y[n];
1114 dest[y*rystride] = s;
1115 }
1116 }
1117 }
1118 else if (axstride < aystride)
1119 {
1120 for (y = 0; y < ycount; y++)
1121 for (x = 0; x < xcount; x++)
1122 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1123
1124 for (y = 0; y < ycount; y++)
1125 for (n = 0; n < count; n++)
1126 for (x = 0; x < xcount; x++)
1127 /* dest[x,y] += a[x,n] * b[n,y] */
1128 dest[x*rxstride + y*rystride] +=
1129 abase[x*axstride + n*aystride] *
1130 bbase[n*bxstride + y*bystride];
1131 }
1132 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1133 {
1134 const GFC_REAL_16 *restrict bbase_y;
1135 GFC_REAL_16 s;
1136
1137 for (y = 0; y < ycount; y++)
1138 {
1139 bbase_y = &bbase[y*bystride];
1140 s = (GFC_REAL_16) 0;
1141 for (n = 0; n < count; n++)
1142 s += abase[n*axstride] * bbase_y[n*bxstride];
1143 dest[y*rxstride] = s;
1144 }
1145 }
1146 else
1147 {
1148 const GFC_REAL_16 *restrict abase_x;
1149 const GFC_REAL_16 *restrict bbase_y;
1150 GFC_REAL_16 *restrict dest_y;
1151 GFC_REAL_16 s;
1152
1153 for (y = 0; y < ycount; y++)
1154 {
1155 bbase_y = &bbase[y*bystride];
1156 dest_y = &dest[y*rystride];
1157 for (x = 0; x < xcount; x++)
1158 {
1159 abase_x = &abase[x*axstride];
1160 s = (GFC_REAL_16) 0;
1161 for (n = 0; n < count; n++)
1162 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1163 dest_y[x*rxstride] = s;
1164 }
1165 }
1166 }
1167}
1168#undef POW3
1169#undef min
1170#undef max
1171
1172#endif /* HAVE_AVX2 */
1173
1174#ifdef HAVE_AVX512F
1175static void
1176matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1177 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1178 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1179static void
1180matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1181 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1182 int blas_limit, blas_call gemm)
1183{
1184 const GFC_REAL_16 * restrict abase;
1185 const GFC_REAL_16 * restrict bbase;
1186 GFC_REAL_16 * restrict dest;
1187
1188 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1189 index_type x, y, n, count, xcount, ycount;
1190
1191 assert (GFC_DESCRIPTOR_RANK (a) == 2
1192 || GFC_DESCRIPTOR_RANK (b) == 2);
1193
1194/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1195
1196 Either A or B (but not both) can be rank 1:
1197
1198 o One-dimensional argument A is implicitly treated as a row matrix
1199 dimensioned [1,count], so xcount=1.
1200
1201 o One-dimensional argument B is implicitly treated as a column matrix
1202 dimensioned [count, 1], so ycount=1.
1203*/
1204
1205 if (retarray->base_addr == NULL)
1206 {
1207 if (GFC_DESCRIPTOR_RANK (a) == 1)
1208 {
1209 GFC_DIMENSION_SET(retarray->dim[0], 0,
1210 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1211 }
1212 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1213 {
1214 GFC_DIMENSION_SET(retarray->dim[0], 0,
1215 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1216 }
1217 else
1218 {
1219 GFC_DIMENSION_SET(retarray->dim[0], 0,
1220 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1221
1222 GFC_DIMENSION_SET(retarray->dim[1], 0,
1223 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1224 GFC_DESCRIPTOR_EXTENT(retarray,0));
1225 }
1226
1227 retarray->base_addr
1228 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1229 retarray->offset = 0;
1230 }
1231 else if (unlikely (compile_options.bounds_check))
1232 {
1233 index_type ret_extent, arg_extent;
1234
1235 if (GFC_DESCRIPTOR_RANK (a) == 1)
1236 {
1237 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1238 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1239 if (arg_extent != ret_extent)
1240 runtime_error ("Incorrect extent in return array in"
1241 " MATMUL intrinsic: is %ld, should be %ld",
1242 (long int) ret_extent, (long int) arg_extent);
1243 }
1244 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1245 {
1246 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1247 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1248 if (arg_extent != ret_extent)
1249 runtime_error ("Incorrect extent in return array in"
1250 " MATMUL intrinsic: is %ld, should be %ld",
1251 (long int) ret_extent, (long int) arg_extent);
1252 }
1253 else
1254 {
1255 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1256 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1257 if (arg_extent != ret_extent)
1258 runtime_error ("Incorrect extent in return array in"
1259 " MATMUL intrinsic for dimension 1:"
1260 " is %ld, should be %ld",
1261 (long int) ret_extent, (long int) arg_extent);
1262
1263 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1264 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1265 if (arg_extent != ret_extent)
1266 runtime_error ("Incorrect extent in return array in"
1267 " MATMUL intrinsic for dimension 2:"
1268 " is %ld, should be %ld",
1269 (long int) ret_extent, (long int) arg_extent);
1270 }
1271 }
1272
1273
1274 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1275 {
1276 /* One-dimensional result may be addressed in the code below
1277 either as a row or a column matrix. We want both cases to
1278 work. */
1279 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1280 }
1281 else
1282 {
1283 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1284 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1285 }
1286
1287
1288 if (GFC_DESCRIPTOR_RANK (a) == 1)
1289 {
1290 /* Treat it as a a row matrix A[1,count]. */
1291 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1292 aystride = 1;
1293
1294 xcount = 1;
1295 count = GFC_DESCRIPTOR_EXTENT(a,0);
1296 }
1297 else
1298 {
1299 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1300 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1301
1302 count = GFC_DESCRIPTOR_EXTENT(a,1);
1303 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1304 }
1305
1306 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1307 {
1308 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1309 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1310 }
1311
1312 if (GFC_DESCRIPTOR_RANK (b) == 1)
1313 {
1314 /* Treat it as a column matrix B[count,1] */
1315 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1316
1317 /* bystride should never be used for 1-dimensional b.
1318 in case it is we want it to cause a segfault, rather than
1319 an incorrect result. */
1320 bystride = 0xDEADBEEF;
1321 ycount = 1;
1322 }
1323 else
1324 {
1325 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1326 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1327 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1328 }
1329
1330 abase = a->base_addr;
1331 bbase = b->base_addr;
1332 dest = retarray->base_addr;
1333
1334 /* Now that everything is set up, we perform the multiplication
1335 itself. */
1336
1337#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1338#define min(a,b) ((a) <= (b) ? (a) : (b))
1339#define max(a,b) ((a) >= (b) ? (a) : (b))
1340
1341 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1342 && (bxstride == 1 || bystride == 1)
1343 && (((float) xcount) * ((float) ycount) * ((float) count)
1344 > POW3(blas_limit)))
1345 {
1346 const int m = xcount, n = ycount, k = count, ldc = rystride;
1347 const GFC_REAL_16 one = 1, zero = 0;
1348 const int lda = (axstride == 1) ? aystride : axstride,
1349 ldb = (bxstride == 1) ? bystride : bxstride;
1350
1351 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1352 {
1353 assert (gemm != NULL);
1354 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1355 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1356 &ldc, 1, 1);
1357 return;
1358 }
1359 }
1360
1361 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1362 {
1363 /* This block of code implements a tuned matmul, derived from
1364 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1365
1366 Bo Kagstrom and Per Ling
1367 Department of Computing Science
1368 Umea University
1369 S-901 87 Umea, Sweden
1370
1371 from netlib.org, translated to C, and modified for matmul.m4. */
1372
1373 const GFC_REAL_16 *a, *b;
1374 GFC_REAL_16 *c;
1375 const index_type m = xcount, n = ycount, k = count;
1376
1377 /* System generated locals */
1378 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1379 i1, i2, i3, i4, i5, i6;
1380
1381 /* Local variables */
1382 GFC_REAL_16 t1[65536], /* was [256][256] */
1383 f11, f12, f21, f22, f31, f32, f41, f42,
1384 f13, f14, f23, f24, f33, f34, f43, f44;
1385 index_type i, j, l, ii, jj, ll;
1386 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1387
1388 a = abase;
1389 b = bbase;
1390 c = retarray->base_addr;
1391
1392 /* Parameter adjustments */
1393 c_dim1 = rystride;
1394 c_offset = 1 + c_dim1;
1395 c -= c_offset;
1396 a_dim1 = aystride;
1397 a_offset = 1 + a_dim1;
1398 a -= a_offset;
1399 b_dim1 = bystride;
1400 b_offset = 1 + b_dim1;
1401 b -= b_offset;
1402
1403 /* Early exit if possible */
1404 if (m == 0 || n == 0 || k == 0)
1405 return;
1406
1407 /* Empty c first. */
1408 for (j=1; j<=n; j++)
1409 for (i=1; i<=m; i++)
1410 c[i + j * c_dim1] = (GFC_REAL_16)0;
1411
1412 /* Start turning the crank. */
1413 i1 = n;
1414 for (jj = 1; jj <= i1; jj += 512)
1415 {
1416 /* Computing MIN */
1417 i2 = 512;
1418 i3 = n - jj + 1;
1419 jsec = min(i2,i3);
1420 ujsec = jsec - jsec % 4;
1421 i2 = k;
1422 for (ll = 1; ll <= i2; ll += 256)
1423 {
1424 /* Computing MIN */
1425 i3 = 256;
1426 i4 = k - ll + 1;
1427 lsec = min(i3,i4);
1428 ulsec = lsec - lsec % 2;
1429
1430 i3 = m;
1431 for (ii = 1; ii <= i3; ii += 256)
1432 {
1433 /* Computing MIN */
1434 i4 = 256;
1435 i5 = m - ii + 1;
1436 isec = min(i4,i5);
1437 uisec = isec - isec % 2;
1438 i4 = ll + ulsec - 1;
1439 for (l = ll; l <= i4; l += 2)
1440 {
1441 i5 = ii + uisec - 1;
1442 for (i = ii; i <= i5; i += 2)
1443 {
1444 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1445 a[i + l * a_dim1];
1446 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1447 a[i + (l + 1) * a_dim1];
1448 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1449 a[i + 1 + l * a_dim1];
1450 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1451 a[i + 1 + (l + 1) * a_dim1];
1452 }
1453 if (uisec < isec)
1454 {
1455 t1[l - ll + 1 + (isec << 8) - 257] =
1456 a[ii + isec - 1 + l * a_dim1];
1457 t1[l - ll + 2 + (isec << 8) - 257] =
1458 a[ii + isec - 1 + (l + 1) * a_dim1];
1459 }
1460 }
1461 if (ulsec < lsec)
1462 {
1463 i4 = ii + isec - 1;
1464 for (i = ii; i<= i4; ++i)
1465 {
1466 t1[lsec + ((i - ii + 1) << 8) - 257] =
1467 a[i + (ll + lsec - 1) * a_dim1];
1468 }
1469 }
1470
1471 uisec = isec - isec % 4;
1472 i4 = jj + ujsec - 1;
1473 for (j = jj; j <= i4; j += 4)
1474 {
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 4)
1477 {
1478 f11 = c[i + j * c_dim1];
1479 f21 = c[i + 1 + j * c_dim1];
1480 f12 = c[i + (j + 1) * c_dim1];
1481 f22 = c[i + 1 + (j + 1) * c_dim1];
1482 f13 = c[i + (j + 2) * c_dim1];
1483 f23 = c[i + 1 + (j + 2) * c_dim1];
1484 f14 = c[i + (j + 3) * c_dim1];
1485 f24 = c[i + 1 + (j + 3) * c_dim1];
1486 f31 = c[i + 2 + j * c_dim1];
1487 f41 = c[i + 3 + j * c_dim1];
1488 f32 = c[i + 2 + (j + 1) * c_dim1];
1489 f42 = c[i + 3 + (j + 1) * c_dim1];
1490 f33 = c[i + 2 + (j + 2) * c_dim1];
1491 f43 = c[i + 3 + (j + 2) * c_dim1];
1492 f34 = c[i + 2 + (j + 3) * c_dim1];
1493 f44 = c[i + 3 + (j + 3) * c_dim1];
1494 i6 = ll + lsec - 1;
1495 for (l = ll; l <= i6; ++l)
1496 {
1497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1498 * b[l + j * b_dim1];
1499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1500 * b[l + j * b_dim1];
1501 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1502 * b[l + (j + 1) * b_dim1];
1503 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1504 * b[l + (j + 1) * b_dim1];
1505 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1506 * b[l + (j + 2) * b_dim1];
1507 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1508 * b[l + (j + 2) * b_dim1];
1509 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1510 * b[l + (j + 3) * b_dim1];
1511 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1512 * b[l + (j + 3) * b_dim1];
1513 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1514 * b[l + j * b_dim1];
1515 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1516 * b[l + j * b_dim1];
1517 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1518 * b[l + (j + 1) * b_dim1];
1519 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1520 * b[l + (j + 1) * b_dim1];
1521 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1522 * b[l + (j + 2) * b_dim1];
1523 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1524 * b[l + (j + 2) * b_dim1];
1525 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1526 * b[l + (j + 3) * b_dim1];
1527 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1528 * b[l + (j + 3) * b_dim1];
1529 }
1530 c[i + j * c_dim1] = f11;
1531 c[i + 1 + j * c_dim1] = f21;
1532 c[i + (j + 1) * c_dim1] = f12;
1533 c[i + 1 + (j + 1) * c_dim1] = f22;
1534 c[i + (j + 2) * c_dim1] = f13;
1535 c[i + 1 + (j + 2) * c_dim1] = f23;
1536 c[i + (j + 3) * c_dim1] = f14;
1537 c[i + 1 + (j + 3) * c_dim1] = f24;
1538 c[i + 2 + j * c_dim1] = f31;
1539 c[i + 3 + j * c_dim1] = f41;
1540 c[i + 2 + (j + 1) * c_dim1] = f32;
1541 c[i + 3 + (j + 1) * c_dim1] = f42;
1542 c[i + 2 + (j + 2) * c_dim1] = f33;
1543 c[i + 3 + (j + 2) * c_dim1] = f43;
1544 c[i + 2 + (j + 3) * c_dim1] = f34;
1545 c[i + 3 + (j + 3) * c_dim1] = f44;
1546 }
1547 if (uisec < isec)
1548 {
1549 i5 = ii + isec - 1;
1550 for (i = ii + uisec; i <= i5; ++i)
1551 {
1552 f11 = c[i + j * c_dim1];
1553 f12 = c[i + (j + 1) * c_dim1];
1554 f13 = c[i + (j + 2) * c_dim1];
1555 f14 = c[i + (j + 3) * c_dim1];
1556 i6 = ll + lsec - 1;
1557 for (l = ll; l <= i6; ++l)
1558 {
1559 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1560 257] * b[l + j * b_dim1];
1561 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1562 257] * b[l + (j + 1) * b_dim1];
1563 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1564 257] * b[l + (j + 2) * b_dim1];
1565 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1566 257] * b[l + (j + 3) * b_dim1];
1567 }
1568 c[i + j * c_dim1] = f11;
1569 c[i + (j + 1) * c_dim1] = f12;
1570 c[i + (j + 2) * c_dim1] = f13;
1571 c[i + (j + 3) * c_dim1] = f14;
1572 }
1573 }
1574 }
1575 if (ujsec < jsec)
1576 {
1577 i4 = jj + jsec - 1;
1578 for (j = jj + ujsec; j <= i4; ++j)
1579 {
1580 i5 = ii + uisec - 1;
1581 for (i = ii; i <= i5; i += 4)
1582 {
1583 f11 = c[i + j * c_dim1];
1584 f21 = c[i + 1 + j * c_dim1];
1585 f31 = c[i + 2 + j * c_dim1];
1586 f41 = c[i + 3 + j * c_dim1];
1587 i6 = ll + lsec - 1;
1588 for (l = ll; l <= i6; ++l)
1589 {
1590 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1591 257] * b[l + j * b_dim1];
1592 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1593 257] * b[l + j * b_dim1];
1594 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1595 257] * b[l + j * b_dim1];
1596 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1597 257] * b[l + j * b_dim1];
1598 }
1599 c[i + j * c_dim1] = f11;
1600 c[i + 1 + j * c_dim1] = f21;
1601 c[i + 2 + j * c_dim1] = f31;
1602 c[i + 3 + j * c_dim1] = f41;
1603 }
1604 i5 = ii + isec - 1;
1605 for (i = ii + uisec; i <= i5; ++i)
1606 {
1607 f11 = c[i + j * c_dim1];
1608 i6 = ll + lsec - 1;
1609 for (l = ll; l <= i6; ++l)
1610 {
1611 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1612 257] * b[l + j * b_dim1];
1613 }
1614 c[i + j * c_dim1] = f11;
1615 }
1616 }
1617 }
1618 }
1619 }
1620 }
1621 return;
1622 }
1623 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1624 {
1625 if (GFC_DESCRIPTOR_RANK (a) != 1)
1626 {
1627 const GFC_REAL_16 *restrict abase_x;
1628 const GFC_REAL_16 *restrict bbase_y;
1629 GFC_REAL_16 *restrict dest_y;
1630 GFC_REAL_16 s;
1631
1632 for (y = 0; y < ycount; y++)
1633 {
1634 bbase_y = &bbase[y*bystride];
1635 dest_y = &dest[y*rystride];
1636 for (x = 0; x < xcount; x++)
1637 {
1638 abase_x = &abase[x*axstride];
1639 s = (GFC_REAL_16) 0;
1640 for (n = 0; n < count; n++)
1641 s += abase_x[n] * bbase_y[n];
1642 dest_y[x] = s;
1643 }
1644 }
1645 }
1646 else
1647 {
1648 const GFC_REAL_16 *restrict bbase_y;
1649 GFC_REAL_16 s;
1650
1651 for (y = 0; y < ycount; y++)
1652 {
1653 bbase_y = &bbase[y*bystride];
1654 s = (GFC_REAL_16) 0;
1655 for (n = 0; n < count; n++)
1656 s += abase[n*axstride] * bbase_y[n];
1657 dest[y*rystride] = s;
1658 }
1659 }
1660 }
1661 else if (axstride < aystride)
1662 {
1663 for (y = 0; y < ycount; y++)
1664 for (x = 0; x < xcount; x++)
1665 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1666
1667 for (y = 0; y < ycount; y++)
1668 for (n = 0; n < count; n++)
1669 for (x = 0; x < xcount; x++)
1670 /* dest[x,y] += a[x,n] * b[n,y] */
1671 dest[x*rxstride + y*rystride] +=
1672 abase[x*axstride + n*aystride] *
1673 bbase[n*bxstride + y*bystride];
1674 }
1675 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1676 {
1677 const GFC_REAL_16 *restrict bbase_y;
1678 GFC_REAL_16 s;
1679
1680 for (y = 0; y < ycount; y++)
1681 {
1682 bbase_y = &bbase[y*bystride];
1683 s = (GFC_REAL_16) 0;
1684 for (n = 0; n < count; n++)
1685 s += abase[n*axstride] * bbase_y[n*bxstride];
1686 dest[y*rxstride] = s;
1687 }
1688 }
1689 else
1690 {
1691 const GFC_REAL_16 *restrict abase_x;
1692 const GFC_REAL_16 *restrict bbase_y;
1693 GFC_REAL_16 *restrict dest_y;
1694 GFC_REAL_16 s;
1695
1696 for (y = 0; y < ycount; y++)
1697 {
1698 bbase_y = &bbase[y*bystride];
1699 dest_y = &dest[y*rystride];
1700 for (x = 0; x < xcount; x++)
1701 {
1702 abase_x = &abase[x*axstride];
1703 s = (GFC_REAL_16) 0;
1704 for (n = 0; n < count; n++)
1705 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1706 dest_y[x*rxstride] = s;
1707 }
1708 }
1709 }
1710}
1711#undef POW3
1712#undef min
1713#undef max
1714
1715#endif /* HAVE_AVX512F */
1716
1717/* Function to fall back to if there is no special processor-specific version. */
1718static void
1719matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
1720 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1721 int blas_limit, blas_call gemm)
1722{
1723 const GFC_REAL_16 * restrict abase;
1724 const GFC_REAL_16 * restrict bbase;
1725 GFC_REAL_16 * restrict dest;
1726
1727 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1728 index_type x, y, n, count, xcount, ycount;
1729
1730 assert (GFC_DESCRIPTOR_RANK (a) == 2
1731 || GFC_DESCRIPTOR_RANK (b) == 2);
1732
1733/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1734
1735 Either A or B (but not both) can be rank 1:
1736
1737 o One-dimensional argument A is implicitly treated as a row matrix
1738 dimensioned [1,count], so xcount=1.
1739
1740 o One-dimensional argument B is implicitly treated as a column matrix
1741 dimensioned [count, 1], so ycount=1.
1742*/
1743
1744 if (retarray->base_addr == NULL)
1745 {
1746 if (GFC_DESCRIPTOR_RANK (a) == 1)
1747 {
1748 GFC_DIMENSION_SET(retarray->dim[0], 0,
1749 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1750 }
1751 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1752 {
1753 GFC_DIMENSION_SET(retarray->dim[0], 0,
1754 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1755 }
1756 else
1757 {
1758 GFC_DIMENSION_SET(retarray->dim[0], 0,
1759 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1760
1761 GFC_DIMENSION_SET(retarray->dim[1], 0,
1762 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1763 GFC_DESCRIPTOR_EXTENT(retarray,0));
1764 }
1765
1766 retarray->base_addr
1767 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1768 retarray->offset = 0;
1769 }
1770 else if (unlikely (compile_options.bounds_check))
1771 {
1772 index_type ret_extent, arg_extent;
1773
1774 if (GFC_DESCRIPTOR_RANK (a) == 1)
1775 {
1776 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1777 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1778 if (arg_extent != ret_extent)
1779 runtime_error ("Incorrect extent in return array in"
1780 " MATMUL intrinsic: is %ld, should be %ld",
1781 (long int) ret_extent, (long int) arg_extent);
1782 }
1783 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1784 {
1785 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1786 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1787 if (arg_extent != ret_extent)
1788 runtime_error ("Incorrect extent in return array in"
1789 " MATMUL intrinsic: is %ld, should be %ld",
1790 (long int) ret_extent, (long int) arg_extent);
1791 }
1792 else
1793 {
1794 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1795 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1796 if (arg_extent != ret_extent)
1797 runtime_error ("Incorrect extent in return array in"
1798 " MATMUL intrinsic for dimension 1:"
1799 " is %ld, should be %ld",
1800 (long int) ret_extent, (long int) arg_extent);
1801
1802 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1803 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1804 if (arg_extent != ret_extent)
1805 runtime_error ("Incorrect extent in return array in"
1806 " MATMUL intrinsic for dimension 2:"
1807 " is %ld, should be %ld",
1808 (long int) ret_extent, (long int) arg_extent);
1809 }
1810 }
1811
1812
1813 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1814 {
1815 /* One-dimensional result may be addressed in the code below
1816 either as a row or a column matrix. We want both cases to
1817 work. */
1818 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1819 }
1820 else
1821 {
1822 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1823 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1824 }
1825
1826
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1828 {
1829 /* Treat it as a a row matrix A[1,count]. */
1830 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1831 aystride = 1;
1832
1833 xcount = 1;
1834 count = GFC_DESCRIPTOR_EXTENT(a,0);
1835 }
1836 else
1837 {
1838 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1839 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1840
1841 count = GFC_DESCRIPTOR_EXTENT(a,1);
1842 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1843 }
1844
1845 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1846 {
1847 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1848 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1849 }
1850
1851 if (GFC_DESCRIPTOR_RANK (b) == 1)
1852 {
1853 /* Treat it as a column matrix B[count,1] */
1854 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1855
1856 /* bystride should never be used for 1-dimensional b.
1857 in case it is we want it to cause a segfault, rather than
1858 an incorrect result. */
1859 bystride = 0xDEADBEEF;
1860 ycount = 1;
1861 }
1862 else
1863 {
1864 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1865 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1866 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1867 }
1868
1869 abase = a->base_addr;
1870 bbase = b->base_addr;
1871 dest = retarray->base_addr;
1872
1873 /* Now that everything is set up, we perform the multiplication
1874 itself. */
1875
1876#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1877#define min(a,b) ((a) <= (b) ? (a) : (b))
1878#define max(a,b) ((a) >= (b) ? (a) : (b))
1879
1880 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1881 && (bxstride == 1 || bystride == 1)
1882 && (((float) xcount) * ((float) ycount) * ((float) count)
1883 > POW3(blas_limit)))
1884 {
1885 const int m = xcount, n = ycount, k = count, ldc = rystride;
1886 const GFC_REAL_16 one = 1, zero = 0;
1887 const int lda = (axstride == 1) ? aystride : axstride,
1888 ldb = (bxstride == 1) ? bystride : bxstride;
1889
1890 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1891 {
1892 assert (gemm != NULL);
1893 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1894 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1895 &ldc, 1, 1);
1896 return;
1897 }
1898 }
1899
1900 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1901 {
1902 /* This block of code implements a tuned matmul, derived from
1903 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1904
1905 Bo Kagstrom and Per Ling
1906 Department of Computing Science
1907 Umea University
1908 S-901 87 Umea, Sweden
1909
1910 from netlib.org, translated to C, and modified for matmul.m4. */
1911
1912 const GFC_REAL_16 *a, *b;
1913 GFC_REAL_16 *c;
1914 const index_type m = xcount, n = ycount, k = count;
1915
1916 /* System generated locals */
1917 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1918 i1, i2, i3, i4, i5, i6;
1919
1920 /* Local variables */
1921 GFC_REAL_16 t1[65536], /* was [256][256] */
1922 f11, f12, f21, f22, f31, f32, f41, f42,
1923 f13, f14, f23, f24, f33, f34, f43, f44;
1924 index_type i, j, l, ii, jj, ll;
1925 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1926
1927 a = abase;
1928 b = bbase;
1929 c = retarray->base_addr;
1930
1931 /* Parameter adjustments */
1932 c_dim1 = rystride;
1933 c_offset = 1 + c_dim1;
1934 c -= c_offset;
1935 a_dim1 = aystride;
1936 a_offset = 1 + a_dim1;
1937 a -= a_offset;
1938 b_dim1 = bystride;
1939 b_offset = 1 + b_dim1;
1940 b -= b_offset;
1941
1942 /* Early exit if possible */
1943 if (m == 0 || n == 0 || k == 0)
1944 return;
1945
1946 /* Empty c first. */
1947 for (j=1; j<=n; j++)
1948 for (i=1; i<=m; i++)
1949 c[i + j * c_dim1] = (GFC_REAL_16)0;
1950
1951 /* Start turning the crank. */
1952 i1 = n;
1953 for (jj = 1; jj <= i1; jj += 512)
1954 {
1955 /* Computing MIN */
1956 i2 = 512;
1957 i3 = n - jj + 1;
1958 jsec = min(i2,i3);
1959 ujsec = jsec - jsec % 4;
1960 i2 = k;
1961 for (ll = 1; ll <= i2; ll += 256)
1962 {
1963 /* Computing MIN */
1964 i3 = 256;
1965 i4 = k - ll + 1;
1966 lsec = min(i3,i4);
1967 ulsec = lsec - lsec % 2;
1968
1969 i3 = m;
1970 for (ii = 1; ii <= i3; ii += 256)
1971 {
1972 /* Computing MIN */
1973 i4 = 256;
1974 i5 = m - ii + 1;
1975 isec = min(i4,i5);
1976 uisec = isec - isec % 2;
1977 i4 = ll + ulsec - 1;
1978 for (l = ll; l <= i4; l += 2)
1979 {
1980 i5 = ii + uisec - 1;
1981 for (i = ii; i <= i5; i += 2)
1982 {
1983 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1984 a[i + l * a_dim1];
1985 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1986 a[i + (l + 1) * a_dim1];
1987 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1988 a[i + 1 + l * a_dim1];
1989 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1990 a[i + 1 + (l + 1) * a_dim1];
1991 }
1992 if (uisec < isec)
1993 {
1994 t1[l - ll + 1 + (isec << 8) - 257] =
1995 a[ii + isec - 1 + l * a_dim1];
1996 t1[l - ll + 2 + (isec << 8) - 257] =
1997 a[ii + isec - 1 + (l + 1) * a_dim1];
1998 }
1999 }
2000 if (ulsec < lsec)
2001 {
2002 i4 = ii + isec - 1;
2003 for (i = ii; i<= i4; ++i)
2004 {
2005 t1[lsec + ((i - ii + 1) << 8) - 257] =
2006 a[i + (ll + lsec - 1) * a_dim1];
2007 }
2008 }
2009
2010 uisec = isec - isec % 4;
2011 i4 = jj + ujsec - 1;
2012 for (j = jj; j <= i4; j += 4)
2013 {
2014 i5 = ii + uisec - 1;
2015 for (i = ii; i <= i5; i += 4)
2016 {
2017 f11 = c[i + j * c_dim1];
2018 f21 = c[i + 1 + j * c_dim1];
2019 f12 = c[i + (j + 1) * c_dim1];
2020 f22 = c[i + 1 + (j + 1) * c_dim1];
2021 f13 = c[i + (j + 2) * c_dim1];
2022 f23 = c[i + 1 + (j + 2) * c_dim1];
2023 f14 = c[i + (j + 3) * c_dim1];
2024 f24 = c[i + 1 + (j + 3) * c_dim1];
2025 f31 = c[i + 2 + j * c_dim1];
2026 f41 = c[i + 3 + j * c_dim1];
2027 f32 = c[i + 2 + (j + 1) * c_dim1];
2028 f42 = c[i + 3 + (j + 1) * c_dim1];
2029 f33 = c[i + 2 + (j + 2) * c_dim1];
2030 f43 = c[i + 3 + (j + 2) * c_dim1];
2031 f34 = c[i + 2 + (j + 3) * c_dim1];
2032 f44 = c[i + 3 + (j + 3) * c_dim1];
2033 i6 = ll + lsec - 1;
2034 for (l = ll; l <= i6; ++l)
2035 {
2036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2037 * b[l + j * b_dim1];
2038 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2039 * b[l + j * b_dim1];
2040 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2041 * b[l + (j + 1) * b_dim1];
2042 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2043 * b[l + (j + 1) * b_dim1];
2044 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2045 * b[l + (j + 2) * b_dim1];
2046 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2047 * b[l + (j + 2) * b_dim1];
2048 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2049 * b[l + (j + 3) * b_dim1];
2050 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2051 * b[l + (j + 3) * b_dim1];
2052 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2053 * b[l + j * b_dim1];
2054 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2055 * b[l + j * b_dim1];
2056 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2057 * b[l + (j + 1) * b_dim1];
2058 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2059 * b[l + (j + 1) * b_dim1];
2060 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2061 * b[l + (j + 2) * b_dim1];
2062 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2063 * b[l + (j + 2) * b_dim1];
2064 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2065 * b[l + (j + 3) * b_dim1];
2066 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2067 * b[l + (j + 3) * b_dim1];
2068 }
2069 c[i + j * c_dim1] = f11;
2070 c[i + 1 + j * c_dim1] = f21;
2071 c[i + (j + 1) * c_dim1] = f12;
2072 c[i + 1 + (j + 1) * c_dim1] = f22;
2073 c[i + (j + 2) * c_dim1] = f13;
2074 c[i + 1 + (j + 2) * c_dim1] = f23;
2075 c[i + (j + 3) * c_dim1] = f14;
2076 c[i + 1 + (j + 3) * c_dim1] = f24;
2077 c[i + 2 + j * c_dim1] = f31;
2078 c[i + 3 + j * c_dim1] = f41;
2079 c[i + 2 + (j + 1) * c_dim1] = f32;
2080 c[i + 3 + (j + 1) * c_dim1] = f42;
2081 c[i + 2 + (j + 2) * c_dim1] = f33;
2082 c[i + 3 + (j + 2) * c_dim1] = f43;
2083 c[i + 2 + (j + 3) * c_dim1] = f34;
2084 c[i + 3 + (j + 3) * c_dim1] = f44;
2085 }
2086 if (uisec < isec)
2087 {
2088 i5 = ii + isec - 1;
2089 for (i = ii + uisec; i <= i5; ++i)
2090 {
2091 f11 = c[i + j * c_dim1];
2092 f12 = c[i + (j + 1) * c_dim1];
2093 f13 = c[i + (j + 2) * c_dim1];
2094 f14 = c[i + (j + 3) * c_dim1];
2095 i6 = ll + lsec - 1;
2096 for (l = ll; l <= i6; ++l)
2097 {
2098 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2099 257] * b[l + j * b_dim1];
2100 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2101 257] * b[l + (j + 1) * b_dim1];
2102 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2103 257] * b[l + (j + 2) * b_dim1];
2104 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2105 257] * b[l + (j + 3) * b_dim1];
2106 }
2107 c[i + j * c_dim1] = f11;
2108 c[i + (j + 1) * c_dim1] = f12;
2109 c[i + (j + 2) * c_dim1] = f13;
2110 c[i + (j + 3) * c_dim1] = f14;
2111 }
2112 }
2113 }
2114 if (ujsec < jsec)
2115 {
2116 i4 = jj + jsec - 1;
2117 for (j = jj + ujsec; j <= i4; ++j)
2118 {
2119 i5 = ii + uisec - 1;
2120 for (i = ii; i <= i5; i += 4)
2121 {
2122 f11 = c[i + j * c_dim1];
2123 f21 = c[i + 1 + j * c_dim1];
2124 f31 = c[i + 2 + j * c_dim1];
2125 f41 = c[i + 3 + j * c_dim1];
2126 i6 = ll + lsec - 1;
2127 for (l = ll; l <= i6; ++l)
2128 {
2129 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2130 257] * b[l + j * b_dim1];
2131 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2132 257] * b[l + j * b_dim1];
2133 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2134 257] * b[l + j * b_dim1];
2135 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2136 257] * b[l + j * b_dim1];
2137 }
2138 c[i + j * c_dim1] = f11;
2139 c[i + 1 + j * c_dim1] = f21;
2140 c[i + 2 + j * c_dim1] = f31;
2141 c[i + 3 + j * c_dim1] = f41;
2142 }
2143 i5 = ii + isec - 1;
2144 for (i = ii + uisec; i <= i5; ++i)
2145 {
2146 f11 = c[i + j * c_dim1];
2147 i6 = ll + lsec - 1;
2148 for (l = ll; l <= i6; ++l)
2149 {
2150 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2151 257] * b[l + j * b_dim1];
2152 }
2153 c[i + j * c_dim1] = f11;
2154 }
2155 }
2156 }
2157 }
2158 }
2159 }
2160 return;
2161 }
2162 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2163 {
2164 if (GFC_DESCRIPTOR_RANK (a) != 1)
2165 {
2166 const GFC_REAL_16 *restrict abase_x;
2167 const GFC_REAL_16 *restrict bbase_y;
2168 GFC_REAL_16 *restrict dest_y;
2169 GFC_REAL_16 s;
2170
2171 for (y = 0; y < ycount; y++)
2172 {
2173 bbase_y = &bbase[y*bystride];
2174 dest_y = &dest[y*rystride];
2175 for (x = 0; x < xcount; x++)
2176 {
2177 abase_x = &abase[x*axstride];
2178 s = (GFC_REAL_16) 0;
2179 for (n = 0; n < count; n++)
2180 s += abase_x[n] * bbase_y[n];
2181 dest_y[x] = s;
2182 }
2183 }
2184 }
2185 else
2186 {
2187 const GFC_REAL_16 *restrict bbase_y;
2188 GFC_REAL_16 s;
2189
2190 for (y = 0; y < ycount; y++)
2191 {
2192 bbase_y = &bbase[y*bystride];
2193 s = (GFC_REAL_16) 0;
2194 for (n = 0; n < count; n++)
2195 s += abase[n*axstride] * bbase_y[n];
2196 dest[y*rystride] = s;
2197 }
2198 }
2199 }
2200 else if (axstride < aystride)
2201 {
2202 for (y = 0; y < ycount; y++)
2203 for (x = 0; x < xcount; x++)
2204 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2205
2206 for (y = 0; y < ycount; y++)
2207 for (n = 0; n < count; n++)
2208 for (x = 0; x < xcount; x++)
2209 /* dest[x,y] += a[x,n] * b[n,y] */
2210 dest[x*rxstride + y*rystride] +=
2211 abase[x*axstride + n*aystride] *
2212 bbase[n*bxstride + y*bystride];
2213 }
2214 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2215 {
2216 const GFC_REAL_16 *restrict bbase_y;
2217 GFC_REAL_16 s;
2218
2219 for (y = 0; y < ycount; y++)
2220 {
2221 bbase_y = &bbase[y*bystride];
2222 s = (GFC_REAL_16) 0;
2223 for (n = 0; n < count; n++)
2224 s += abase[n*axstride] * bbase_y[n*bxstride];
2225 dest[y*rxstride] = s;
2226 }
2227 }
2228 else
2229 {
2230 const GFC_REAL_16 *restrict abase_x;
2231 const GFC_REAL_16 *restrict bbase_y;
2232 GFC_REAL_16 *restrict dest_y;
2233 GFC_REAL_16 s;
2234
2235 for (y = 0; y < ycount; y++)
2236 {
2237 bbase_y = &bbase[y*bystride];
2238 dest_y = &dest[y*rystride];
2239 for (x = 0; x < xcount; x++)
2240 {
2241 abase_x = &abase[x*axstride];
2242 s = (GFC_REAL_16) 0;
2243 for (n = 0; n < count; n++)
2244 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2245 dest_y[x*rxstride] = s;
2246 }
2247 }
2248 }
2249}
2250#undef POW3
2251#undef min
2252#undef max
2253
2254
2255/* Compiling main function, with selection code for the processor. */
2256
2257/* Currently, this is i386 only. Adjust for other architectures. */
2258
2259#include <config/i386/cpuinfo.h>
2260void matmul_r16 (gfc_array_r16 * const restrict retarray,
2261 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2262 int blas_limit, blas_call gemm)
2263{
2264 static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
2265 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2266 int blas_limit, blas_call gemm) = NULL;
2267
2268 if (matmul_p == NULL)
2269 {
2270 matmul_p = matmul_r16_vanilla;
2271 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2272 {
2273 /* Run down the available processors in order of preference. */
2274#ifdef HAVE_AVX512F
2275 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2276 {
2277 matmul_p = matmul_r16_avx512f;
2278 goto tailcall;
2279 }
2280
2281#endif /* HAVE_AVX512F */
2282
2283#ifdef HAVE_AVX2
2284 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2285 {
2286 matmul_p = matmul_r16_avx2;
2287 goto tailcall;
2288 }
2289
2290#endif
2291
2292#ifdef HAVE_AVX
2293 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2294 {
2295 matmul_p = matmul_r16_avx;
2296 goto tailcall;
2297 }
2298#endif /* HAVE_AVX */
2299 }
2300 }
2301
2302tailcall:
2303 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2304}
2305
2306#else /* Just the vanilla function. */
2307
644cb69f 2308void
85206901 2309matmul_r16 (gfc_array_r16 * const restrict retarray,
5a0aad31
FXC
2310 gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2311 int blas_limit, blas_call gemm)
644cb69f 2312{
85206901
JB
2313 const GFC_REAL_16 * restrict abase;
2314 const GFC_REAL_16 * restrict bbase;
2315 GFC_REAL_16 * restrict dest;
644cb69f
FXC
2316
2317 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2318 index_type x, y, n, count, xcount, ycount;
2319
2320 assert (GFC_DESCRIPTOR_RANK (a) == 2
2321 || GFC_DESCRIPTOR_RANK (b) == 2);
2322
2323/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2324
2325 Either A or B (but not both) can be rank 1:
2326
2327 o One-dimensional argument A is implicitly treated as a row matrix
2328 dimensioned [1,count], so xcount=1.
2329
2330 o One-dimensional argument B is implicitly treated as a column matrix
2331 dimensioned [count, 1], so ycount=1.
5d70ab07 2332*/
644cb69f 2333
21d1335b 2334 if (retarray->base_addr == NULL)
644cb69f
FXC
2335 {
2336 if (GFC_DESCRIPTOR_RANK (a) == 1)
2337 {
dfb55fdc
TK
2338 GFC_DIMENSION_SET(retarray->dim[0], 0,
2339 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
644cb69f
FXC
2340 }
2341 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2342 {
dfb55fdc
TK
2343 GFC_DIMENSION_SET(retarray->dim[0], 0,
2344 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
644cb69f
FXC
2345 }
2346 else
2347 {
dfb55fdc
TK
2348 GFC_DIMENSION_SET(retarray->dim[0], 0,
2349 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
644cb69f 2350
dfb55fdc
TK
2351 GFC_DIMENSION_SET(retarray->dim[1], 0,
2352 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2353 GFC_DESCRIPTOR_EXTENT(retarray,0));
644cb69f
FXC
2354 }
2355
21d1335b 2356 retarray->base_addr
92e6f3a4 2357 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
644cb69f
FXC
2358 retarray->offset = 0;
2359 }
5d70ab07
JD
2360 else if (unlikely (compile_options.bounds_check))
2361 {
2362 index_type ret_extent, arg_extent;
2363
2364 if (GFC_DESCRIPTOR_RANK (a) == 1)
2365 {
2366 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2367 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2368 if (arg_extent != ret_extent)
2369 runtime_error ("Incorrect extent in return array in"
2370 " MATMUL intrinsic: is %ld, should be %ld",
2371 (long int) ret_extent, (long int) arg_extent);
2372 }
2373 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2374 {
2375 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2376 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2377 if (arg_extent != ret_extent)
2378 runtime_error ("Incorrect extent in return array in"
2379 " MATMUL intrinsic: is %ld, should be %ld",
2380 (long int) ret_extent, (long int) arg_extent);
2381 }
2382 else
2383 {
2384 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2385 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2386 if (arg_extent != ret_extent)
2387 runtime_error ("Incorrect extent in return array in"
2388 " MATMUL intrinsic for dimension 1:"
2389 " is %ld, should be %ld",
2390 (long int) ret_extent, (long int) arg_extent);
2391
2392 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2393 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2394 if (arg_extent != ret_extent)
2395 runtime_error ("Incorrect extent in return array in"
2396 " MATMUL intrinsic for dimension 2:"
2397 " is %ld, should be %ld",
2398 (long int) ret_extent, (long int) arg_extent);
2399 }
2400 }
644cb69f 2401
644cb69f
FXC
2402
2403 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2404 {
2405 /* One-dimensional result may be addressed in the code below
2406 either as a row or a column matrix. We want both cases to
2407 work. */
dfb55fdc 2408 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
644cb69f
FXC
2409 }
2410 else
2411 {
dfb55fdc
TK
2412 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2413 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
644cb69f
FXC
2414 }
2415
2416
2417 if (GFC_DESCRIPTOR_RANK (a) == 1)
2418 {
2419 /* Treat it as a a row matrix A[1,count]. */
dfb55fdc 2420 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
644cb69f
FXC
2421 aystride = 1;
2422
2423 xcount = 1;
dfb55fdc 2424 count = GFC_DESCRIPTOR_EXTENT(a,0);
644cb69f
FXC
2425 }
2426 else
2427 {
dfb55fdc
TK
2428 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2429 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
644cb69f 2430
dfb55fdc
TK
2431 count = GFC_DESCRIPTOR_EXTENT(a,1);
2432 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
644cb69f
FXC
2433 }
2434
dfb55fdc 2435 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4 2436 {
dfb55fdc 2437 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4
TK
2438 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2439 }
644cb69f
FXC
2440
2441 if (GFC_DESCRIPTOR_RANK (b) == 1)
2442 {
2443 /* Treat it as a column matrix B[count,1] */
dfb55fdc 2444 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
644cb69f
FXC
2445
2446 /* bystride should never be used for 1-dimensional b.
2447 in case it is we want it to cause a segfault, rather than
2448 an incorrect result. */
2449 bystride = 0xDEADBEEF;
2450 ycount = 1;
2451 }
2452 else
2453 {
dfb55fdc
TK
2454 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2455 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2456 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
644cb69f
FXC
2457 }
2458
21d1335b
TB
2459 abase = a->base_addr;
2460 bbase = b->base_addr;
2461 dest = retarray->base_addr;
644cb69f 2462
5d70ab07 2463 /* Now that everything is set up, we perform the multiplication
5a0aad31
FXC
2464 itself. */
2465
2466#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07
JD
2467#define min(a,b) ((a) <= (b) ? (a) : (b))
2468#define max(a,b) ((a) >= (b) ? (a) : (b))
5a0aad31
FXC
2469
2470 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2471 && (bxstride == 1 || bystride == 1)
2472 && (((float) xcount) * ((float) ycount) * ((float) count)
2473 > POW3(blas_limit)))
644cb69f 2474 {
5d70ab07
JD
2475 const int m = xcount, n = ycount, k = count, ldc = rystride;
2476 const GFC_REAL_16 one = 1, zero = 0;
2477 const int lda = (axstride == 1) ? aystride : axstride,
2478 ldb = (bxstride == 1) ? bystride : bxstride;
644cb69f 2479
5d70ab07 2480 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
644cb69f 2481 {
5d70ab07
JD
2482 assert (gemm != NULL);
2483 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2484 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2485 &ldc, 1, 1);
2486 return;
644cb69f 2487 }
5d70ab07 2488 }
644cb69f 2489
5d70ab07
JD
2490 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2491 {
2492 /* This block of code implements a tuned matmul, derived from
2493 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2494
2495 Bo Kagstrom and Per Ling
2496 Department of Computing Science
2497 Umea University
2498 S-901 87 Umea, Sweden
2499
2500 from netlib.org, translated to C, and modified for matmul.m4. */
2501
2502 const GFC_REAL_16 *a, *b;
2503 GFC_REAL_16 *c;
2504 const index_type m = xcount, n = ycount, k = count;
2505
2506 /* System generated locals */
2507 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2508 i1, i2, i3, i4, i5, i6;
2509
2510 /* Local variables */
2511 GFC_REAL_16 t1[65536], /* was [256][256] */
2512 f11, f12, f21, f22, f31, f32, f41, f42,
2513 f13, f14, f23, f24, f33, f34, f43, f44;
2514 index_type i, j, l, ii, jj, ll;
2515 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2516
2517 a = abase;
2518 b = bbase;
2519 c = retarray->base_addr;
2520
2521 /* Parameter adjustments */
2522 c_dim1 = rystride;
2523 c_offset = 1 + c_dim1;
2524 c -= c_offset;
2525 a_dim1 = aystride;
2526 a_offset = 1 + a_dim1;
2527 a -= a_offset;
2528 b_dim1 = bystride;
2529 b_offset = 1 + b_dim1;
2530 b -= b_offset;
2531
2532 /* Early exit if possible */
2533 if (m == 0 || n == 0 || k == 0)
2534 return;
2535
2536 /* Empty c first. */
2537 for (j=1; j<=n; j++)
2538 for (i=1; i<=m; i++)
2539 c[i + j * c_dim1] = (GFC_REAL_16)0;
2540
2541 /* Start turning the crank. */
2542 i1 = n;
2543 for (jj = 1; jj <= i1; jj += 512)
644cb69f 2544 {
5d70ab07
JD
2545 /* Computing MIN */
2546 i2 = 512;
2547 i3 = n - jj + 1;
2548 jsec = min(i2,i3);
2549 ujsec = jsec - jsec % 4;
2550 i2 = k;
2551 for (ll = 1; ll <= i2; ll += 256)
644cb69f 2552 {
5d70ab07
JD
2553 /* Computing MIN */
2554 i3 = 256;
2555 i4 = k - ll + 1;
2556 lsec = min(i3,i4);
2557 ulsec = lsec - lsec % 2;
2558
2559 i3 = m;
2560 for (ii = 1; ii <= i3; ii += 256)
644cb69f 2561 {
5d70ab07
JD
2562 /* Computing MIN */
2563 i4 = 256;
2564 i5 = m - ii + 1;
2565 isec = min(i4,i5);
2566 uisec = isec - isec % 2;
2567 i4 = ll + ulsec - 1;
2568 for (l = ll; l <= i4; l += 2)
2569 {
2570 i5 = ii + uisec - 1;
2571 for (i = ii; i <= i5; i += 2)
2572 {
2573 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2574 a[i + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2576 a[i + (l + 1) * a_dim1];
2577 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2578 a[i + 1 + l * a_dim1];
2579 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2580 a[i + 1 + (l + 1) * a_dim1];
2581 }
2582 if (uisec < isec)
2583 {
2584 t1[l - ll + 1 + (isec << 8) - 257] =
2585 a[ii + isec - 1 + l * a_dim1];
2586 t1[l - ll + 2 + (isec << 8) - 257] =
2587 a[ii + isec - 1 + (l + 1) * a_dim1];
2588 }
2589 }
2590 if (ulsec < lsec)
2591 {
2592 i4 = ii + isec - 1;
2593 for (i = ii; i<= i4; ++i)
2594 {
2595 t1[lsec + ((i - ii + 1) << 8) - 257] =
2596 a[i + (ll + lsec - 1) * a_dim1];
2597 }
2598 }
2599
2600 uisec = isec - isec % 4;
2601 i4 = jj + ujsec - 1;
2602 for (j = jj; j <= i4; j += 4)
2603 {
2604 i5 = ii + uisec - 1;
2605 for (i = ii; i <= i5; i += 4)
2606 {
2607 f11 = c[i + j * c_dim1];
2608 f21 = c[i + 1 + j * c_dim1];
2609 f12 = c[i + (j + 1) * c_dim1];
2610 f22 = c[i + 1 + (j + 1) * c_dim1];
2611 f13 = c[i + (j + 2) * c_dim1];
2612 f23 = c[i + 1 + (j + 2) * c_dim1];
2613 f14 = c[i + (j + 3) * c_dim1];
2614 f24 = c[i + 1 + (j + 3) * c_dim1];
2615 f31 = c[i + 2 + j * c_dim1];
2616 f41 = c[i + 3 + j * c_dim1];
2617 f32 = c[i + 2 + (j + 1) * c_dim1];
2618 f42 = c[i + 3 + (j + 1) * c_dim1];
2619 f33 = c[i + 2 + (j + 2) * c_dim1];
2620 f43 = c[i + 3 + (j + 2) * c_dim1];
2621 f34 = c[i + 2 + (j + 3) * c_dim1];
2622 f44 = c[i + 3 + (j + 3) * c_dim1];
2623 i6 = ll + lsec - 1;
2624 for (l = ll; l <= i6; ++l)
2625 {
2626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + j * b_dim1];
2628 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + j * b_dim1];
2630 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 1) * b_dim1];
2632 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 1) * b_dim1];
2634 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 2) * b_dim1];
2636 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 2) * b_dim1];
2638 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2639 * b[l + (j + 3) * b_dim1];
2640 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2641 * b[l + (j + 3) * b_dim1];
2642 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + j * b_dim1];
2644 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + j * b_dim1];
2646 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 1) * b_dim1];
2648 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 1) * b_dim1];
2650 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 2) * b_dim1];
2652 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 2) * b_dim1];
2654 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2655 * b[l + (j + 3) * b_dim1];
2656 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2657 * b[l + (j + 3) * b_dim1];
2658 }
2659 c[i + j * c_dim1] = f11;
2660 c[i + 1 + j * c_dim1] = f21;
2661 c[i + (j + 1) * c_dim1] = f12;
2662 c[i + 1 + (j + 1) * c_dim1] = f22;
2663 c[i + (j + 2) * c_dim1] = f13;
2664 c[i + 1 + (j + 2) * c_dim1] = f23;
2665 c[i + (j + 3) * c_dim1] = f14;
2666 c[i + 1 + (j + 3) * c_dim1] = f24;
2667 c[i + 2 + j * c_dim1] = f31;
2668 c[i + 3 + j * c_dim1] = f41;
2669 c[i + 2 + (j + 1) * c_dim1] = f32;
2670 c[i + 3 + (j + 1) * c_dim1] = f42;
2671 c[i + 2 + (j + 2) * c_dim1] = f33;
2672 c[i + 3 + (j + 2) * c_dim1] = f43;
2673 c[i + 2 + (j + 3) * c_dim1] = f34;
2674 c[i + 3 + (j + 3) * c_dim1] = f44;
2675 }
2676 if (uisec < isec)
2677 {
2678 i5 = ii + isec - 1;
2679 for (i = ii + uisec; i <= i5; ++i)
2680 {
2681 f11 = c[i + j * c_dim1];
2682 f12 = c[i + (j + 1) * c_dim1];
2683 f13 = c[i + (j + 2) * c_dim1];
2684 f14 = c[i + (j + 3) * c_dim1];
2685 i6 = ll + lsec - 1;
2686 for (l = ll; l <= i6; ++l)
2687 {
2688 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + j * b_dim1];
2690 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 1) * b_dim1];
2692 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2693 257] * b[l + (j + 2) * b_dim1];
2694 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2695 257] * b[l + (j + 3) * b_dim1];
2696 }
2697 c[i + j * c_dim1] = f11;
2698 c[i + (j + 1) * c_dim1] = f12;
2699 c[i + (j + 2) * c_dim1] = f13;
2700 c[i + (j + 3) * c_dim1] = f14;
2701 }
2702 }
2703 }
2704 if (ujsec < jsec)
2705 {
2706 i4 = jj + jsec - 1;
2707 for (j = jj + ujsec; j <= i4; ++j)
2708 {
2709 i5 = ii + uisec - 1;
2710 for (i = ii; i <= i5; i += 4)
2711 {
2712 f11 = c[i + j * c_dim1];
2713 f21 = c[i + 1 + j * c_dim1];
2714 f31 = c[i + 2 + j * c_dim1];
2715 f41 = c[i + 3 + j * c_dim1];
2716 i6 = ll + lsec - 1;
2717 for (l = ll; l <= i6; ++l)
2718 {
2719 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2722 257] * b[l + j * b_dim1];
2723 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2724 257] * b[l + j * b_dim1];
2725 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2726 257] * b[l + j * b_dim1];
2727 }
2728 c[i + j * c_dim1] = f11;
2729 c[i + 1 + j * c_dim1] = f21;
2730 c[i + 2 + j * c_dim1] = f31;
2731 c[i + 3 + j * c_dim1] = f41;
2732 }
2733 i5 = ii + isec - 1;
2734 for (i = ii + uisec; i <= i5; ++i)
2735 {
2736 f11 = c[i + j * c_dim1];
2737 i6 = ll + lsec - 1;
2738 for (l = ll; l <= i6; ++l)
2739 {
2740 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741 257] * b[l + j * b_dim1];
2742 }
2743 c[i + j * c_dim1] = f11;
2744 }
2745 }
2746 }
644cb69f
FXC
2747 }
2748 }
2749 }
5d70ab07 2750 return;
644cb69f 2751 }
1524f80b
RS
2752 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2753 {
a4a11197
PT
2754 if (GFC_DESCRIPTOR_RANK (a) != 1)
2755 {
2756 const GFC_REAL_16 *restrict abase_x;
2757 const GFC_REAL_16 *restrict bbase_y;
2758 GFC_REAL_16 *restrict dest_y;
2759 GFC_REAL_16 s;
1524f80b 2760
a4a11197
PT
2761 for (y = 0; y < ycount; y++)
2762 {
2763 bbase_y = &bbase[y*bystride];
2764 dest_y = &dest[y*rystride];
2765 for (x = 0; x < xcount; x++)
2766 {
2767 abase_x = &abase[x*axstride];
2768 s = (GFC_REAL_16) 0;
2769 for (n = 0; n < count; n++)
2770 s += abase_x[n] * bbase_y[n];
2771 dest_y[x] = s;
2772 }
2773 }
2774 }
2775 else
1524f80b 2776 {
a4a11197
PT
2777 const GFC_REAL_16 *restrict bbase_y;
2778 GFC_REAL_16 s;
2779
2780 for (y = 0; y < ycount; y++)
1524f80b 2781 {
a4a11197 2782 bbase_y = &bbase[y*bystride];
1524f80b
RS
2783 s = (GFC_REAL_16) 0;
2784 for (n = 0; n < count; n++)
a4a11197
PT
2785 s += abase[n*axstride] * bbase_y[n];
2786 dest[y*rystride] = s;
1524f80b
RS
2787 }
2788 }
2789 }
2790 else if (axstride < aystride)
644cb69f
FXC
2791 {
2792 for (y = 0; y < ycount; y++)
2793 for (x = 0; x < xcount; x++)
2794 dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2795
2796 for (y = 0; y < ycount; y++)
2797 for (n = 0; n < count; n++)
2798 for (x = 0; x < xcount; x++)
2799 /* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07
JD
2800 dest[x*rxstride + y*rystride] +=
2801 abase[x*axstride + n*aystride] *
2802 bbase[n*bxstride + y*bystride];
644cb69f 2803 }
f0e871d6
PT
2804 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2805 {
2806 const GFC_REAL_16 *restrict bbase_y;
2807 GFC_REAL_16 s;
2808
2809 for (y = 0; y < ycount; y++)
2810 {
2811 bbase_y = &bbase[y*bystride];
2812 s = (GFC_REAL_16) 0;
2813 for (n = 0; n < count; n++)
2814 s += abase[n*axstride] * bbase_y[n*bxstride];
2815 dest[y*rxstride] = s;
2816 }
2817 }
1524f80b
RS
2818 else
2819 {
2820 const GFC_REAL_16 *restrict abase_x;
2821 const GFC_REAL_16 *restrict bbase_y;
2822 GFC_REAL_16 *restrict dest_y;
2823 GFC_REAL_16 s;
2824
2825 for (y = 0; y < ycount; y++)
2826 {
2827 bbase_y = &bbase[y*bystride];
2828 dest_y = &dest[y*rystride];
2829 for (x = 0; x < xcount; x++)
2830 {
2831 abase_x = &abase[x*axstride];
2832 s = (GFC_REAL_16) 0;
2833 for (n = 0; n < count; n++)
2834 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2835 dest_y[x*rxstride] = s;
2836 }
2837 }
2838 }
644cb69f 2839}
31cfd832
TK
2840#undef POW3
2841#undef min
2842#undef max
2843
644cb69f 2844#endif
31cfd832
TK
2845#endif
2846