]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_c10.c
re PR fortran/86704 (Segmentation fault when using matmul in combination with transpose)
[thirdparty/gcc.git] / libgfortran / generated / matmul_c10.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_COMPLEX_10)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
39 const int *, const GFC_COMPLEX_10 *, const int *,
40 const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
73 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_c10);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
84 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
88 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_COMPLEX_10 * restrict abase;
92 const GFC_COMPLEX_10 * restrict bbase;
93 GFC_COMPLEX_10 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory by the buffer. */
227 bystride = 256;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_COMPLEX_10 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_COMPLEX_10 *a, *b;
281 GFC_COMPLEX_10 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293 GFC_COMPLEX_10 *t1;
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
310 /* Empty c first. */
311 for (j=1; j<=n; j++)
312 for (i=1; i<=m; i++)
313 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
314
315 /* Early exit if possible */
316 if (m == 0 || n == 0 || k == 0)
317 return;
318
319 /* Adjust size of t1 to what is needed. */
320 index_type t1_dim, a_sz;
321 if (aystride == 1)
322 a_sz = rystride;
323 else
324 a_sz = a_dim1;
325
326 t1_dim = a_sz * 256 + b_dim1;
327 if (t1_dim > 65536)
328 t1_dim = 65536;
329
330 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
331
332 /* Start turning the crank. */
333 i1 = n;
334 for (jj = 1; jj <= i1; jj += 512)
335 {
336 /* Computing MIN */
337 i2 = 512;
338 i3 = n - jj + 1;
339 jsec = min(i2,i3);
340 ujsec = jsec - jsec % 4;
341 i2 = k;
342 for (ll = 1; ll <= i2; ll += 256)
343 {
344 /* Computing MIN */
345 i3 = 256;
346 i4 = k - ll + 1;
347 lsec = min(i3,i4);
348 ulsec = lsec - lsec % 2;
349
350 i3 = m;
351 for (ii = 1; ii <= i3; ii += 256)
352 {
353 /* Computing MIN */
354 i4 = 256;
355 i5 = m - ii + 1;
356 isec = min(i4,i5);
357 uisec = isec - isec % 2;
358 i4 = ll + ulsec - 1;
359 for (l = ll; l <= i4; l += 2)
360 {
361 i5 = ii + uisec - 1;
362 for (i = ii; i <= i5; i += 2)
363 {
364 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
365 a[i + l * a_dim1];
366 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
367 a[i + (l + 1) * a_dim1];
368 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
369 a[i + 1 + l * a_dim1];
370 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
371 a[i + 1 + (l + 1) * a_dim1];
372 }
373 if (uisec < isec)
374 {
375 t1[l - ll + 1 + (isec << 8) - 257] =
376 a[ii + isec - 1 + l * a_dim1];
377 t1[l - ll + 2 + (isec << 8) - 257] =
378 a[ii + isec - 1 + (l + 1) * a_dim1];
379 }
380 }
381 if (ulsec < lsec)
382 {
383 i4 = ii + isec - 1;
384 for (i = ii; i<= i4; ++i)
385 {
386 t1[lsec + ((i - ii + 1) << 8) - 257] =
387 a[i + (ll + lsec - 1) * a_dim1];
388 }
389 }
390
391 uisec = isec - isec % 4;
392 i4 = jj + ujsec - 1;
393 for (j = jj; j <= i4; j += 4)
394 {
395 i5 = ii + uisec - 1;
396 for (i = ii; i <= i5; i += 4)
397 {
398 f11 = c[i + j * c_dim1];
399 f21 = c[i + 1 + j * c_dim1];
400 f12 = c[i + (j + 1) * c_dim1];
401 f22 = c[i + 1 + (j + 1) * c_dim1];
402 f13 = c[i + (j + 2) * c_dim1];
403 f23 = c[i + 1 + (j + 2) * c_dim1];
404 f14 = c[i + (j + 3) * c_dim1];
405 f24 = c[i + 1 + (j + 3) * c_dim1];
406 f31 = c[i + 2 + j * c_dim1];
407 f41 = c[i + 3 + j * c_dim1];
408 f32 = c[i + 2 + (j + 1) * c_dim1];
409 f42 = c[i + 3 + (j + 1) * c_dim1];
410 f33 = c[i + 2 + (j + 2) * c_dim1];
411 f43 = c[i + 3 + (j + 2) * c_dim1];
412 f34 = c[i + 2 + (j + 3) * c_dim1];
413 f44 = c[i + 3 + (j + 3) * c_dim1];
414 i6 = ll + lsec - 1;
415 for (l = ll; l <= i6; ++l)
416 {
417 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
418 * b[l + j * b_dim1];
419 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
420 * b[l + j * b_dim1];
421 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
422 * b[l + (j + 1) * b_dim1];
423 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
424 * b[l + (j + 1) * b_dim1];
425 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
426 * b[l + (j + 2) * b_dim1];
427 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
428 * b[l + (j + 2) * b_dim1];
429 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 * b[l + (j + 3) * b_dim1];
431 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 * b[l + (j + 3) * b_dim1];
433 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
434 * b[l + j * b_dim1];
435 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
436 * b[l + j * b_dim1];
437 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
438 * b[l + (j + 1) * b_dim1];
439 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
440 * b[l + (j + 1) * b_dim1];
441 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
442 * b[l + (j + 2) * b_dim1];
443 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
444 * b[l + (j + 2) * b_dim1];
445 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 * b[l + (j + 3) * b_dim1];
447 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 * b[l + (j + 3) * b_dim1];
449 }
450 c[i + j * c_dim1] = f11;
451 c[i + 1 + j * c_dim1] = f21;
452 c[i + (j + 1) * c_dim1] = f12;
453 c[i + 1 + (j + 1) * c_dim1] = f22;
454 c[i + (j + 2) * c_dim1] = f13;
455 c[i + 1 + (j + 2) * c_dim1] = f23;
456 c[i + (j + 3) * c_dim1] = f14;
457 c[i + 1 + (j + 3) * c_dim1] = f24;
458 c[i + 2 + j * c_dim1] = f31;
459 c[i + 3 + j * c_dim1] = f41;
460 c[i + 2 + (j + 1) * c_dim1] = f32;
461 c[i + 3 + (j + 1) * c_dim1] = f42;
462 c[i + 2 + (j + 2) * c_dim1] = f33;
463 c[i + 3 + (j + 2) * c_dim1] = f43;
464 c[i + 2 + (j + 3) * c_dim1] = f34;
465 c[i + 3 + (j + 3) * c_dim1] = f44;
466 }
467 if (uisec < isec)
468 {
469 i5 = ii + isec - 1;
470 for (i = ii + uisec; i <= i5; ++i)
471 {
472 f11 = c[i + j * c_dim1];
473 f12 = c[i + (j + 1) * c_dim1];
474 f13 = c[i + (j + 2) * c_dim1];
475 f14 = c[i + (j + 3) * c_dim1];
476 i6 = ll + lsec - 1;
477 for (l = ll; l <= i6; ++l)
478 {
479 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + j * b_dim1];
481 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
482 257] * b[l + (j + 1) * b_dim1];
483 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
484 257] * b[l + (j + 2) * b_dim1];
485 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
486 257] * b[l + (j + 3) * b_dim1];
487 }
488 c[i + j * c_dim1] = f11;
489 c[i + (j + 1) * c_dim1] = f12;
490 c[i + (j + 2) * c_dim1] = f13;
491 c[i + (j + 3) * c_dim1] = f14;
492 }
493 }
494 }
495 if (ujsec < jsec)
496 {
497 i4 = jj + jsec - 1;
498 for (j = jj + ujsec; j <= i4; ++j)
499 {
500 i5 = ii + uisec - 1;
501 for (i = ii; i <= i5; i += 4)
502 {
503 f11 = c[i + j * c_dim1];
504 f21 = c[i + 1 + j * c_dim1];
505 f31 = c[i + 2 + j * c_dim1];
506 f41 = c[i + 3 + j * c_dim1];
507 i6 = ll + lsec - 1;
508 for (l = ll; l <= i6; ++l)
509 {
510 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 257] * b[l + j * b_dim1];
512 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
513 257] * b[l + j * b_dim1];
514 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
515 257] * b[l + j * b_dim1];
516 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
517 257] * b[l + j * b_dim1];
518 }
519 c[i + j * c_dim1] = f11;
520 c[i + 1 + j * c_dim1] = f21;
521 c[i + 2 + j * c_dim1] = f31;
522 c[i + 3 + j * c_dim1] = f41;
523 }
524 i5 = ii + isec - 1;
525 for (i = ii + uisec; i <= i5; ++i)
526 {
527 f11 = c[i + j * c_dim1];
528 i6 = ll + lsec - 1;
529 for (l = ll; l <= i6; ++l)
530 {
531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
532 257] * b[l + j * b_dim1];
533 }
534 c[i + j * c_dim1] = f11;
535 }
536 }
537 }
538 }
539 }
540 }
541 free(t1);
542 return;
543 }
544 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
545 {
546 if (GFC_DESCRIPTOR_RANK (a) != 1)
547 {
548 const GFC_COMPLEX_10 *restrict abase_x;
549 const GFC_COMPLEX_10 *restrict bbase_y;
550 GFC_COMPLEX_10 *restrict dest_y;
551 GFC_COMPLEX_10 s;
552
553 for (y = 0; y < ycount; y++)
554 {
555 bbase_y = &bbase[y*bystride];
556 dest_y = &dest[y*rystride];
557 for (x = 0; x < xcount; x++)
558 {
559 abase_x = &abase[x*axstride];
560 s = (GFC_COMPLEX_10) 0;
561 for (n = 0; n < count; n++)
562 s += abase_x[n] * bbase_y[n];
563 dest_y[x] = s;
564 }
565 }
566 }
567 else
568 {
569 const GFC_COMPLEX_10 *restrict bbase_y;
570 GFC_COMPLEX_10 s;
571
572 for (y = 0; y < ycount; y++)
573 {
574 bbase_y = &bbase[y*bystride];
575 s = (GFC_COMPLEX_10) 0;
576 for (n = 0; n < count; n++)
577 s += abase[n*axstride] * bbase_y[n];
578 dest[y*rystride] = s;
579 }
580 }
581 }
582 else if (axstride < aystride)
583 {
584 for (y = 0; y < ycount; y++)
585 for (x = 0; x < xcount; x++)
586 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
587
588 for (y = 0; y < ycount; y++)
589 for (n = 0; n < count; n++)
590 for (x = 0; x < xcount; x++)
591 /* dest[x,y] += a[x,n] * b[n,y] */
592 dest[x*rxstride + y*rystride] +=
593 abase[x*axstride + n*aystride] *
594 bbase[n*bxstride + y*bystride];
595 }
596 else if (GFC_DESCRIPTOR_RANK (a) == 1)
597 {
598 const GFC_COMPLEX_10 *restrict bbase_y;
599 GFC_COMPLEX_10 s;
600
601 for (y = 0; y < ycount; y++)
602 {
603 bbase_y = &bbase[y*bystride];
604 s = (GFC_COMPLEX_10) 0;
605 for (n = 0; n < count; n++)
606 s += abase[n*axstride] * bbase_y[n*bxstride];
607 dest[y*rxstride] = s;
608 }
609 }
610 else
611 {
612 const GFC_COMPLEX_10 *restrict abase_x;
613 const GFC_COMPLEX_10 *restrict bbase_y;
614 GFC_COMPLEX_10 *restrict dest_y;
615 GFC_COMPLEX_10 s;
616
617 for (y = 0; y < ycount; y++)
618 {
619 bbase_y = &bbase[y*bystride];
620 dest_y = &dest[y*rystride];
621 for (x = 0; x < xcount; x++)
622 {
623 abase_x = &abase[x*axstride];
624 s = (GFC_COMPLEX_10) 0;
625 for (n = 0; n < count; n++)
626 s += abase_x[n*aystride] * bbase_y[n*bxstride];
627 dest_y[x*rxstride] = s;
628 }
629 }
630 }
631 }
632 #undef POW3
633 #undef min
634 #undef max
635
636 #endif /* HAVE_AVX */
637
638 #ifdef HAVE_AVX2
639 static void
640 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
641 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
642 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
643 static void
644 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
645 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
646 int blas_limit, blas_call gemm)
647 {
648 const GFC_COMPLEX_10 * restrict abase;
649 const GFC_COMPLEX_10 * restrict bbase;
650 GFC_COMPLEX_10 * restrict dest;
651
652 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
653 index_type x, y, n, count, xcount, ycount;
654
655 assert (GFC_DESCRIPTOR_RANK (a) == 2
656 || GFC_DESCRIPTOR_RANK (b) == 2);
657
658 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
659
660 Either A or B (but not both) can be rank 1:
661
662 o One-dimensional argument A is implicitly treated as a row matrix
663 dimensioned [1,count], so xcount=1.
664
665 o One-dimensional argument B is implicitly treated as a column matrix
666 dimensioned [count, 1], so ycount=1.
667 */
668
669 if (retarray->base_addr == NULL)
670 {
671 if (GFC_DESCRIPTOR_RANK (a) == 1)
672 {
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
675 }
676 else if (GFC_DESCRIPTOR_RANK (b) == 1)
677 {
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
680 }
681 else
682 {
683 GFC_DIMENSION_SET(retarray->dim[0], 0,
684 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
685
686 GFC_DIMENSION_SET(retarray->dim[1], 0,
687 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
688 GFC_DESCRIPTOR_EXTENT(retarray,0));
689 }
690
691 retarray->base_addr
692 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
693 retarray->offset = 0;
694 }
695 else if (unlikely (compile_options.bounds_check))
696 {
697 index_type ret_extent, arg_extent;
698
699 if (GFC_DESCRIPTOR_RANK (a) == 1)
700 {
701 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
702 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
703 if (arg_extent != ret_extent)
704 runtime_error ("Incorrect extent in return array in"
705 " MATMUL intrinsic: is %ld, should be %ld",
706 (long int) ret_extent, (long int) arg_extent);
707 }
708 else if (GFC_DESCRIPTOR_RANK (b) == 1)
709 {
710 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
711 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
712 if (arg_extent != ret_extent)
713 runtime_error ("Incorrect extent in return array in"
714 " MATMUL intrinsic: is %ld, should be %ld",
715 (long int) ret_extent, (long int) arg_extent);
716 }
717 else
718 {
719 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
720 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
721 if (arg_extent != ret_extent)
722 runtime_error ("Incorrect extent in return array in"
723 " MATMUL intrinsic for dimension 1:"
724 " is %ld, should be %ld",
725 (long int) ret_extent, (long int) arg_extent);
726
727 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
728 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
729 if (arg_extent != ret_extent)
730 runtime_error ("Incorrect extent in return array in"
731 " MATMUL intrinsic for dimension 2:"
732 " is %ld, should be %ld",
733 (long int) ret_extent, (long int) arg_extent);
734 }
735 }
736
737
738 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
739 {
740 /* One-dimensional result may be addressed in the code below
741 either as a row or a column matrix. We want both cases to
742 work. */
743 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
744 }
745 else
746 {
747 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
748 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
749 }
750
751
752 if (GFC_DESCRIPTOR_RANK (a) == 1)
753 {
754 /* Treat it as a a row matrix A[1,count]. */
755 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
756 aystride = 1;
757
758 xcount = 1;
759 count = GFC_DESCRIPTOR_EXTENT(a,0);
760 }
761 else
762 {
763 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
764 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
765
766 count = GFC_DESCRIPTOR_EXTENT(a,1);
767 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
768 }
769
770 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
771 {
772 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
773 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
774 }
775
776 if (GFC_DESCRIPTOR_RANK (b) == 1)
777 {
778 /* Treat it as a column matrix B[count,1] */
779 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
780
781 /* bystride should never be used for 1-dimensional b.
782 The value is only used for calculation of the
783 memory by the buffer. */
784 bystride = 256;
785 ycount = 1;
786 }
787 else
788 {
789 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
790 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
791 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
792 }
793
794 abase = a->base_addr;
795 bbase = b->base_addr;
796 dest = retarray->base_addr;
797
798 /* Now that everything is set up, we perform the multiplication
799 itself. */
800
801 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802 #define min(a,b) ((a) <= (b) ? (a) : (b))
803 #define max(a,b) ((a) >= (b) ? (a) : (b))
804
805 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
806 && (bxstride == 1 || bystride == 1)
807 && (((float) xcount) * ((float) ycount) * ((float) count)
808 > POW3(blas_limit)))
809 {
810 const int m = xcount, n = ycount, k = count, ldc = rystride;
811 const GFC_COMPLEX_10 one = 1, zero = 0;
812 const int lda = (axstride == 1) ? aystride : axstride,
813 ldb = (bxstride == 1) ? bystride : bxstride;
814
815 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
816 {
817 assert (gemm != NULL);
818 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
819 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
820 &ldc, 1, 1);
821 return;
822 }
823 }
824
825 if (rxstride == 1 && axstride == 1 && bxstride == 1)
826 {
827 /* This block of code implements a tuned matmul, derived from
828 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
829
830 Bo Kagstrom and Per Ling
831 Department of Computing Science
832 Umea University
833 S-901 87 Umea, Sweden
834
835 from netlib.org, translated to C, and modified for matmul.m4. */
836
837 const GFC_COMPLEX_10 *a, *b;
838 GFC_COMPLEX_10 *c;
839 const index_type m = xcount, n = ycount, k = count;
840
841 /* System generated locals */
842 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
843 i1, i2, i3, i4, i5, i6;
844
845 /* Local variables */
846 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
847 f13, f14, f23, f24, f33, f34, f43, f44;
848 index_type i, j, l, ii, jj, ll;
849 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
850 GFC_COMPLEX_10 *t1;
851
852 a = abase;
853 b = bbase;
854 c = retarray->base_addr;
855
856 /* Parameter adjustments */
857 c_dim1 = rystride;
858 c_offset = 1 + c_dim1;
859 c -= c_offset;
860 a_dim1 = aystride;
861 a_offset = 1 + a_dim1;
862 a -= a_offset;
863 b_dim1 = bystride;
864 b_offset = 1 + b_dim1;
865 b -= b_offset;
866
867 /* Empty c first. */
868 for (j=1; j<=n; j++)
869 for (i=1; i<=m; i++)
870 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
871
872 /* Early exit if possible */
873 if (m == 0 || n == 0 || k == 0)
874 return;
875
876 /* Adjust size of t1 to what is needed. */
877 index_type t1_dim, a_sz;
878 if (aystride == 1)
879 a_sz = rystride;
880 else
881 a_sz = a_dim1;
882
883 t1_dim = a_sz * 256 + b_dim1;
884 if (t1_dim > 65536)
885 t1_dim = 65536;
886
887 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
888
889 /* Start turning the crank. */
890 i1 = n;
891 for (jj = 1; jj <= i1; jj += 512)
892 {
893 /* Computing MIN */
894 i2 = 512;
895 i3 = n - jj + 1;
896 jsec = min(i2,i3);
897 ujsec = jsec - jsec % 4;
898 i2 = k;
899 for (ll = 1; ll <= i2; ll += 256)
900 {
901 /* Computing MIN */
902 i3 = 256;
903 i4 = k - ll + 1;
904 lsec = min(i3,i4);
905 ulsec = lsec - lsec % 2;
906
907 i3 = m;
908 for (ii = 1; ii <= i3; ii += 256)
909 {
910 /* Computing MIN */
911 i4 = 256;
912 i5 = m - ii + 1;
913 isec = min(i4,i5);
914 uisec = isec - isec % 2;
915 i4 = ll + ulsec - 1;
916 for (l = ll; l <= i4; l += 2)
917 {
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 2)
920 {
921 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
922 a[i + l * a_dim1];
923 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
924 a[i + (l + 1) * a_dim1];
925 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
926 a[i + 1 + l * a_dim1];
927 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
928 a[i + 1 + (l + 1) * a_dim1];
929 }
930 if (uisec < isec)
931 {
932 t1[l - ll + 1 + (isec << 8) - 257] =
933 a[ii + isec - 1 + l * a_dim1];
934 t1[l - ll + 2 + (isec << 8) - 257] =
935 a[ii + isec - 1 + (l + 1) * a_dim1];
936 }
937 }
938 if (ulsec < lsec)
939 {
940 i4 = ii + isec - 1;
941 for (i = ii; i<= i4; ++i)
942 {
943 t1[lsec + ((i - ii + 1) << 8) - 257] =
944 a[i + (ll + lsec - 1) * a_dim1];
945 }
946 }
947
948 uisec = isec - isec % 4;
949 i4 = jj + ujsec - 1;
950 for (j = jj; j <= i4; j += 4)
951 {
952 i5 = ii + uisec - 1;
953 for (i = ii; i <= i5; i += 4)
954 {
955 f11 = c[i + j * c_dim1];
956 f21 = c[i + 1 + j * c_dim1];
957 f12 = c[i + (j + 1) * c_dim1];
958 f22 = c[i + 1 + (j + 1) * c_dim1];
959 f13 = c[i + (j + 2) * c_dim1];
960 f23 = c[i + 1 + (j + 2) * c_dim1];
961 f14 = c[i + (j + 3) * c_dim1];
962 f24 = c[i + 1 + (j + 3) * c_dim1];
963 f31 = c[i + 2 + j * c_dim1];
964 f41 = c[i + 3 + j * c_dim1];
965 f32 = c[i + 2 + (j + 1) * c_dim1];
966 f42 = c[i + 3 + (j + 1) * c_dim1];
967 f33 = c[i + 2 + (j + 2) * c_dim1];
968 f43 = c[i + 3 + (j + 2) * c_dim1];
969 f34 = c[i + 2 + (j + 3) * c_dim1];
970 f44 = c[i + 3 + (j + 3) * c_dim1];
971 i6 = ll + lsec - 1;
972 for (l = ll; l <= i6; ++l)
973 {
974 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + j * b_dim1];
976 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + j * b_dim1];
978 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
979 * b[l + (j + 1) * b_dim1];
980 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
981 * b[l + (j + 1) * b_dim1];
982 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
983 * b[l + (j + 2) * b_dim1];
984 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
985 * b[l + (j + 2) * b_dim1];
986 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
987 * b[l + (j + 3) * b_dim1];
988 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
989 * b[l + (j + 3) * b_dim1];
990 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + j * b_dim1];
992 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + j * b_dim1];
994 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
995 * b[l + (j + 1) * b_dim1];
996 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
997 * b[l + (j + 1) * b_dim1];
998 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
999 * b[l + (j + 2) * b_dim1];
1000 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1001 * b[l + (j + 2) * b_dim1];
1002 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1003 * b[l + (j + 3) * b_dim1];
1004 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1005 * b[l + (j + 3) * b_dim1];
1006 }
1007 c[i + j * c_dim1] = f11;
1008 c[i + 1 + j * c_dim1] = f21;
1009 c[i + (j + 1) * c_dim1] = f12;
1010 c[i + 1 + (j + 1) * c_dim1] = f22;
1011 c[i + (j + 2) * c_dim1] = f13;
1012 c[i + 1 + (j + 2) * c_dim1] = f23;
1013 c[i + (j + 3) * c_dim1] = f14;
1014 c[i + 1 + (j + 3) * c_dim1] = f24;
1015 c[i + 2 + j * c_dim1] = f31;
1016 c[i + 3 + j * c_dim1] = f41;
1017 c[i + 2 + (j + 1) * c_dim1] = f32;
1018 c[i + 3 + (j + 1) * c_dim1] = f42;
1019 c[i + 2 + (j + 2) * c_dim1] = f33;
1020 c[i + 3 + (j + 2) * c_dim1] = f43;
1021 c[i + 2 + (j + 3) * c_dim1] = f34;
1022 c[i + 3 + (j + 3) * c_dim1] = f44;
1023 }
1024 if (uisec < isec)
1025 {
1026 i5 = ii + isec - 1;
1027 for (i = ii + uisec; i <= i5; ++i)
1028 {
1029 f11 = c[i + j * c_dim1];
1030 f12 = c[i + (j + 1) * c_dim1];
1031 f13 = c[i + (j + 2) * c_dim1];
1032 f14 = c[i + (j + 3) * c_dim1];
1033 i6 = ll + lsec - 1;
1034 for (l = ll; l <= i6; ++l)
1035 {
1036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1037 257] * b[l + j * b_dim1];
1038 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1039 257] * b[l + (j + 1) * b_dim1];
1040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + (j + 2) * b_dim1];
1042 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1043 257] * b[l + (j + 3) * b_dim1];
1044 }
1045 c[i + j * c_dim1] = f11;
1046 c[i + (j + 1) * c_dim1] = f12;
1047 c[i + (j + 2) * c_dim1] = f13;
1048 c[i + (j + 3) * c_dim1] = f14;
1049 }
1050 }
1051 }
1052 if (ujsec < jsec)
1053 {
1054 i4 = jj + jsec - 1;
1055 for (j = jj + ujsec; j <= i4; ++j)
1056 {
1057 i5 = ii + uisec - 1;
1058 for (i = ii; i <= i5; i += 4)
1059 {
1060 f11 = c[i + j * c_dim1];
1061 f21 = c[i + 1 + j * c_dim1];
1062 f31 = c[i + 2 + j * c_dim1];
1063 f41 = c[i + 3 + j * c_dim1];
1064 i6 = ll + lsec - 1;
1065 for (l = ll; l <= i6; ++l)
1066 {
1067 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1068 257] * b[l + j * b_dim1];
1069 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1070 257] * b[l + j * b_dim1];
1071 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1072 257] * b[l + j * b_dim1];
1073 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1074 257] * b[l + j * b_dim1];
1075 }
1076 c[i + j * c_dim1] = f11;
1077 c[i + 1 + j * c_dim1] = f21;
1078 c[i + 2 + j * c_dim1] = f31;
1079 c[i + 3 + j * c_dim1] = f41;
1080 }
1081 i5 = ii + isec - 1;
1082 for (i = ii + uisec; i <= i5; ++i)
1083 {
1084 f11 = c[i + j * c_dim1];
1085 i6 = ll + lsec - 1;
1086 for (l = ll; l <= i6; ++l)
1087 {
1088 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1089 257] * b[l + j * b_dim1];
1090 }
1091 c[i + j * c_dim1] = f11;
1092 }
1093 }
1094 }
1095 }
1096 }
1097 }
1098 free(t1);
1099 return;
1100 }
1101 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1102 {
1103 if (GFC_DESCRIPTOR_RANK (a) != 1)
1104 {
1105 const GFC_COMPLEX_10 *restrict abase_x;
1106 const GFC_COMPLEX_10 *restrict bbase_y;
1107 GFC_COMPLEX_10 *restrict dest_y;
1108 GFC_COMPLEX_10 s;
1109
1110 for (y = 0; y < ycount; y++)
1111 {
1112 bbase_y = &bbase[y*bystride];
1113 dest_y = &dest[y*rystride];
1114 for (x = 0; x < xcount; x++)
1115 {
1116 abase_x = &abase[x*axstride];
1117 s = (GFC_COMPLEX_10) 0;
1118 for (n = 0; n < count; n++)
1119 s += abase_x[n] * bbase_y[n];
1120 dest_y[x] = s;
1121 }
1122 }
1123 }
1124 else
1125 {
1126 const GFC_COMPLEX_10 *restrict bbase_y;
1127 GFC_COMPLEX_10 s;
1128
1129 for (y = 0; y < ycount; y++)
1130 {
1131 bbase_y = &bbase[y*bystride];
1132 s = (GFC_COMPLEX_10) 0;
1133 for (n = 0; n < count; n++)
1134 s += abase[n*axstride] * bbase_y[n];
1135 dest[y*rystride] = s;
1136 }
1137 }
1138 }
1139 else if (axstride < aystride)
1140 {
1141 for (y = 0; y < ycount; y++)
1142 for (x = 0; x < xcount; x++)
1143 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1144
1145 for (y = 0; y < ycount; y++)
1146 for (n = 0; n < count; n++)
1147 for (x = 0; x < xcount; x++)
1148 /* dest[x,y] += a[x,n] * b[n,y] */
1149 dest[x*rxstride + y*rystride] +=
1150 abase[x*axstride + n*aystride] *
1151 bbase[n*bxstride + y*bystride];
1152 }
1153 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1154 {
1155 const GFC_COMPLEX_10 *restrict bbase_y;
1156 GFC_COMPLEX_10 s;
1157
1158 for (y = 0; y < ycount; y++)
1159 {
1160 bbase_y = &bbase[y*bystride];
1161 s = (GFC_COMPLEX_10) 0;
1162 for (n = 0; n < count; n++)
1163 s += abase[n*axstride] * bbase_y[n*bxstride];
1164 dest[y*rxstride] = s;
1165 }
1166 }
1167 else
1168 {
1169 const GFC_COMPLEX_10 *restrict abase_x;
1170 const GFC_COMPLEX_10 *restrict bbase_y;
1171 GFC_COMPLEX_10 *restrict dest_y;
1172 GFC_COMPLEX_10 s;
1173
1174 for (y = 0; y < ycount; y++)
1175 {
1176 bbase_y = &bbase[y*bystride];
1177 dest_y = &dest[y*rystride];
1178 for (x = 0; x < xcount; x++)
1179 {
1180 abase_x = &abase[x*axstride];
1181 s = (GFC_COMPLEX_10) 0;
1182 for (n = 0; n < count; n++)
1183 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1184 dest_y[x*rxstride] = s;
1185 }
1186 }
1187 }
1188 }
1189 #undef POW3
1190 #undef min
1191 #undef max
1192
1193 #endif /* HAVE_AVX2 */
1194
1195 #ifdef HAVE_AVX512F
1196 static void
1197 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1198 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1199 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1200 static void
1201 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1202 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1203 int blas_limit, blas_call gemm)
1204 {
1205 const GFC_COMPLEX_10 * restrict abase;
1206 const GFC_COMPLEX_10 * restrict bbase;
1207 GFC_COMPLEX_10 * restrict dest;
1208
1209 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1210 index_type x, y, n, count, xcount, ycount;
1211
1212 assert (GFC_DESCRIPTOR_RANK (a) == 2
1213 || GFC_DESCRIPTOR_RANK (b) == 2);
1214
1215 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1216
1217 Either A or B (but not both) can be rank 1:
1218
1219 o One-dimensional argument A is implicitly treated as a row matrix
1220 dimensioned [1,count], so xcount=1.
1221
1222 o One-dimensional argument B is implicitly treated as a column matrix
1223 dimensioned [count, 1], so ycount=1.
1224 */
1225
1226 if (retarray->base_addr == NULL)
1227 {
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1229 {
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1232 }
1233 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1234 {
1235 GFC_DIMENSION_SET(retarray->dim[0], 0,
1236 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1237 }
1238 else
1239 {
1240 GFC_DIMENSION_SET(retarray->dim[0], 0,
1241 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1242
1243 GFC_DIMENSION_SET(retarray->dim[1], 0,
1244 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1245 GFC_DESCRIPTOR_EXTENT(retarray,0));
1246 }
1247
1248 retarray->base_addr
1249 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1250 retarray->offset = 0;
1251 }
1252 else if (unlikely (compile_options.bounds_check))
1253 {
1254 index_type ret_extent, arg_extent;
1255
1256 if (GFC_DESCRIPTOR_RANK (a) == 1)
1257 {
1258 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1259 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1260 if (arg_extent != ret_extent)
1261 runtime_error ("Incorrect extent in return array in"
1262 " MATMUL intrinsic: is %ld, should be %ld",
1263 (long int) ret_extent, (long int) arg_extent);
1264 }
1265 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1266 {
1267 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1268 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1269 if (arg_extent != ret_extent)
1270 runtime_error ("Incorrect extent in return array in"
1271 " MATMUL intrinsic: is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1273 }
1274 else
1275 {
1276 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1277 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1278 if (arg_extent != ret_extent)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 1:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent, (long int) arg_extent);
1283
1284 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1285 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1286 if (arg_extent != ret_extent)
1287 runtime_error ("Incorrect extent in return array in"
1288 " MATMUL intrinsic for dimension 2:"
1289 " is %ld, should be %ld",
1290 (long int) ret_extent, (long int) arg_extent);
1291 }
1292 }
1293
1294
1295 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1296 {
1297 /* One-dimensional result may be addressed in the code below
1298 either as a row or a column matrix. We want both cases to
1299 work. */
1300 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1301 }
1302 else
1303 {
1304 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1305 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1306 }
1307
1308
1309 if (GFC_DESCRIPTOR_RANK (a) == 1)
1310 {
1311 /* Treat it as a a row matrix A[1,count]. */
1312 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313 aystride = 1;
1314
1315 xcount = 1;
1316 count = GFC_DESCRIPTOR_EXTENT(a,0);
1317 }
1318 else
1319 {
1320 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1321 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1322
1323 count = GFC_DESCRIPTOR_EXTENT(a,1);
1324 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1325 }
1326
1327 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1328 {
1329 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1330 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1331 }
1332
1333 if (GFC_DESCRIPTOR_RANK (b) == 1)
1334 {
1335 /* Treat it as a column matrix B[count,1] */
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337
1338 /* bystride should never be used for 1-dimensional b.
1339 The value is only used for calculation of the
1340 memory by the buffer. */
1341 bystride = 256;
1342 ycount = 1;
1343 }
1344 else
1345 {
1346 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1347 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1348 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1349 }
1350
1351 abase = a->base_addr;
1352 bbase = b->base_addr;
1353 dest = retarray->base_addr;
1354
1355 /* Now that everything is set up, we perform the multiplication
1356 itself. */
1357
1358 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359 #define min(a,b) ((a) <= (b) ? (a) : (b))
1360 #define max(a,b) ((a) >= (b) ? (a) : (b))
1361
1362 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1363 && (bxstride == 1 || bystride == 1)
1364 && (((float) xcount) * ((float) ycount) * ((float) count)
1365 > POW3(blas_limit)))
1366 {
1367 const int m = xcount, n = ycount, k = count, ldc = rystride;
1368 const GFC_COMPLEX_10 one = 1, zero = 0;
1369 const int lda = (axstride == 1) ? aystride : axstride,
1370 ldb = (bxstride == 1) ? bystride : bxstride;
1371
1372 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1373 {
1374 assert (gemm != NULL);
1375 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1376 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1377 &ldc, 1, 1);
1378 return;
1379 }
1380 }
1381
1382 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1383 {
1384 /* This block of code implements a tuned matmul, derived from
1385 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1386
1387 Bo Kagstrom and Per Ling
1388 Department of Computing Science
1389 Umea University
1390 S-901 87 Umea, Sweden
1391
1392 from netlib.org, translated to C, and modified for matmul.m4. */
1393
1394 const GFC_COMPLEX_10 *a, *b;
1395 GFC_COMPLEX_10 *c;
1396 const index_type m = xcount, n = ycount, k = count;
1397
1398 /* System generated locals */
1399 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1400 i1, i2, i3, i4, i5, i6;
1401
1402 /* Local variables */
1403 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1404 f13, f14, f23, f24, f33, f34, f43, f44;
1405 index_type i, j, l, ii, jj, ll;
1406 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1407 GFC_COMPLEX_10 *t1;
1408
1409 a = abase;
1410 b = bbase;
1411 c = retarray->base_addr;
1412
1413 /* Parameter adjustments */
1414 c_dim1 = rystride;
1415 c_offset = 1 + c_dim1;
1416 c -= c_offset;
1417 a_dim1 = aystride;
1418 a_offset = 1 + a_dim1;
1419 a -= a_offset;
1420 b_dim1 = bystride;
1421 b_offset = 1 + b_dim1;
1422 b -= b_offset;
1423
1424 /* Empty c first. */
1425 for (j=1; j<=n; j++)
1426 for (i=1; i<=m; i++)
1427 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
1428
1429 /* Early exit if possible */
1430 if (m == 0 || n == 0 || k == 0)
1431 return;
1432
1433 /* Adjust size of t1 to what is needed. */
1434 index_type t1_dim, a_sz;
1435 if (aystride == 1)
1436 a_sz = rystride;
1437 else
1438 a_sz = a_dim1;
1439
1440 t1_dim = a_sz * 256 + b_dim1;
1441 if (t1_dim > 65536)
1442 t1_dim = 65536;
1443
1444 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
1445
1446 /* Start turning the crank. */
1447 i1 = n;
1448 for (jj = 1; jj <= i1; jj += 512)
1449 {
1450 /* Computing MIN */
1451 i2 = 512;
1452 i3 = n - jj + 1;
1453 jsec = min(i2,i3);
1454 ujsec = jsec - jsec % 4;
1455 i2 = k;
1456 for (ll = 1; ll <= i2; ll += 256)
1457 {
1458 /* Computing MIN */
1459 i3 = 256;
1460 i4 = k - ll + 1;
1461 lsec = min(i3,i4);
1462 ulsec = lsec - lsec % 2;
1463
1464 i3 = m;
1465 for (ii = 1; ii <= i3; ii += 256)
1466 {
1467 /* Computing MIN */
1468 i4 = 256;
1469 i5 = m - ii + 1;
1470 isec = min(i4,i5);
1471 uisec = isec - isec % 2;
1472 i4 = ll + ulsec - 1;
1473 for (l = ll; l <= i4; l += 2)
1474 {
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 2)
1477 {
1478 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1479 a[i + l * a_dim1];
1480 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1481 a[i + (l + 1) * a_dim1];
1482 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1483 a[i + 1 + l * a_dim1];
1484 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1485 a[i + 1 + (l + 1) * a_dim1];
1486 }
1487 if (uisec < isec)
1488 {
1489 t1[l - ll + 1 + (isec << 8) - 257] =
1490 a[ii + isec - 1 + l * a_dim1];
1491 t1[l - ll + 2 + (isec << 8) - 257] =
1492 a[ii + isec - 1 + (l + 1) * a_dim1];
1493 }
1494 }
1495 if (ulsec < lsec)
1496 {
1497 i4 = ii + isec - 1;
1498 for (i = ii; i<= i4; ++i)
1499 {
1500 t1[lsec + ((i - ii + 1) << 8) - 257] =
1501 a[i + (ll + lsec - 1) * a_dim1];
1502 }
1503 }
1504
1505 uisec = isec - isec % 4;
1506 i4 = jj + ujsec - 1;
1507 for (j = jj; j <= i4; j += 4)
1508 {
1509 i5 = ii + uisec - 1;
1510 for (i = ii; i <= i5; i += 4)
1511 {
1512 f11 = c[i + j * c_dim1];
1513 f21 = c[i + 1 + j * c_dim1];
1514 f12 = c[i + (j + 1) * c_dim1];
1515 f22 = c[i + 1 + (j + 1) * c_dim1];
1516 f13 = c[i + (j + 2) * c_dim1];
1517 f23 = c[i + 1 + (j + 2) * c_dim1];
1518 f14 = c[i + (j + 3) * c_dim1];
1519 f24 = c[i + 1 + (j + 3) * c_dim1];
1520 f31 = c[i + 2 + j * c_dim1];
1521 f41 = c[i + 3 + j * c_dim1];
1522 f32 = c[i + 2 + (j + 1) * c_dim1];
1523 f42 = c[i + 3 + (j + 1) * c_dim1];
1524 f33 = c[i + 2 + (j + 2) * c_dim1];
1525 f43 = c[i + 3 + (j + 2) * c_dim1];
1526 f34 = c[i + 2 + (j + 3) * c_dim1];
1527 f44 = c[i + 3 + (j + 3) * c_dim1];
1528 i6 = ll + lsec - 1;
1529 for (l = ll; l <= i6; ++l)
1530 {
1531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1532 * b[l + j * b_dim1];
1533 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1534 * b[l + j * b_dim1];
1535 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1536 * b[l + (j + 1) * b_dim1];
1537 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1538 * b[l + (j + 1) * b_dim1];
1539 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1540 * b[l + (j + 2) * b_dim1];
1541 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1542 * b[l + (j + 2) * b_dim1];
1543 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1544 * b[l + (j + 3) * b_dim1];
1545 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1546 * b[l + (j + 3) * b_dim1];
1547 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1548 * b[l + j * b_dim1];
1549 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1550 * b[l + j * b_dim1];
1551 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1552 * b[l + (j + 1) * b_dim1];
1553 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1554 * b[l + (j + 1) * b_dim1];
1555 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1556 * b[l + (j + 2) * b_dim1];
1557 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1558 * b[l + (j + 2) * b_dim1];
1559 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1560 * b[l + (j + 3) * b_dim1];
1561 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1562 * b[l + (j + 3) * b_dim1];
1563 }
1564 c[i + j * c_dim1] = f11;
1565 c[i + 1 + j * c_dim1] = f21;
1566 c[i + (j + 1) * c_dim1] = f12;
1567 c[i + 1 + (j + 1) * c_dim1] = f22;
1568 c[i + (j + 2) * c_dim1] = f13;
1569 c[i + 1 + (j + 2) * c_dim1] = f23;
1570 c[i + (j + 3) * c_dim1] = f14;
1571 c[i + 1 + (j + 3) * c_dim1] = f24;
1572 c[i + 2 + j * c_dim1] = f31;
1573 c[i + 3 + j * c_dim1] = f41;
1574 c[i + 2 + (j + 1) * c_dim1] = f32;
1575 c[i + 3 + (j + 1) * c_dim1] = f42;
1576 c[i + 2 + (j + 2) * c_dim1] = f33;
1577 c[i + 3 + (j + 2) * c_dim1] = f43;
1578 c[i + 2 + (j + 3) * c_dim1] = f34;
1579 c[i + 3 + (j + 3) * c_dim1] = f44;
1580 }
1581 if (uisec < isec)
1582 {
1583 i5 = ii + isec - 1;
1584 for (i = ii + uisec; i <= i5; ++i)
1585 {
1586 f11 = c[i + j * c_dim1];
1587 f12 = c[i + (j + 1) * c_dim1];
1588 f13 = c[i + (j + 2) * c_dim1];
1589 f14 = c[i + (j + 3) * c_dim1];
1590 i6 = ll + lsec - 1;
1591 for (l = ll; l <= i6; ++l)
1592 {
1593 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1594 257] * b[l + j * b_dim1];
1595 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1596 257] * b[l + (j + 1) * b_dim1];
1597 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1598 257] * b[l + (j + 2) * b_dim1];
1599 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1600 257] * b[l + (j + 3) * b_dim1];
1601 }
1602 c[i + j * c_dim1] = f11;
1603 c[i + (j + 1) * c_dim1] = f12;
1604 c[i + (j + 2) * c_dim1] = f13;
1605 c[i + (j + 3) * c_dim1] = f14;
1606 }
1607 }
1608 }
1609 if (ujsec < jsec)
1610 {
1611 i4 = jj + jsec - 1;
1612 for (j = jj + ujsec; j <= i4; ++j)
1613 {
1614 i5 = ii + uisec - 1;
1615 for (i = ii; i <= i5; i += 4)
1616 {
1617 f11 = c[i + j * c_dim1];
1618 f21 = c[i + 1 + j * c_dim1];
1619 f31 = c[i + 2 + j * c_dim1];
1620 f41 = c[i + 3 + j * c_dim1];
1621 i6 = ll + lsec - 1;
1622 for (l = ll; l <= i6; ++l)
1623 {
1624 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1625 257] * b[l + j * b_dim1];
1626 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1627 257] * b[l + j * b_dim1];
1628 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1629 257] * b[l + j * b_dim1];
1630 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1631 257] * b[l + j * b_dim1];
1632 }
1633 c[i + j * c_dim1] = f11;
1634 c[i + 1 + j * c_dim1] = f21;
1635 c[i + 2 + j * c_dim1] = f31;
1636 c[i + 3 + j * c_dim1] = f41;
1637 }
1638 i5 = ii + isec - 1;
1639 for (i = ii + uisec; i <= i5; ++i)
1640 {
1641 f11 = c[i + j * c_dim1];
1642 i6 = ll + lsec - 1;
1643 for (l = ll; l <= i6; ++l)
1644 {
1645 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1646 257] * b[l + j * b_dim1];
1647 }
1648 c[i + j * c_dim1] = f11;
1649 }
1650 }
1651 }
1652 }
1653 }
1654 }
1655 free(t1);
1656 return;
1657 }
1658 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1659 {
1660 if (GFC_DESCRIPTOR_RANK (a) != 1)
1661 {
1662 const GFC_COMPLEX_10 *restrict abase_x;
1663 const GFC_COMPLEX_10 *restrict bbase_y;
1664 GFC_COMPLEX_10 *restrict dest_y;
1665 GFC_COMPLEX_10 s;
1666
1667 for (y = 0; y < ycount; y++)
1668 {
1669 bbase_y = &bbase[y*bystride];
1670 dest_y = &dest[y*rystride];
1671 for (x = 0; x < xcount; x++)
1672 {
1673 abase_x = &abase[x*axstride];
1674 s = (GFC_COMPLEX_10) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase_x[n] * bbase_y[n];
1677 dest_y[x] = s;
1678 }
1679 }
1680 }
1681 else
1682 {
1683 const GFC_COMPLEX_10 *restrict bbase_y;
1684 GFC_COMPLEX_10 s;
1685
1686 for (y = 0; y < ycount; y++)
1687 {
1688 bbase_y = &bbase[y*bystride];
1689 s = (GFC_COMPLEX_10) 0;
1690 for (n = 0; n < count; n++)
1691 s += abase[n*axstride] * bbase_y[n];
1692 dest[y*rystride] = s;
1693 }
1694 }
1695 }
1696 else if (axstride < aystride)
1697 {
1698 for (y = 0; y < ycount; y++)
1699 for (x = 0; x < xcount; x++)
1700 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1701
1702 for (y = 0; y < ycount; y++)
1703 for (n = 0; n < count; n++)
1704 for (x = 0; x < xcount; x++)
1705 /* dest[x,y] += a[x,n] * b[n,y] */
1706 dest[x*rxstride + y*rystride] +=
1707 abase[x*axstride + n*aystride] *
1708 bbase[n*bxstride + y*bystride];
1709 }
1710 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1711 {
1712 const GFC_COMPLEX_10 *restrict bbase_y;
1713 GFC_COMPLEX_10 s;
1714
1715 for (y = 0; y < ycount; y++)
1716 {
1717 bbase_y = &bbase[y*bystride];
1718 s = (GFC_COMPLEX_10) 0;
1719 for (n = 0; n < count; n++)
1720 s += abase[n*axstride] * bbase_y[n*bxstride];
1721 dest[y*rxstride] = s;
1722 }
1723 }
1724 else
1725 {
1726 const GFC_COMPLEX_10 *restrict abase_x;
1727 const GFC_COMPLEX_10 *restrict bbase_y;
1728 GFC_COMPLEX_10 *restrict dest_y;
1729 GFC_COMPLEX_10 s;
1730
1731 for (y = 0; y < ycount; y++)
1732 {
1733 bbase_y = &bbase[y*bystride];
1734 dest_y = &dest[y*rystride];
1735 for (x = 0; x < xcount; x++)
1736 {
1737 abase_x = &abase[x*axstride];
1738 s = (GFC_COMPLEX_10) 0;
1739 for (n = 0; n < count; n++)
1740 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1741 dest_y[x*rxstride] = s;
1742 }
1743 }
1744 }
1745 }
1746 #undef POW3
1747 #undef min
1748 #undef max
1749
1750 #endif /* HAVE_AVX512F */
1751
1752 /* AMD-specifix funtions with AVX128 and FMA3/FMA4. */
1753
1754 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1755 void
1756 matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
1757 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1758 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1759 internal_proto(matmul_c10_avx128_fma3);
1760 #endif
1761
1762 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1763 void
1764 matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
1765 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1766 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1767 internal_proto(matmul_c10_avx128_fma4);
1768 #endif
1769
1770 /* Function to fall back to if there is no special processor-specific version. */
1771 static void
1772 matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
1773 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1774 int blas_limit, blas_call gemm)
1775 {
1776 const GFC_COMPLEX_10 * restrict abase;
1777 const GFC_COMPLEX_10 * restrict bbase;
1778 GFC_COMPLEX_10 * restrict dest;
1779
1780 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1781 index_type x, y, n, count, xcount, ycount;
1782
1783 assert (GFC_DESCRIPTOR_RANK (a) == 2
1784 || GFC_DESCRIPTOR_RANK (b) == 2);
1785
1786 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1787
1788 Either A or B (but not both) can be rank 1:
1789
1790 o One-dimensional argument A is implicitly treated as a row matrix
1791 dimensioned [1,count], so xcount=1.
1792
1793 o One-dimensional argument B is implicitly treated as a column matrix
1794 dimensioned [count, 1], so ycount=1.
1795 */
1796
1797 if (retarray->base_addr == NULL)
1798 {
1799 if (GFC_DESCRIPTOR_RANK (a) == 1)
1800 {
1801 GFC_DIMENSION_SET(retarray->dim[0], 0,
1802 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1803 }
1804 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1805 {
1806 GFC_DIMENSION_SET(retarray->dim[0], 0,
1807 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1808 }
1809 else
1810 {
1811 GFC_DIMENSION_SET(retarray->dim[0], 0,
1812 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1813
1814 GFC_DIMENSION_SET(retarray->dim[1], 0,
1815 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1816 GFC_DESCRIPTOR_EXTENT(retarray,0));
1817 }
1818
1819 retarray->base_addr
1820 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1821 retarray->offset = 0;
1822 }
1823 else if (unlikely (compile_options.bounds_check))
1824 {
1825 index_type ret_extent, arg_extent;
1826
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1828 {
1829 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1830 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1831 if (arg_extent != ret_extent)
1832 runtime_error ("Incorrect extent in return array in"
1833 " MATMUL intrinsic: is %ld, should be %ld",
1834 (long int) ret_extent, (long int) arg_extent);
1835 }
1836 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1837 {
1838 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1839 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1840 if (arg_extent != ret_extent)
1841 runtime_error ("Incorrect extent in return array in"
1842 " MATMUL intrinsic: is %ld, should be %ld",
1843 (long int) ret_extent, (long int) arg_extent);
1844 }
1845 else
1846 {
1847 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1848 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1849 if (arg_extent != ret_extent)
1850 runtime_error ("Incorrect extent in return array in"
1851 " MATMUL intrinsic for dimension 1:"
1852 " is %ld, should be %ld",
1853 (long int) ret_extent, (long int) arg_extent);
1854
1855 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1856 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1857 if (arg_extent != ret_extent)
1858 runtime_error ("Incorrect extent in return array in"
1859 " MATMUL intrinsic for dimension 2:"
1860 " is %ld, should be %ld",
1861 (long int) ret_extent, (long int) arg_extent);
1862 }
1863 }
1864
1865
1866 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1867 {
1868 /* One-dimensional result may be addressed in the code below
1869 either as a row or a column matrix. We want both cases to
1870 work. */
1871 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1872 }
1873 else
1874 {
1875 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1876 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1877 }
1878
1879
1880 if (GFC_DESCRIPTOR_RANK (a) == 1)
1881 {
1882 /* Treat it as a a row matrix A[1,count]. */
1883 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1884 aystride = 1;
1885
1886 xcount = 1;
1887 count = GFC_DESCRIPTOR_EXTENT(a,0);
1888 }
1889 else
1890 {
1891 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1892 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1893
1894 count = GFC_DESCRIPTOR_EXTENT(a,1);
1895 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1896 }
1897
1898 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1899 {
1900 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1901 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1902 }
1903
1904 if (GFC_DESCRIPTOR_RANK (b) == 1)
1905 {
1906 /* Treat it as a column matrix B[count,1] */
1907 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1908
1909 /* bystride should never be used for 1-dimensional b.
1910 The value is only used for calculation of the
1911 memory by the buffer. */
1912 bystride = 256;
1913 ycount = 1;
1914 }
1915 else
1916 {
1917 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1918 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1919 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1920 }
1921
1922 abase = a->base_addr;
1923 bbase = b->base_addr;
1924 dest = retarray->base_addr;
1925
1926 /* Now that everything is set up, we perform the multiplication
1927 itself. */
1928
1929 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930 #define min(a,b) ((a) <= (b) ? (a) : (b))
1931 #define max(a,b) ((a) >= (b) ? (a) : (b))
1932
1933 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1934 && (bxstride == 1 || bystride == 1)
1935 && (((float) xcount) * ((float) ycount) * ((float) count)
1936 > POW3(blas_limit)))
1937 {
1938 const int m = xcount, n = ycount, k = count, ldc = rystride;
1939 const GFC_COMPLEX_10 one = 1, zero = 0;
1940 const int lda = (axstride == 1) ? aystride : axstride,
1941 ldb = (bxstride == 1) ? bystride : bxstride;
1942
1943 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1944 {
1945 assert (gemm != NULL);
1946 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1947 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1948 &ldc, 1, 1);
1949 return;
1950 }
1951 }
1952
1953 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1954 {
1955 /* This block of code implements a tuned matmul, derived from
1956 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1957
1958 Bo Kagstrom and Per Ling
1959 Department of Computing Science
1960 Umea University
1961 S-901 87 Umea, Sweden
1962
1963 from netlib.org, translated to C, and modified for matmul.m4. */
1964
1965 const GFC_COMPLEX_10 *a, *b;
1966 GFC_COMPLEX_10 *c;
1967 const index_type m = xcount, n = ycount, k = count;
1968
1969 /* System generated locals */
1970 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1971 i1, i2, i3, i4, i5, i6;
1972
1973 /* Local variables */
1974 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1975 f13, f14, f23, f24, f33, f34, f43, f44;
1976 index_type i, j, l, ii, jj, ll;
1977 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1978 GFC_COMPLEX_10 *t1;
1979
1980 a = abase;
1981 b = bbase;
1982 c = retarray->base_addr;
1983
1984 /* Parameter adjustments */
1985 c_dim1 = rystride;
1986 c_offset = 1 + c_dim1;
1987 c -= c_offset;
1988 a_dim1 = aystride;
1989 a_offset = 1 + a_dim1;
1990 a -= a_offset;
1991 b_dim1 = bystride;
1992 b_offset = 1 + b_dim1;
1993 b -= b_offset;
1994
1995 /* Empty c first. */
1996 for (j=1; j<=n; j++)
1997 for (i=1; i<=m; i++)
1998 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
1999
2000 /* Early exit if possible */
2001 if (m == 0 || n == 0 || k == 0)
2002 return;
2003
2004 /* Adjust size of t1 to what is needed. */
2005 index_type t1_dim, a_sz;
2006 if (aystride == 1)
2007 a_sz = rystride;
2008 else
2009 a_sz = a_dim1;
2010
2011 t1_dim = a_sz * 256 + b_dim1;
2012 if (t1_dim > 65536)
2013 t1_dim = 65536;
2014
2015 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2016
2017 /* Start turning the crank. */
2018 i1 = n;
2019 for (jj = 1; jj <= i1; jj += 512)
2020 {
2021 /* Computing MIN */
2022 i2 = 512;
2023 i3 = n - jj + 1;
2024 jsec = min(i2,i3);
2025 ujsec = jsec - jsec % 4;
2026 i2 = k;
2027 for (ll = 1; ll <= i2; ll += 256)
2028 {
2029 /* Computing MIN */
2030 i3 = 256;
2031 i4 = k - ll + 1;
2032 lsec = min(i3,i4);
2033 ulsec = lsec - lsec % 2;
2034
2035 i3 = m;
2036 for (ii = 1; ii <= i3; ii += 256)
2037 {
2038 /* Computing MIN */
2039 i4 = 256;
2040 i5 = m - ii + 1;
2041 isec = min(i4,i5);
2042 uisec = isec - isec % 2;
2043 i4 = ll + ulsec - 1;
2044 for (l = ll; l <= i4; l += 2)
2045 {
2046 i5 = ii + uisec - 1;
2047 for (i = ii; i <= i5; i += 2)
2048 {
2049 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2050 a[i + l * a_dim1];
2051 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2052 a[i + (l + 1) * a_dim1];
2053 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2054 a[i + 1 + l * a_dim1];
2055 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2056 a[i + 1 + (l + 1) * a_dim1];
2057 }
2058 if (uisec < isec)
2059 {
2060 t1[l - ll + 1 + (isec << 8) - 257] =
2061 a[ii + isec - 1 + l * a_dim1];
2062 t1[l - ll + 2 + (isec << 8) - 257] =
2063 a[ii + isec - 1 + (l + 1) * a_dim1];
2064 }
2065 }
2066 if (ulsec < lsec)
2067 {
2068 i4 = ii + isec - 1;
2069 for (i = ii; i<= i4; ++i)
2070 {
2071 t1[lsec + ((i - ii + 1) << 8) - 257] =
2072 a[i + (ll + lsec - 1) * a_dim1];
2073 }
2074 }
2075
2076 uisec = isec - isec % 4;
2077 i4 = jj + ujsec - 1;
2078 for (j = jj; j <= i4; j += 4)
2079 {
2080 i5 = ii + uisec - 1;
2081 for (i = ii; i <= i5; i += 4)
2082 {
2083 f11 = c[i + j * c_dim1];
2084 f21 = c[i + 1 + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f22 = c[i + 1 + (j + 1) * c_dim1];
2087 f13 = c[i + (j + 2) * c_dim1];
2088 f23 = c[i + 1 + (j + 2) * c_dim1];
2089 f14 = c[i + (j + 3) * c_dim1];
2090 f24 = c[i + 1 + (j + 3) * c_dim1];
2091 f31 = c[i + 2 + j * c_dim1];
2092 f41 = c[i + 3 + j * c_dim1];
2093 f32 = c[i + 2 + (j + 1) * c_dim1];
2094 f42 = c[i + 3 + (j + 1) * c_dim1];
2095 f33 = c[i + 2 + (j + 2) * c_dim1];
2096 f43 = c[i + 3 + (j + 2) * c_dim1];
2097 f34 = c[i + 2 + (j + 3) * c_dim1];
2098 f44 = c[i + 3 + (j + 3) * c_dim1];
2099 i6 = ll + lsec - 1;
2100 for (l = ll; l <= i6; ++l)
2101 {
2102 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2103 * b[l + j * b_dim1];
2104 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2105 * b[l + j * b_dim1];
2106 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2107 * b[l + (j + 1) * b_dim1];
2108 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2109 * b[l + (j + 1) * b_dim1];
2110 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2111 * b[l + (j + 2) * b_dim1];
2112 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2113 * b[l + (j + 2) * b_dim1];
2114 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2115 * b[l + (j + 3) * b_dim1];
2116 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2117 * b[l + (j + 3) * b_dim1];
2118 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2119 * b[l + j * b_dim1];
2120 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2121 * b[l + j * b_dim1];
2122 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2123 * b[l + (j + 1) * b_dim1];
2124 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2125 * b[l + (j + 1) * b_dim1];
2126 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2127 * b[l + (j + 2) * b_dim1];
2128 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2129 * b[l + (j + 2) * b_dim1];
2130 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2131 * b[l + (j + 3) * b_dim1];
2132 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2133 * b[l + (j + 3) * b_dim1];
2134 }
2135 c[i + j * c_dim1] = f11;
2136 c[i + 1 + j * c_dim1] = f21;
2137 c[i + (j + 1) * c_dim1] = f12;
2138 c[i + 1 + (j + 1) * c_dim1] = f22;
2139 c[i + (j + 2) * c_dim1] = f13;
2140 c[i + 1 + (j + 2) * c_dim1] = f23;
2141 c[i + (j + 3) * c_dim1] = f14;
2142 c[i + 1 + (j + 3) * c_dim1] = f24;
2143 c[i + 2 + j * c_dim1] = f31;
2144 c[i + 3 + j * c_dim1] = f41;
2145 c[i + 2 + (j + 1) * c_dim1] = f32;
2146 c[i + 3 + (j + 1) * c_dim1] = f42;
2147 c[i + 2 + (j + 2) * c_dim1] = f33;
2148 c[i + 3 + (j + 2) * c_dim1] = f43;
2149 c[i + 2 + (j + 3) * c_dim1] = f34;
2150 c[i + 3 + (j + 3) * c_dim1] = f44;
2151 }
2152 if (uisec < isec)
2153 {
2154 i5 = ii + isec - 1;
2155 for (i = ii + uisec; i <= i5; ++i)
2156 {
2157 f11 = c[i + j * c_dim1];
2158 f12 = c[i + (j + 1) * c_dim1];
2159 f13 = c[i + (j + 2) * c_dim1];
2160 f14 = c[i + (j + 3) * c_dim1];
2161 i6 = ll + lsec - 1;
2162 for (l = ll; l <= i6; ++l)
2163 {
2164 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2165 257] * b[l + j * b_dim1];
2166 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2167 257] * b[l + (j + 1) * b_dim1];
2168 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2169 257] * b[l + (j + 2) * b_dim1];
2170 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2171 257] * b[l + (j + 3) * b_dim1];
2172 }
2173 c[i + j * c_dim1] = f11;
2174 c[i + (j + 1) * c_dim1] = f12;
2175 c[i + (j + 2) * c_dim1] = f13;
2176 c[i + (j + 3) * c_dim1] = f14;
2177 }
2178 }
2179 }
2180 if (ujsec < jsec)
2181 {
2182 i4 = jj + jsec - 1;
2183 for (j = jj + ujsec; j <= i4; ++j)
2184 {
2185 i5 = ii + uisec - 1;
2186 for (i = ii; i <= i5; i += 4)
2187 {
2188 f11 = c[i + j * c_dim1];
2189 f21 = c[i + 1 + j * c_dim1];
2190 f31 = c[i + 2 + j * c_dim1];
2191 f41 = c[i + 3 + j * c_dim1];
2192 i6 = ll + lsec - 1;
2193 for (l = ll; l <= i6; ++l)
2194 {
2195 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2196 257] * b[l + j * b_dim1];
2197 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2198 257] * b[l + j * b_dim1];
2199 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2200 257] * b[l + j * b_dim1];
2201 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2202 257] * b[l + j * b_dim1];
2203 }
2204 c[i + j * c_dim1] = f11;
2205 c[i + 1 + j * c_dim1] = f21;
2206 c[i + 2 + j * c_dim1] = f31;
2207 c[i + 3 + j * c_dim1] = f41;
2208 }
2209 i5 = ii + isec - 1;
2210 for (i = ii + uisec; i <= i5; ++i)
2211 {
2212 f11 = c[i + j * c_dim1];
2213 i6 = ll + lsec - 1;
2214 for (l = ll; l <= i6; ++l)
2215 {
2216 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 257] * b[l + j * b_dim1];
2218 }
2219 c[i + j * c_dim1] = f11;
2220 }
2221 }
2222 }
2223 }
2224 }
2225 }
2226 free(t1);
2227 return;
2228 }
2229 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2230 {
2231 if (GFC_DESCRIPTOR_RANK (a) != 1)
2232 {
2233 const GFC_COMPLEX_10 *restrict abase_x;
2234 const GFC_COMPLEX_10 *restrict bbase_y;
2235 GFC_COMPLEX_10 *restrict dest_y;
2236 GFC_COMPLEX_10 s;
2237
2238 for (y = 0; y < ycount; y++)
2239 {
2240 bbase_y = &bbase[y*bystride];
2241 dest_y = &dest[y*rystride];
2242 for (x = 0; x < xcount; x++)
2243 {
2244 abase_x = &abase[x*axstride];
2245 s = (GFC_COMPLEX_10) 0;
2246 for (n = 0; n < count; n++)
2247 s += abase_x[n] * bbase_y[n];
2248 dest_y[x] = s;
2249 }
2250 }
2251 }
2252 else
2253 {
2254 const GFC_COMPLEX_10 *restrict bbase_y;
2255 GFC_COMPLEX_10 s;
2256
2257 for (y = 0; y < ycount; y++)
2258 {
2259 bbase_y = &bbase[y*bystride];
2260 s = (GFC_COMPLEX_10) 0;
2261 for (n = 0; n < count; n++)
2262 s += abase[n*axstride] * bbase_y[n];
2263 dest[y*rystride] = s;
2264 }
2265 }
2266 }
2267 else if (axstride < aystride)
2268 {
2269 for (y = 0; y < ycount; y++)
2270 for (x = 0; x < xcount; x++)
2271 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2272
2273 for (y = 0; y < ycount; y++)
2274 for (n = 0; n < count; n++)
2275 for (x = 0; x < xcount; x++)
2276 /* dest[x,y] += a[x,n] * b[n,y] */
2277 dest[x*rxstride + y*rystride] +=
2278 abase[x*axstride + n*aystride] *
2279 bbase[n*bxstride + y*bystride];
2280 }
2281 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2282 {
2283 const GFC_COMPLEX_10 *restrict bbase_y;
2284 GFC_COMPLEX_10 s;
2285
2286 for (y = 0; y < ycount; y++)
2287 {
2288 bbase_y = &bbase[y*bystride];
2289 s = (GFC_COMPLEX_10) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase[n*axstride] * bbase_y[n*bxstride];
2292 dest[y*rxstride] = s;
2293 }
2294 }
2295 else
2296 {
2297 const GFC_COMPLEX_10 *restrict abase_x;
2298 const GFC_COMPLEX_10 *restrict bbase_y;
2299 GFC_COMPLEX_10 *restrict dest_y;
2300 GFC_COMPLEX_10 s;
2301
2302 for (y = 0; y < ycount; y++)
2303 {
2304 bbase_y = &bbase[y*bystride];
2305 dest_y = &dest[y*rystride];
2306 for (x = 0; x < xcount; x++)
2307 {
2308 abase_x = &abase[x*axstride];
2309 s = (GFC_COMPLEX_10) 0;
2310 for (n = 0; n < count; n++)
2311 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2312 dest_y[x*rxstride] = s;
2313 }
2314 }
2315 }
2316 }
2317 #undef POW3
2318 #undef min
2319 #undef max
2320
2321
2322 /* Compiling main function, with selection code for the processor. */
2323
2324 /* Currently, this is i386 only. Adjust for other architectures. */
2325
2326 #include <config/i386/cpuinfo.h>
2327 void matmul_c10 (gfc_array_c10 * const restrict retarray,
2328 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2329 int blas_limit, blas_call gemm)
2330 {
2331 static void (*matmul_p) (gfc_array_c10 * const restrict retarray,
2332 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2333 int blas_limit, blas_call gemm);
2334
2335 void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
2336 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2337 int blas_limit, blas_call gemm);
2338
2339 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2340 if (matmul_fn == NULL)
2341 {
2342 matmul_fn = matmul_c10_vanilla;
2343 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2344 {
2345 /* Run down the available processors in order of preference. */
2346 #ifdef HAVE_AVX512F
2347 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2348 {
2349 matmul_fn = matmul_c10_avx512f;
2350 goto store;
2351 }
2352
2353 #endif /* HAVE_AVX512F */
2354
2355 #ifdef HAVE_AVX2
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2358 {
2359 matmul_fn = matmul_c10_avx2;
2360 goto store;
2361 }
2362
2363 #endif
2364
2365 #ifdef HAVE_AVX
2366 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2367 {
2368 matmul_fn = matmul_c10_avx;
2369 goto store;
2370 }
2371 #endif /* HAVE_AVX */
2372 }
2373 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2374 {
2375 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2377 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2378 {
2379 matmul_fn = matmul_c10_avx128_fma3;
2380 goto store;
2381 }
2382 #endif
2383 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2385 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2386 {
2387 matmul_fn = matmul_c10_avx128_fma4;
2388 goto store;
2389 }
2390 #endif
2391
2392 }
2393 store:
2394 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2395 }
2396
2397 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2398 }
2399
2400 #else /* Just the vanilla function. */
2401
2402 void
2403 matmul_c10 (gfc_array_c10 * const restrict retarray,
2404 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2405 int blas_limit, blas_call gemm)
2406 {
2407 const GFC_COMPLEX_10 * restrict abase;
2408 const GFC_COMPLEX_10 * restrict bbase;
2409 GFC_COMPLEX_10 * restrict dest;
2410
2411 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2412 index_type x, y, n, count, xcount, ycount;
2413
2414 assert (GFC_DESCRIPTOR_RANK (a) == 2
2415 || GFC_DESCRIPTOR_RANK (b) == 2);
2416
2417 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2418
2419 Either A or B (but not both) can be rank 1:
2420
2421 o One-dimensional argument A is implicitly treated as a row matrix
2422 dimensioned [1,count], so xcount=1.
2423
2424 o One-dimensional argument B is implicitly treated as a column matrix
2425 dimensioned [count, 1], so ycount=1.
2426 */
2427
2428 if (retarray->base_addr == NULL)
2429 {
2430 if (GFC_DESCRIPTOR_RANK (a) == 1)
2431 {
2432 GFC_DIMENSION_SET(retarray->dim[0], 0,
2433 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2434 }
2435 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2436 {
2437 GFC_DIMENSION_SET(retarray->dim[0], 0,
2438 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2439 }
2440 else
2441 {
2442 GFC_DIMENSION_SET(retarray->dim[0], 0,
2443 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2444
2445 GFC_DIMENSION_SET(retarray->dim[1], 0,
2446 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2447 GFC_DESCRIPTOR_EXTENT(retarray,0));
2448 }
2449
2450 retarray->base_addr
2451 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
2452 retarray->offset = 0;
2453 }
2454 else if (unlikely (compile_options.bounds_check))
2455 {
2456 index_type ret_extent, arg_extent;
2457
2458 if (GFC_DESCRIPTOR_RANK (a) == 1)
2459 {
2460 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2461 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2462 if (arg_extent != ret_extent)
2463 runtime_error ("Incorrect extent in return array in"
2464 " MATMUL intrinsic: is %ld, should be %ld",
2465 (long int) ret_extent, (long int) arg_extent);
2466 }
2467 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2468 {
2469 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2470 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2471 if (arg_extent != ret_extent)
2472 runtime_error ("Incorrect extent in return array in"
2473 " MATMUL intrinsic: is %ld, should be %ld",
2474 (long int) ret_extent, (long int) arg_extent);
2475 }
2476 else
2477 {
2478 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2479 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2480 if (arg_extent != ret_extent)
2481 runtime_error ("Incorrect extent in return array in"
2482 " MATMUL intrinsic for dimension 1:"
2483 " is %ld, should be %ld",
2484 (long int) ret_extent, (long int) arg_extent);
2485
2486 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2487 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2488 if (arg_extent != ret_extent)
2489 runtime_error ("Incorrect extent in return array in"
2490 " MATMUL intrinsic for dimension 2:"
2491 " is %ld, should be %ld",
2492 (long int) ret_extent, (long int) arg_extent);
2493 }
2494 }
2495
2496
2497 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2498 {
2499 /* One-dimensional result may be addressed in the code below
2500 either as a row or a column matrix. We want both cases to
2501 work. */
2502 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2503 }
2504 else
2505 {
2506 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2507 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2508 }
2509
2510
2511 if (GFC_DESCRIPTOR_RANK (a) == 1)
2512 {
2513 /* Treat it as a a row matrix A[1,count]. */
2514 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2515 aystride = 1;
2516
2517 xcount = 1;
2518 count = GFC_DESCRIPTOR_EXTENT(a,0);
2519 }
2520 else
2521 {
2522 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2523 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2524
2525 count = GFC_DESCRIPTOR_EXTENT(a,1);
2526 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2527 }
2528
2529 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2530 {
2531 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2532 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2533 }
2534
2535 if (GFC_DESCRIPTOR_RANK (b) == 1)
2536 {
2537 /* Treat it as a column matrix B[count,1] */
2538 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2539
2540 /* bystride should never be used for 1-dimensional b.
2541 The value is only used for calculation of the
2542 memory by the buffer. */
2543 bystride = 256;
2544 ycount = 1;
2545 }
2546 else
2547 {
2548 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2549 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2550 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2551 }
2552
2553 abase = a->base_addr;
2554 bbase = b->base_addr;
2555 dest = retarray->base_addr;
2556
2557 /* Now that everything is set up, we perform the multiplication
2558 itself. */
2559
2560 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2561 #define min(a,b) ((a) <= (b) ? (a) : (b))
2562 #define max(a,b) ((a) >= (b) ? (a) : (b))
2563
2564 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2565 && (bxstride == 1 || bystride == 1)
2566 && (((float) xcount) * ((float) ycount) * ((float) count)
2567 > POW3(blas_limit)))
2568 {
2569 const int m = xcount, n = ycount, k = count, ldc = rystride;
2570 const GFC_COMPLEX_10 one = 1, zero = 0;
2571 const int lda = (axstride == 1) ? aystride : axstride,
2572 ldb = (bxstride == 1) ? bystride : bxstride;
2573
2574 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2575 {
2576 assert (gemm != NULL);
2577 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2578 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2579 &ldc, 1, 1);
2580 return;
2581 }
2582 }
2583
2584 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2585 {
2586 /* This block of code implements a tuned matmul, derived from
2587 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2588
2589 Bo Kagstrom and Per Ling
2590 Department of Computing Science
2591 Umea University
2592 S-901 87 Umea, Sweden
2593
2594 from netlib.org, translated to C, and modified for matmul.m4. */
2595
2596 const GFC_COMPLEX_10 *a, *b;
2597 GFC_COMPLEX_10 *c;
2598 const index_type m = xcount, n = ycount, k = count;
2599
2600 /* System generated locals */
2601 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2602 i1, i2, i3, i4, i5, i6;
2603
2604 /* Local variables */
2605 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
2606 f13, f14, f23, f24, f33, f34, f43, f44;
2607 index_type i, j, l, ii, jj, ll;
2608 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2609 GFC_COMPLEX_10 *t1;
2610
2611 a = abase;
2612 b = bbase;
2613 c = retarray->base_addr;
2614
2615 /* Parameter adjustments */
2616 c_dim1 = rystride;
2617 c_offset = 1 + c_dim1;
2618 c -= c_offset;
2619 a_dim1 = aystride;
2620 a_offset = 1 + a_dim1;
2621 a -= a_offset;
2622 b_dim1 = bystride;
2623 b_offset = 1 + b_dim1;
2624 b -= b_offset;
2625
2626 /* Empty c first. */
2627 for (j=1; j<=n; j++)
2628 for (i=1; i<=m; i++)
2629 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
2630
2631 /* Early exit if possible */
2632 if (m == 0 || n == 0 || k == 0)
2633 return;
2634
2635 /* Adjust size of t1 to what is needed. */
2636 index_type t1_dim, a_sz;
2637 if (aystride == 1)
2638 a_sz = rystride;
2639 else
2640 a_sz = a_dim1;
2641
2642 t1_dim = a_sz * 256 + b_dim1;
2643 if (t1_dim > 65536)
2644 t1_dim = 65536;
2645
2646 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2647
2648 /* Start turning the crank. */
2649 i1 = n;
2650 for (jj = 1; jj <= i1; jj += 512)
2651 {
2652 /* Computing MIN */
2653 i2 = 512;
2654 i3 = n - jj + 1;
2655 jsec = min(i2,i3);
2656 ujsec = jsec - jsec % 4;
2657 i2 = k;
2658 for (ll = 1; ll <= i2; ll += 256)
2659 {
2660 /* Computing MIN */
2661 i3 = 256;
2662 i4 = k - ll + 1;
2663 lsec = min(i3,i4);
2664 ulsec = lsec - lsec % 2;
2665
2666 i3 = m;
2667 for (ii = 1; ii <= i3; ii += 256)
2668 {
2669 /* Computing MIN */
2670 i4 = 256;
2671 i5 = m - ii + 1;
2672 isec = min(i4,i5);
2673 uisec = isec - isec % 2;
2674 i4 = ll + ulsec - 1;
2675 for (l = ll; l <= i4; l += 2)
2676 {
2677 i5 = ii + uisec - 1;
2678 for (i = ii; i <= i5; i += 2)
2679 {
2680 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2681 a[i + l * a_dim1];
2682 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2683 a[i + (l + 1) * a_dim1];
2684 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2685 a[i + 1 + l * a_dim1];
2686 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2687 a[i + 1 + (l + 1) * a_dim1];
2688 }
2689 if (uisec < isec)
2690 {
2691 t1[l - ll + 1 + (isec << 8) - 257] =
2692 a[ii + isec - 1 + l * a_dim1];
2693 t1[l - ll + 2 + (isec << 8) - 257] =
2694 a[ii + isec - 1 + (l + 1) * a_dim1];
2695 }
2696 }
2697 if (ulsec < lsec)
2698 {
2699 i4 = ii + isec - 1;
2700 for (i = ii; i<= i4; ++i)
2701 {
2702 t1[lsec + ((i - ii + 1) << 8) - 257] =
2703 a[i + (ll + lsec - 1) * a_dim1];
2704 }
2705 }
2706
2707 uisec = isec - isec % 4;
2708 i4 = jj + ujsec - 1;
2709 for (j = jj; j <= i4; j += 4)
2710 {
2711 i5 = ii + uisec - 1;
2712 for (i = ii; i <= i5; i += 4)
2713 {
2714 f11 = c[i + j * c_dim1];
2715 f21 = c[i + 1 + j * c_dim1];
2716 f12 = c[i + (j + 1) * c_dim1];
2717 f22 = c[i + 1 + (j + 1) * c_dim1];
2718 f13 = c[i + (j + 2) * c_dim1];
2719 f23 = c[i + 1 + (j + 2) * c_dim1];
2720 f14 = c[i + (j + 3) * c_dim1];
2721 f24 = c[i + 1 + (j + 3) * c_dim1];
2722 f31 = c[i + 2 + j * c_dim1];
2723 f41 = c[i + 3 + j * c_dim1];
2724 f32 = c[i + 2 + (j + 1) * c_dim1];
2725 f42 = c[i + 3 + (j + 1) * c_dim1];
2726 f33 = c[i + 2 + (j + 2) * c_dim1];
2727 f43 = c[i + 3 + (j + 2) * c_dim1];
2728 f34 = c[i + 2 + (j + 3) * c_dim1];
2729 f44 = c[i + 3 + (j + 3) * c_dim1];
2730 i6 = ll + lsec - 1;
2731 for (l = ll; l <= i6; ++l)
2732 {
2733 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2734 * b[l + j * b_dim1];
2735 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2736 * b[l + j * b_dim1];
2737 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2738 * b[l + (j + 1) * b_dim1];
2739 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2740 * b[l + (j + 1) * b_dim1];
2741 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2742 * b[l + (j + 2) * b_dim1];
2743 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2744 * b[l + (j + 2) * b_dim1];
2745 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2746 * b[l + (j + 3) * b_dim1];
2747 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2748 * b[l + (j + 3) * b_dim1];
2749 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2750 * b[l + j * b_dim1];
2751 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2752 * b[l + j * b_dim1];
2753 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2754 * b[l + (j + 1) * b_dim1];
2755 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2756 * b[l + (j + 1) * b_dim1];
2757 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2758 * b[l + (j + 2) * b_dim1];
2759 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2760 * b[l + (j + 2) * b_dim1];
2761 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2762 * b[l + (j + 3) * b_dim1];
2763 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2764 * b[l + (j + 3) * b_dim1];
2765 }
2766 c[i + j * c_dim1] = f11;
2767 c[i + 1 + j * c_dim1] = f21;
2768 c[i + (j + 1) * c_dim1] = f12;
2769 c[i + 1 + (j + 1) * c_dim1] = f22;
2770 c[i + (j + 2) * c_dim1] = f13;
2771 c[i + 1 + (j + 2) * c_dim1] = f23;
2772 c[i + (j + 3) * c_dim1] = f14;
2773 c[i + 1 + (j + 3) * c_dim1] = f24;
2774 c[i + 2 + j * c_dim1] = f31;
2775 c[i + 3 + j * c_dim1] = f41;
2776 c[i + 2 + (j + 1) * c_dim1] = f32;
2777 c[i + 3 + (j + 1) * c_dim1] = f42;
2778 c[i + 2 + (j + 2) * c_dim1] = f33;
2779 c[i + 3 + (j + 2) * c_dim1] = f43;
2780 c[i + 2 + (j + 3) * c_dim1] = f34;
2781 c[i + 3 + (j + 3) * c_dim1] = f44;
2782 }
2783 if (uisec < isec)
2784 {
2785 i5 = ii + isec - 1;
2786 for (i = ii + uisec; i <= i5; ++i)
2787 {
2788 f11 = c[i + j * c_dim1];
2789 f12 = c[i + (j + 1) * c_dim1];
2790 f13 = c[i + (j + 2) * c_dim1];
2791 f14 = c[i + (j + 3) * c_dim1];
2792 i6 = ll + lsec - 1;
2793 for (l = ll; l <= i6; ++l)
2794 {
2795 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2796 257] * b[l + j * b_dim1];
2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2798 257] * b[l + (j + 1) * b_dim1];
2799 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2800 257] * b[l + (j + 2) * b_dim1];
2801 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 257] * b[l + (j + 3) * b_dim1];
2803 }
2804 c[i + j * c_dim1] = f11;
2805 c[i + (j + 1) * c_dim1] = f12;
2806 c[i + (j + 2) * c_dim1] = f13;
2807 c[i + (j + 3) * c_dim1] = f14;
2808 }
2809 }
2810 }
2811 if (ujsec < jsec)
2812 {
2813 i4 = jj + jsec - 1;
2814 for (j = jj + ujsec; j <= i4; ++j)
2815 {
2816 i5 = ii + uisec - 1;
2817 for (i = ii; i <= i5; i += 4)
2818 {
2819 f11 = c[i + j * c_dim1];
2820 f21 = c[i + 1 + j * c_dim1];
2821 f31 = c[i + 2 + j * c_dim1];
2822 f41 = c[i + 3 + j * c_dim1];
2823 i6 = ll + lsec - 1;
2824 for (l = ll; l <= i6; ++l)
2825 {
2826 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2827 257] * b[l + j * b_dim1];
2828 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2829 257] * b[l + j * b_dim1];
2830 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2831 257] * b[l + j * b_dim1];
2832 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2833 257] * b[l + j * b_dim1];
2834 }
2835 c[i + j * c_dim1] = f11;
2836 c[i + 1 + j * c_dim1] = f21;
2837 c[i + 2 + j * c_dim1] = f31;
2838 c[i + 3 + j * c_dim1] = f41;
2839 }
2840 i5 = ii + isec - 1;
2841 for (i = ii + uisec; i <= i5; ++i)
2842 {
2843 f11 = c[i + j * c_dim1];
2844 i6 = ll + lsec - 1;
2845 for (l = ll; l <= i6; ++l)
2846 {
2847 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2848 257] * b[l + j * b_dim1];
2849 }
2850 c[i + j * c_dim1] = f11;
2851 }
2852 }
2853 }
2854 }
2855 }
2856 }
2857 free(t1);
2858 return;
2859 }
2860 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2861 {
2862 if (GFC_DESCRIPTOR_RANK (a) != 1)
2863 {
2864 const GFC_COMPLEX_10 *restrict abase_x;
2865 const GFC_COMPLEX_10 *restrict bbase_y;
2866 GFC_COMPLEX_10 *restrict dest_y;
2867 GFC_COMPLEX_10 s;
2868
2869 for (y = 0; y < ycount; y++)
2870 {
2871 bbase_y = &bbase[y*bystride];
2872 dest_y = &dest[y*rystride];
2873 for (x = 0; x < xcount; x++)
2874 {
2875 abase_x = &abase[x*axstride];
2876 s = (GFC_COMPLEX_10) 0;
2877 for (n = 0; n < count; n++)
2878 s += abase_x[n] * bbase_y[n];
2879 dest_y[x] = s;
2880 }
2881 }
2882 }
2883 else
2884 {
2885 const GFC_COMPLEX_10 *restrict bbase_y;
2886 GFC_COMPLEX_10 s;
2887
2888 for (y = 0; y < ycount; y++)
2889 {
2890 bbase_y = &bbase[y*bystride];
2891 s = (GFC_COMPLEX_10) 0;
2892 for (n = 0; n < count; n++)
2893 s += abase[n*axstride] * bbase_y[n];
2894 dest[y*rystride] = s;
2895 }
2896 }
2897 }
2898 else if (axstride < aystride)
2899 {
2900 for (y = 0; y < ycount; y++)
2901 for (x = 0; x < xcount; x++)
2902 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2903
2904 for (y = 0; y < ycount; y++)
2905 for (n = 0; n < count; n++)
2906 for (x = 0; x < xcount; x++)
2907 /* dest[x,y] += a[x,n] * b[n,y] */
2908 dest[x*rxstride + y*rystride] +=
2909 abase[x*axstride + n*aystride] *
2910 bbase[n*bxstride + y*bystride];
2911 }
2912 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2913 {
2914 const GFC_COMPLEX_10 *restrict bbase_y;
2915 GFC_COMPLEX_10 s;
2916
2917 for (y = 0; y < ycount; y++)
2918 {
2919 bbase_y = &bbase[y*bystride];
2920 s = (GFC_COMPLEX_10) 0;
2921 for (n = 0; n < count; n++)
2922 s += abase[n*axstride] * bbase_y[n*bxstride];
2923 dest[y*rxstride] = s;
2924 }
2925 }
2926 else
2927 {
2928 const GFC_COMPLEX_10 *restrict abase_x;
2929 const GFC_COMPLEX_10 *restrict bbase_y;
2930 GFC_COMPLEX_10 *restrict dest_y;
2931 GFC_COMPLEX_10 s;
2932
2933 for (y = 0; y < ycount; y++)
2934 {
2935 bbase_y = &bbase[y*bystride];
2936 dest_y = &dest[y*rystride];
2937 for (x = 0; x < xcount; x++)
2938 {
2939 abase_x = &abase[x*axstride];
2940 s = (GFC_COMPLEX_10) 0;
2941 for (n = 0; n < count; n++)
2942 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2943 dest_y[x*rxstride] = s;
2944 }
2945 }
2946 }
2947 }
2948 #undef POW3
2949 #undef min
2950 #undef max
2951
2952 #endif
2953 #endif
2954