]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_c8.c
re PR libfortran/78379 (Processor-specific versions for matmul)
[thirdparty/gcc.git] / libgfortran / generated / matmul_c8.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_COMPLEX_8)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
39 const int *, const GFC_COMPLEX_8 *, const int *,
40 const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
73 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_c8);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
84 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_c8_avx (gfc_array_c8 * const restrict retarray,
88 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_COMPLEX_8 * restrict abase;
92 const GFC_COMPLEX_8 * restrict bbase;
93 GFC_COMPLEX_8 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_COMPLEX_8 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_COMPLEX_8 *a, *b;
281 GFC_COMPLEX_8 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_COMPLEX_8 t1[65536], /* was [256][256] */
290 f11, f12, f21, f22, f31, f32, f41, f42,
291 f13, f14, f23, f24, f33, f34, f43, f44;
292 index_type i, j, l, ii, jj, ll;
293 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
310 /* Early exit if possible */
311 if (m == 0 || n == 0 || k == 0)
312 return;
313
314 /* Empty c first. */
315 for (j=1; j<=n; j++)
316 for (i=1; i<=m; i++)
317 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
318
319 /* Start turning the crank. */
320 i1 = n;
321 for (jj = 1; jj <= i1; jj += 512)
322 {
323 /* Computing MIN */
324 i2 = 512;
325 i3 = n - jj + 1;
326 jsec = min(i2,i3);
327 ujsec = jsec - jsec % 4;
328 i2 = k;
329 for (ll = 1; ll <= i2; ll += 256)
330 {
331 /* Computing MIN */
332 i3 = 256;
333 i4 = k - ll + 1;
334 lsec = min(i3,i4);
335 ulsec = lsec - lsec % 2;
336
337 i3 = m;
338 for (ii = 1; ii <= i3; ii += 256)
339 {
340 /* Computing MIN */
341 i4 = 256;
342 i5 = m - ii + 1;
343 isec = min(i4,i5);
344 uisec = isec - isec % 2;
345 i4 = ll + ulsec - 1;
346 for (l = ll; l <= i4; l += 2)
347 {
348 i5 = ii + uisec - 1;
349 for (i = ii; i <= i5; i += 2)
350 {
351 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
352 a[i + l * a_dim1];
353 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
354 a[i + (l + 1) * a_dim1];
355 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
356 a[i + 1 + l * a_dim1];
357 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
358 a[i + 1 + (l + 1) * a_dim1];
359 }
360 if (uisec < isec)
361 {
362 t1[l - ll + 1 + (isec << 8) - 257] =
363 a[ii + isec - 1 + l * a_dim1];
364 t1[l - ll + 2 + (isec << 8) - 257] =
365 a[ii + isec - 1 + (l + 1) * a_dim1];
366 }
367 }
368 if (ulsec < lsec)
369 {
370 i4 = ii + isec - 1;
371 for (i = ii; i<= i4; ++i)
372 {
373 t1[lsec + ((i - ii + 1) << 8) - 257] =
374 a[i + (ll + lsec - 1) * a_dim1];
375 }
376 }
377
378 uisec = isec - isec % 4;
379 i4 = jj + ujsec - 1;
380 for (j = jj; j <= i4; j += 4)
381 {
382 i5 = ii + uisec - 1;
383 for (i = ii; i <= i5; i += 4)
384 {
385 f11 = c[i + j * c_dim1];
386 f21 = c[i + 1 + j * c_dim1];
387 f12 = c[i + (j + 1) * c_dim1];
388 f22 = c[i + 1 + (j + 1) * c_dim1];
389 f13 = c[i + (j + 2) * c_dim1];
390 f23 = c[i + 1 + (j + 2) * c_dim1];
391 f14 = c[i + (j + 3) * c_dim1];
392 f24 = c[i + 1 + (j + 3) * c_dim1];
393 f31 = c[i + 2 + j * c_dim1];
394 f41 = c[i + 3 + j * c_dim1];
395 f32 = c[i + 2 + (j + 1) * c_dim1];
396 f42 = c[i + 3 + (j + 1) * c_dim1];
397 f33 = c[i + 2 + (j + 2) * c_dim1];
398 f43 = c[i + 3 + (j + 2) * c_dim1];
399 f34 = c[i + 2 + (j + 3) * c_dim1];
400 f44 = c[i + 3 + (j + 3) * c_dim1];
401 i6 = ll + lsec - 1;
402 for (l = ll; l <= i6; ++l)
403 {
404 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405 * b[l + j * b_dim1];
406 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407 * b[l + j * b_dim1];
408 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409 * b[l + (j + 1) * b_dim1];
410 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411 * b[l + (j + 1) * b_dim1];
412 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + (j + 2) * b_dim1];
414 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + (j + 2) * b_dim1];
416 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 3) * b_dim1];
418 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 3) * b_dim1];
420 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421 * b[l + j * b_dim1];
422 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423 * b[l + j * b_dim1];
424 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425 * b[l + (j + 1) * b_dim1];
426 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427 * b[l + (j + 1) * b_dim1];
428 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + (j + 2) * b_dim1];
430 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + (j + 2) * b_dim1];
432 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 3) * b_dim1];
434 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 3) * b_dim1];
436 }
437 c[i + j * c_dim1] = f11;
438 c[i + 1 + j * c_dim1] = f21;
439 c[i + (j + 1) * c_dim1] = f12;
440 c[i + 1 + (j + 1) * c_dim1] = f22;
441 c[i + (j + 2) * c_dim1] = f13;
442 c[i + 1 + (j + 2) * c_dim1] = f23;
443 c[i + (j + 3) * c_dim1] = f14;
444 c[i + 1 + (j + 3) * c_dim1] = f24;
445 c[i + 2 + j * c_dim1] = f31;
446 c[i + 3 + j * c_dim1] = f41;
447 c[i + 2 + (j + 1) * c_dim1] = f32;
448 c[i + 3 + (j + 1) * c_dim1] = f42;
449 c[i + 2 + (j + 2) * c_dim1] = f33;
450 c[i + 3 + (j + 2) * c_dim1] = f43;
451 c[i + 2 + (j + 3) * c_dim1] = f34;
452 c[i + 3 + (j + 3) * c_dim1] = f44;
453 }
454 if (uisec < isec)
455 {
456 i5 = ii + isec - 1;
457 for (i = ii + uisec; i <= i5; ++i)
458 {
459 f11 = c[i + j * c_dim1];
460 f12 = c[i + (j + 1) * c_dim1];
461 f13 = c[i + (j + 2) * c_dim1];
462 f14 = c[i + (j + 3) * c_dim1];
463 i6 = ll + lsec - 1;
464 for (l = ll; l <= i6; ++l)
465 {
466 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
467 257] * b[l + j * b_dim1];
468 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
469 257] * b[l + (j + 1) * b_dim1];
470 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
471 257] * b[l + (j + 2) * b_dim1];
472 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
473 257] * b[l + (j + 3) * b_dim1];
474 }
475 c[i + j * c_dim1] = f11;
476 c[i + (j + 1) * c_dim1] = f12;
477 c[i + (j + 2) * c_dim1] = f13;
478 c[i + (j + 3) * c_dim1] = f14;
479 }
480 }
481 }
482 if (ujsec < jsec)
483 {
484 i4 = jj + jsec - 1;
485 for (j = jj + ujsec; j <= i4; ++j)
486 {
487 i5 = ii + uisec - 1;
488 for (i = ii; i <= i5; i += 4)
489 {
490 f11 = c[i + j * c_dim1];
491 f21 = c[i + 1 + j * c_dim1];
492 f31 = c[i + 2 + j * c_dim1];
493 f41 = c[i + 3 + j * c_dim1];
494 i6 = ll + lsec - 1;
495 for (l = ll; l <= i6; ++l)
496 {
497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 257] * b[l + j * b_dim1];
499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
500 257] * b[l + j * b_dim1];
501 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
502 257] * b[l + j * b_dim1];
503 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
504 257] * b[l + j * b_dim1];
505 }
506 c[i + j * c_dim1] = f11;
507 c[i + 1 + j * c_dim1] = f21;
508 c[i + 2 + j * c_dim1] = f31;
509 c[i + 3 + j * c_dim1] = f41;
510 }
511 i5 = ii + isec - 1;
512 for (i = ii + uisec; i <= i5; ++i)
513 {
514 f11 = c[i + j * c_dim1];
515 i6 = ll + lsec - 1;
516 for (l = ll; l <= i6; ++l)
517 {
518 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
519 257] * b[l + j * b_dim1];
520 }
521 c[i + j * c_dim1] = f11;
522 }
523 }
524 }
525 }
526 }
527 }
528 return;
529 }
530 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
531 {
532 if (GFC_DESCRIPTOR_RANK (a) != 1)
533 {
534 const GFC_COMPLEX_8 *restrict abase_x;
535 const GFC_COMPLEX_8 *restrict bbase_y;
536 GFC_COMPLEX_8 *restrict dest_y;
537 GFC_COMPLEX_8 s;
538
539 for (y = 0; y < ycount; y++)
540 {
541 bbase_y = &bbase[y*bystride];
542 dest_y = &dest[y*rystride];
543 for (x = 0; x < xcount; x++)
544 {
545 abase_x = &abase[x*axstride];
546 s = (GFC_COMPLEX_8) 0;
547 for (n = 0; n < count; n++)
548 s += abase_x[n] * bbase_y[n];
549 dest_y[x] = s;
550 }
551 }
552 }
553 else
554 {
555 const GFC_COMPLEX_8 *restrict bbase_y;
556 GFC_COMPLEX_8 s;
557
558 for (y = 0; y < ycount; y++)
559 {
560 bbase_y = &bbase[y*bystride];
561 s = (GFC_COMPLEX_8) 0;
562 for (n = 0; n < count; n++)
563 s += abase[n*axstride] * bbase_y[n];
564 dest[y*rystride] = s;
565 }
566 }
567 }
568 else if (axstride < aystride)
569 {
570 for (y = 0; y < ycount; y++)
571 for (x = 0; x < xcount; x++)
572 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
573
574 for (y = 0; y < ycount; y++)
575 for (n = 0; n < count; n++)
576 for (x = 0; x < xcount; x++)
577 /* dest[x,y] += a[x,n] * b[n,y] */
578 dest[x*rxstride + y*rystride] +=
579 abase[x*axstride + n*aystride] *
580 bbase[n*bxstride + y*bystride];
581 }
582 else if (GFC_DESCRIPTOR_RANK (a) == 1)
583 {
584 const GFC_COMPLEX_8 *restrict bbase_y;
585 GFC_COMPLEX_8 s;
586
587 for (y = 0; y < ycount; y++)
588 {
589 bbase_y = &bbase[y*bystride];
590 s = (GFC_COMPLEX_8) 0;
591 for (n = 0; n < count; n++)
592 s += abase[n*axstride] * bbase_y[n*bxstride];
593 dest[y*rxstride] = s;
594 }
595 }
596 else
597 {
598 const GFC_COMPLEX_8 *restrict abase_x;
599 const GFC_COMPLEX_8 *restrict bbase_y;
600 GFC_COMPLEX_8 *restrict dest_y;
601 GFC_COMPLEX_8 s;
602
603 for (y = 0; y < ycount; y++)
604 {
605 bbase_y = &bbase[y*bystride];
606 dest_y = &dest[y*rystride];
607 for (x = 0; x < xcount; x++)
608 {
609 abase_x = &abase[x*axstride];
610 s = (GFC_COMPLEX_8) 0;
611 for (n = 0; n < count; n++)
612 s += abase_x[n*aystride] * bbase_y[n*bxstride];
613 dest_y[x*rxstride] = s;
614 }
615 }
616 }
617 }
618 #undef POW3
619 #undef min
620 #undef max
621
622 #endif /* HAVE_AVX */
623
624 #ifdef HAVE_AVX2
625 static void
626 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
627 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
628 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
629 static void
630 matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
631 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
632 int blas_limit, blas_call gemm)
633 {
634 const GFC_COMPLEX_8 * restrict abase;
635 const GFC_COMPLEX_8 * restrict bbase;
636 GFC_COMPLEX_8 * restrict dest;
637
638 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
639 index_type x, y, n, count, xcount, ycount;
640
641 assert (GFC_DESCRIPTOR_RANK (a) == 2
642 || GFC_DESCRIPTOR_RANK (b) == 2);
643
644 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
645
646 Either A or B (but not both) can be rank 1:
647
648 o One-dimensional argument A is implicitly treated as a row matrix
649 dimensioned [1,count], so xcount=1.
650
651 o One-dimensional argument B is implicitly treated as a column matrix
652 dimensioned [count, 1], so ycount=1.
653 */
654
655 if (retarray->base_addr == NULL)
656 {
657 if (GFC_DESCRIPTOR_RANK (a) == 1)
658 {
659 GFC_DIMENSION_SET(retarray->dim[0], 0,
660 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
661 }
662 else if (GFC_DESCRIPTOR_RANK (b) == 1)
663 {
664 GFC_DIMENSION_SET(retarray->dim[0], 0,
665 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
666 }
667 else
668 {
669 GFC_DIMENSION_SET(retarray->dim[0], 0,
670 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
671
672 GFC_DIMENSION_SET(retarray->dim[1], 0,
673 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
674 GFC_DESCRIPTOR_EXTENT(retarray,0));
675 }
676
677 retarray->base_addr
678 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
679 retarray->offset = 0;
680 }
681 else if (unlikely (compile_options.bounds_check))
682 {
683 index_type ret_extent, arg_extent;
684
685 if (GFC_DESCRIPTOR_RANK (a) == 1)
686 {
687 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
690 runtime_error ("Incorrect extent in return array in"
691 " MATMUL intrinsic: is %ld, should be %ld",
692 (long int) ret_extent, (long int) arg_extent);
693 }
694 else if (GFC_DESCRIPTOR_RANK (b) == 1)
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
702 }
703 else
704 {
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic for dimension 1:"
710 " is %ld, should be %ld",
711 (long int) ret_extent, (long int) arg_extent);
712
713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
715 if (arg_extent != ret_extent)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 2:"
718 " is %ld, should be %ld",
719 (long int) ret_extent, (long int) arg_extent);
720 }
721 }
722
723
724 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
725 {
726 /* One-dimensional result may be addressed in the code below
727 either as a row or a column matrix. We want both cases to
728 work. */
729 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
730 }
731 else
732 {
733 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
734 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
735 }
736
737
738 if (GFC_DESCRIPTOR_RANK (a) == 1)
739 {
740 /* Treat it as a a row matrix A[1,count]. */
741 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
742 aystride = 1;
743
744 xcount = 1;
745 count = GFC_DESCRIPTOR_EXTENT(a,0);
746 }
747 else
748 {
749 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
750 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
751
752 count = GFC_DESCRIPTOR_EXTENT(a,1);
753 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
754 }
755
756 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
757 {
758 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
759 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
760 }
761
762 if (GFC_DESCRIPTOR_RANK (b) == 1)
763 {
764 /* Treat it as a column matrix B[count,1] */
765 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
766
767 /* bystride should never be used for 1-dimensional b.
768 in case it is we want it to cause a segfault, rather than
769 an incorrect result. */
770 bystride = 0xDEADBEEF;
771 ycount = 1;
772 }
773 else
774 {
775 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
777 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
778 }
779
780 abase = a->base_addr;
781 bbase = b->base_addr;
782 dest = retarray->base_addr;
783
784 /* Now that everything is set up, we perform the multiplication
785 itself. */
786
787 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
788 #define min(a,b) ((a) <= (b) ? (a) : (b))
789 #define max(a,b) ((a) >= (b) ? (a) : (b))
790
791 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
792 && (bxstride == 1 || bystride == 1)
793 && (((float) xcount) * ((float) ycount) * ((float) count)
794 > POW3(blas_limit)))
795 {
796 const int m = xcount, n = ycount, k = count, ldc = rystride;
797 const GFC_COMPLEX_8 one = 1, zero = 0;
798 const int lda = (axstride == 1) ? aystride : axstride,
799 ldb = (bxstride == 1) ? bystride : bxstride;
800
801 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
802 {
803 assert (gemm != NULL);
804 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
805 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
806 &ldc, 1, 1);
807 return;
808 }
809 }
810
811 if (rxstride == 1 && axstride == 1 && bxstride == 1)
812 {
813 /* This block of code implements a tuned matmul, derived from
814 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
815
816 Bo Kagstrom and Per Ling
817 Department of Computing Science
818 Umea University
819 S-901 87 Umea, Sweden
820
821 from netlib.org, translated to C, and modified for matmul.m4. */
822
823 const GFC_COMPLEX_8 *a, *b;
824 GFC_COMPLEX_8 *c;
825 const index_type m = xcount, n = ycount, k = count;
826
827 /* System generated locals */
828 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
829 i1, i2, i3, i4, i5, i6;
830
831 /* Local variables */
832 GFC_COMPLEX_8 t1[65536], /* was [256][256] */
833 f11, f12, f21, f22, f31, f32, f41, f42,
834 f13, f14, f23, f24, f33, f34, f43, f44;
835 index_type i, j, l, ii, jj, ll;
836 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
837
838 a = abase;
839 b = bbase;
840 c = retarray->base_addr;
841
842 /* Parameter adjustments */
843 c_dim1 = rystride;
844 c_offset = 1 + c_dim1;
845 c -= c_offset;
846 a_dim1 = aystride;
847 a_offset = 1 + a_dim1;
848 a -= a_offset;
849 b_dim1 = bystride;
850 b_offset = 1 + b_dim1;
851 b -= b_offset;
852
853 /* Early exit if possible */
854 if (m == 0 || n == 0 || k == 0)
855 return;
856
857 /* Empty c first. */
858 for (j=1; j<=n; j++)
859 for (i=1; i<=m; i++)
860 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
861
862 /* Start turning the crank. */
863 i1 = n;
864 for (jj = 1; jj <= i1; jj += 512)
865 {
866 /* Computing MIN */
867 i2 = 512;
868 i3 = n - jj + 1;
869 jsec = min(i2,i3);
870 ujsec = jsec - jsec % 4;
871 i2 = k;
872 for (ll = 1; ll <= i2; ll += 256)
873 {
874 /* Computing MIN */
875 i3 = 256;
876 i4 = k - ll + 1;
877 lsec = min(i3,i4);
878 ulsec = lsec - lsec % 2;
879
880 i3 = m;
881 for (ii = 1; ii <= i3; ii += 256)
882 {
883 /* Computing MIN */
884 i4 = 256;
885 i5 = m - ii + 1;
886 isec = min(i4,i5);
887 uisec = isec - isec % 2;
888 i4 = ll + ulsec - 1;
889 for (l = ll; l <= i4; l += 2)
890 {
891 i5 = ii + uisec - 1;
892 for (i = ii; i <= i5; i += 2)
893 {
894 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
895 a[i + l * a_dim1];
896 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
897 a[i + (l + 1) * a_dim1];
898 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
899 a[i + 1 + l * a_dim1];
900 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
901 a[i + 1 + (l + 1) * a_dim1];
902 }
903 if (uisec < isec)
904 {
905 t1[l - ll + 1 + (isec << 8) - 257] =
906 a[ii + isec - 1 + l * a_dim1];
907 t1[l - ll + 2 + (isec << 8) - 257] =
908 a[ii + isec - 1 + (l + 1) * a_dim1];
909 }
910 }
911 if (ulsec < lsec)
912 {
913 i4 = ii + isec - 1;
914 for (i = ii; i<= i4; ++i)
915 {
916 t1[lsec + ((i - ii + 1) << 8) - 257] =
917 a[i + (ll + lsec - 1) * a_dim1];
918 }
919 }
920
921 uisec = isec - isec % 4;
922 i4 = jj + ujsec - 1;
923 for (j = jj; j <= i4; j += 4)
924 {
925 i5 = ii + uisec - 1;
926 for (i = ii; i <= i5; i += 4)
927 {
928 f11 = c[i + j * c_dim1];
929 f21 = c[i + 1 + j * c_dim1];
930 f12 = c[i + (j + 1) * c_dim1];
931 f22 = c[i + 1 + (j + 1) * c_dim1];
932 f13 = c[i + (j + 2) * c_dim1];
933 f23 = c[i + 1 + (j + 2) * c_dim1];
934 f14 = c[i + (j + 3) * c_dim1];
935 f24 = c[i + 1 + (j + 3) * c_dim1];
936 f31 = c[i + 2 + j * c_dim1];
937 f41 = c[i + 3 + j * c_dim1];
938 f32 = c[i + 2 + (j + 1) * c_dim1];
939 f42 = c[i + 3 + (j + 1) * c_dim1];
940 f33 = c[i + 2 + (j + 2) * c_dim1];
941 f43 = c[i + 3 + (j + 2) * c_dim1];
942 f34 = c[i + 2 + (j + 3) * c_dim1];
943 f44 = c[i + 3 + (j + 3) * c_dim1];
944 i6 = ll + lsec - 1;
945 for (l = ll; l <= i6; ++l)
946 {
947 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
948 * b[l + j * b_dim1];
949 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
950 * b[l + j * b_dim1];
951 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
952 * b[l + (j + 1) * b_dim1];
953 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
954 * b[l + (j + 1) * b_dim1];
955 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
956 * b[l + (j + 2) * b_dim1];
957 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
958 * b[l + (j + 2) * b_dim1];
959 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
960 * b[l + (j + 3) * b_dim1];
961 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
962 * b[l + (j + 3) * b_dim1];
963 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
964 * b[l + j * b_dim1];
965 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
966 * b[l + j * b_dim1];
967 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
968 * b[l + (j + 1) * b_dim1];
969 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
970 * b[l + (j + 1) * b_dim1];
971 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
972 * b[l + (j + 2) * b_dim1];
973 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
974 * b[l + (j + 2) * b_dim1];
975 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
976 * b[l + (j + 3) * b_dim1];
977 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
978 * b[l + (j + 3) * b_dim1];
979 }
980 c[i + j * c_dim1] = f11;
981 c[i + 1 + j * c_dim1] = f21;
982 c[i + (j + 1) * c_dim1] = f12;
983 c[i + 1 + (j + 1) * c_dim1] = f22;
984 c[i + (j + 2) * c_dim1] = f13;
985 c[i + 1 + (j + 2) * c_dim1] = f23;
986 c[i + (j + 3) * c_dim1] = f14;
987 c[i + 1 + (j + 3) * c_dim1] = f24;
988 c[i + 2 + j * c_dim1] = f31;
989 c[i + 3 + j * c_dim1] = f41;
990 c[i + 2 + (j + 1) * c_dim1] = f32;
991 c[i + 3 + (j + 1) * c_dim1] = f42;
992 c[i + 2 + (j + 2) * c_dim1] = f33;
993 c[i + 3 + (j + 2) * c_dim1] = f43;
994 c[i + 2 + (j + 3) * c_dim1] = f34;
995 c[i + 3 + (j + 3) * c_dim1] = f44;
996 }
997 if (uisec < isec)
998 {
999 i5 = ii + isec - 1;
1000 for (i = ii + uisec; i <= i5; ++i)
1001 {
1002 f11 = c[i + j * c_dim1];
1003 f12 = c[i + (j + 1) * c_dim1];
1004 f13 = c[i + (j + 2) * c_dim1];
1005 f14 = c[i + (j + 3) * c_dim1];
1006 i6 = ll + lsec - 1;
1007 for (l = ll; l <= i6; ++l)
1008 {
1009 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010 257] * b[l + j * b_dim1];
1011 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012 257] * b[l + (j + 1) * b_dim1];
1013 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014 257] * b[l + (j + 2) * b_dim1];
1015 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016 257] * b[l + (j + 3) * b_dim1];
1017 }
1018 c[i + j * c_dim1] = f11;
1019 c[i + (j + 1) * c_dim1] = f12;
1020 c[i + (j + 2) * c_dim1] = f13;
1021 c[i + (j + 3) * c_dim1] = f14;
1022 }
1023 }
1024 }
1025 if (ujsec < jsec)
1026 {
1027 i4 = jj + jsec - 1;
1028 for (j = jj + ujsec; j <= i4; ++j)
1029 {
1030 i5 = ii + uisec - 1;
1031 for (i = ii; i <= i5; i += 4)
1032 {
1033 f11 = c[i + j * c_dim1];
1034 f21 = c[i + 1 + j * c_dim1];
1035 f31 = c[i + 2 + j * c_dim1];
1036 f41 = c[i + 3 + j * c_dim1];
1037 i6 = ll + lsec - 1;
1038 for (l = ll; l <= i6; ++l)
1039 {
1040 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + j * b_dim1];
1042 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043 257] * b[l + j * b_dim1];
1044 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045 257] * b[l + j * b_dim1];
1046 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047 257] * b[l + j * b_dim1];
1048 }
1049 c[i + j * c_dim1] = f11;
1050 c[i + 1 + j * c_dim1] = f21;
1051 c[i + 2 + j * c_dim1] = f31;
1052 c[i + 3 + j * c_dim1] = f41;
1053 }
1054 i5 = ii + isec - 1;
1055 for (i = ii + uisec; i <= i5; ++i)
1056 {
1057 f11 = c[i + j * c_dim1];
1058 i6 = ll + lsec - 1;
1059 for (l = ll; l <= i6; ++l)
1060 {
1061 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062 257] * b[l + j * b_dim1];
1063 }
1064 c[i + j * c_dim1] = f11;
1065 }
1066 }
1067 }
1068 }
1069 }
1070 }
1071 return;
1072 }
1073 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1074 {
1075 if (GFC_DESCRIPTOR_RANK (a) != 1)
1076 {
1077 const GFC_COMPLEX_8 *restrict abase_x;
1078 const GFC_COMPLEX_8 *restrict bbase_y;
1079 GFC_COMPLEX_8 *restrict dest_y;
1080 GFC_COMPLEX_8 s;
1081
1082 for (y = 0; y < ycount; y++)
1083 {
1084 bbase_y = &bbase[y*bystride];
1085 dest_y = &dest[y*rystride];
1086 for (x = 0; x < xcount; x++)
1087 {
1088 abase_x = &abase[x*axstride];
1089 s = (GFC_COMPLEX_8) 0;
1090 for (n = 0; n < count; n++)
1091 s += abase_x[n] * bbase_y[n];
1092 dest_y[x] = s;
1093 }
1094 }
1095 }
1096 else
1097 {
1098 const GFC_COMPLEX_8 *restrict bbase_y;
1099 GFC_COMPLEX_8 s;
1100
1101 for (y = 0; y < ycount; y++)
1102 {
1103 bbase_y = &bbase[y*bystride];
1104 s = (GFC_COMPLEX_8) 0;
1105 for (n = 0; n < count; n++)
1106 s += abase[n*axstride] * bbase_y[n];
1107 dest[y*rystride] = s;
1108 }
1109 }
1110 }
1111 else if (axstride < aystride)
1112 {
1113 for (y = 0; y < ycount; y++)
1114 for (x = 0; x < xcount; x++)
1115 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1116
1117 for (y = 0; y < ycount; y++)
1118 for (n = 0; n < count; n++)
1119 for (x = 0; x < xcount; x++)
1120 /* dest[x,y] += a[x,n] * b[n,y] */
1121 dest[x*rxstride + y*rystride] +=
1122 abase[x*axstride + n*aystride] *
1123 bbase[n*bxstride + y*bystride];
1124 }
1125 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1126 {
1127 const GFC_COMPLEX_8 *restrict bbase_y;
1128 GFC_COMPLEX_8 s;
1129
1130 for (y = 0; y < ycount; y++)
1131 {
1132 bbase_y = &bbase[y*bystride];
1133 s = (GFC_COMPLEX_8) 0;
1134 for (n = 0; n < count; n++)
1135 s += abase[n*axstride] * bbase_y[n*bxstride];
1136 dest[y*rxstride] = s;
1137 }
1138 }
1139 else
1140 {
1141 const GFC_COMPLEX_8 *restrict abase_x;
1142 const GFC_COMPLEX_8 *restrict bbase_y;
1143 GFC_COMPLEX_8 *restrict dest_y;
1144 GFC_COMPLEX_8 s;
1145
1146 for (y = 0; y < ycount; y++)
1147 {
1148 bbase_y = &bbase[y*bystride];
1149 dest_y = &dest[y*rystride];
1150 for (x = 0; x < xcount; x++)
1151 {
1152 abase_x = &abase[x*axstride];
1153 s = (GFC_COMPLEX_8) 0;
1154 for (n = 0; n < count; n++)
1155 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156 dest_y[x*rxstride] = s;
1157 }
1158 }
1159 }
1160 }
1161 #undef POW3
1162 #undef min
1163 #undef max
1164
1165 #endif /* HAVE_AVX2 */
1166
1167 #ifdef HAVE_AVX512F
1168 static void
1169 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1170 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1171 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172 static void
1173 matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
1174 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1175 int blas_limit, blas_call gemm)
1176 {
1177 const GFC_COMPLEX_8 * restrict abase;
1178 const GFC_COMPLEX_8 * restrict bbase;
1179 GFC_COMPLEX_8 * restrict dest;
1180
1181 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182 index_type x, y, n, count, xcount, ycount;
1183
1184 assert (GFC_DESCRIPTOR_RANK (a) == 2
1185 || GFC_DESCRIPTOR_RANK (b) == 2);
1186
1187 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1188
1189 Either A or B (but not both) can be rank 1:
1190
1191 o One-dimensional argument A is implicitly treated as a row matrix
1192 dimensioned [1,count], so xcount=1.
1193
1194 o One-dimensional argument B is implicitly treated as a column matrix
1195 dimensioned [count, 1], so ycount=1.
1196 */
1197
1198 if (retarray->base_addr == NULL)
1199 {
1200 if (GFC_DESCRIPTOR_RANK (a) == 1)
1201 {
1202 GFC_DIMENSION_SET(retarray->dim[0], 0,
1203 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1204 }
1205 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1206 {
1207 GFC_DIMENSION_SET(retarray->dim[0], 0,
1208 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1209 }
1210 else
1211 {
1212 GFC_DIMENSION_SET(retarray->dim[0], 0,
1213 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1214
1215 GFC_DIMENSION_SET(retarray->dim[1], 0,
1216 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217 GFC_DESCRIPTOR_EXTENT(retarray,0));
1218 }
1219
1220 retarray->base_addr
1221 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1222 retarray->offset = 0;
1223 }
1224 else if (unlikely (compile_options.bounds_check))
1225 {
1226 index_type ret_extent, arg_extent;
1227
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1229 {
1230 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232 if (arg_extent != ret_extent)
1233 runtime_error ("Incorrect extent in return array in"
1234 " MATMUL intrinsic: is %ld, should be %ld",
1235 (long int) ret_extent, (long int) arg_extent);
1236 }
1237 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1238 {
1239 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241 if (arg_extent != ret_extent)
1242 runtime_error ("Incorrect extent in return array in"
1243 " MATMUL intrinsic: is %ld, should be %ld",
1244 (long int) ret_extent, (long int) arg_extent);
1245 }
1246 else
1247 {
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic for dimension 1:"
1253 " is %ld, should be %ld",
1254 (long int) ret_extent, (long int) arg_extent);
1255
1256 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258 if (arg_extent != ret_extent)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 2:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1263 }
1264 }
1265
1266
1267 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1268 {
1269 /* One-dimensional result may be addressed in the code below
1270 either as a row or a column matrix. We want both cases to
1271 work. */
1272 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1273 }
1274 else
1275 {
1276 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1278 }
1279
1280
1281 if (GFC_DESCRIPTOR_RANK (a) == 1)
1282 {
1283 /* Treat it as a a row matrix A[1,count]. */
1284 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285 aystride = 1;
1286
1287 xcount = 1;
1288 count = GFC_DESCRIPTOR_EXTENT(a,0);
1289 }
1290 else
1291 {
1292 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1294
1295 count = GFC_DESCRIPTOR_EXTENT(a,1);
1296 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1297 }
1298
1299 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1300 {
1301 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1303 }
1304
1305 if (GFC_DESCRIPTOR_RANK (b) == 1)
1306 {
1307 /* Treat it as a column matrix B[count,1] */
1308 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1309
1310 /* bystride should never be used for 1-dimensional b.
1311 in case it is we want it to cause a segfault, rather than
1312 an incorrect result. */
1313 bystride = 0xDEADBEEF;
1314 ycount = 1;
1315 }
1316 else
1317 {
1318 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1321 }
1322
1323 abase = a->base_addr;
1324 bbase = b->base_addr;
1325 dest = retarray->base_addr;
1326
1327 /* Now that everything is set up, we perform the multiplication
1328 itself. */
1329
1330 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331 #define min(a,b) ((a) <= (b) ? (a) : (b))
1332 #define max(a,b) ((a) >= (b) ? (a) : (b))
1333
1334 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335 && (bxstride == 1 || bystride == 1)
1336 && (((float) xcount) * ((float) ycount) * ((float) count)
1337 > POW3(blas_limit)))
1338 {
1339 const int m = xcount, n = ycount, k = count, ldc = rystride;
1340 const GFC_COMPLEX_8 one = 1, zero = 0;
1341 const int lda = (axstride == 1) ? aystride : axstride,
1342 ldb = (bxstride == 1) ? bystride : bxstride;
1343
1344 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1345 {
1346 assert (gemm != NULL);
1347 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349 &ldc, 1, 1);
1350 return;
1351 }
1352 }
1353
1354 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1355 {
1356 /* This block of code implements a tuned matmul, derived from
1357 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1358
1359 Bo Kagstrom and Per Ling
1360 Department of Computing Science
1361 Umea University
1362 S-901 87 Umea, Sweden
1363
1364 from netlib.org, translated to C, and modified for matmul.m4. */
1365
1366 const GFC_COMPLEX_8 *a, *b;
1367 GFC_COMPLEX_8 *c;
1368 const index_type m = xcount, n = ycount, k = count;
1369
1370 /* System generated locals */
1371 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372 i1, i2, i3, i4, i5, i6;
1373
1374 /* Local variables */
1375 GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1376 f11, f12, f21, f22, f31, f32, f41, f42,
1377 f13, f14, f23, f24, f33, f34, f43, f44;
1378 index_type i, j, l, ii, jj, ll;
1379 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1380
1381 a = abase;
1382 b = bbase;
1383 c = retarray->base_addr;
1384
1385 /* Parameter adjustments */
1386 c_dim1 = rystride;
1387 c_offset = 1 + c_dim1;
1388 c -= c_offset;
1389 a_dim1 = aystride;
1390 a_offset = 1 + a_dim1;
1391 a -= a_offset;
1392 b_dim1 = bystride;
1393 b_offset = 1 + b_dim1;
1394 b -= b_offset;
1395
1396 /* Early exit if possible */
1397 if (m == 0 || n == 0 || k == 0)
1398 return;
1399
1400 /* Empty c first. */
1401 for (j=1; j<=n; j++)
1402 for (i=1; i<=m; i++)
1403 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1404
1405 /* Start turning the crank. */
1406 i1 = n;
1407 for (jj = 1; jj <= i1; jj += 512)
1408 {
1409 /* Computing MIN */
1410 i2 = 512;
1411 i3 = n - jj + 1;
1412 jsec = min(i2,i3);
1413 ujsec = jsec - jsec % 4;
1414 i2 = k;
1415 for (ll = 1; ll <= i2; ll += 256)
1416 {
1417 /* Computing MIN */
1418 i3 = 256;
1419 i4 = k - ll + 1;
1420 lsec = min(i3,i4);
1421 ulsec = lsec - lsec % 2;
1422
1423 i3 = m;
1424 for (ii = 1; ii <= i3; ii += 256)
1425 {
1426 /* Computing MIN */
1427 i4 = 256;
1428 i5 = m - ii + 1;
1429 isec = min(i4,i5);
1430 uisec = isec - isec % 2;
1431 i4 = ll + ulsec - 1;
1432 for (l = ll; l <= i4; l += 2)
1433 {
1434 i5 = ii + uisec - 1;
1435 for (i = ii; i <= i5; i += 2)
1436 {
1437 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438 a[i + l * a_dim1];
1439 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440 a[i + (l + 1) * a_dim1];
1441 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442 a[i + 1 + l * a_dim1];
1443 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444 a[i + 1 + (l + 1) * a_dim1];
1445 }
1446 if (uisec < isec)
1447 {
1448 t1[l - ll + 1 + (isec << 8) - 257] =
1449 a[ii + isec - 1 + l * a_dim1];
1450 t1[l - ll + 2 + (isec << 8) - 257] =
1451 a[ii + isec - 1 + (l + 1) * a_dim1];
1452 }
1453 }
1454 if (ulsec < lsec)
1455 {
1456 i4 = ii + isec - 1;
1457 for (i = ii; i<= i4; ++i)
1458 {
1459 t1[lsec + ((i - ii + 1) << 8) - 257] =
1460 a[i + (ll + lsec - 1) * a_dim1];
1461 }
1462 }
1463
1464 uisec = isec - isec % 4;
1465 i4 = jj + ujsec - 1;
1466 for (j = jj; j <= i4; j += 4)
1467 {
1468 i5 = ii + uisec - 1;
1469 for (i = ii; i <= i5; i += 4)
1470 {
1471 f11 = c[i + j * c_dim1];
1472 f21 = c[i + 1 + j * c_dim1];
1473 f12 = c[i + (j + 1) * c_dim1];
1474 f22 = c[i + 1 + (j + 1) * c_dim1];
1475 f13 = c[i + (j + 2) * c_dim1];
1476 f23 = c[i + 1 + (j + 2) * c_dim1];
1477 f14 = c[i + (j + 3) * c_dim1];
1478 f24 = c[i + 1 + (j + 3) * c_dim1];
1479 f31 = c[i + 2 + j * c_dim1];
1480 f41 = c[i + 3 + j * c_dim1];
1481 f32 = c[i + 2 + (j + 1) * c_dim1];
1482 f42 = c[i + 3 + (j + 1) * c_dim1];
1483 f33 = c[i + 2 + (j + 2) * c_dim1];
1484 f43 = c[i + 3 + (j + 2) * c_dim1];
1485 f34 = c[i + 2 + (j + 3) * c_dim1];
1486 f44 = c[i + 3 + (j + 3) * c_dim1];
1487 i6 = ll + lsec - 1;
1488 for (l = ll; l <= i6; ++l)
1489 {
1490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491 * b[l + j * b_dim1];
1492 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493 * b[l + j * b_dim1];
1494 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495 * b[l + (j + 1) * b_dim1];
1496 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497 * b[l + (j + 1) * b_dim1];
1498 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499 * b[l + (j + 2) * b_dim1];
1500 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501 * b[l + (j + 2) * b_dim1];
1502 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503 * b[l + (j + 3) * b_dim1];
1504 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505 * b[l + (j + 3) * b_dim1];
1506 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507 * b[l + j * b_dim1];
1508 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509 * b[l + j * b_dim1];
1510 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511 * b[l + (j + 1) * b_dim1];
1512 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513 * b[l + (j + 1) * b_dim1];
1514 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515 * b[l + (j + 2) * b_dim1];
1516 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517 * b[l + (j + 2) * b_dim1];
1518 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519 * b[l + (j + 3) * b_dim1];
1520 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521 * b[l + (j + 3) * b_dim1];
1522 }
1523 c[i + j * c_dim1] = f11;
1524 c[i + 1 + j * c_dim1] = f21;
1525 c[i + (j + 1) * c_dim1] = f12;
1526 c[i + 1 + (j + 1) * c_dim1] = f22;
1527 c[i + (j + 2) * c_dim1] = f13;
1528 c[i + 1 + (j + 2) * c_dim1] = f23;
1529 c[i + (j + 3) * c_dim1] = f14;
1530 c[i + 1 + (j + 3) * c_dim1] = f24;
1531 c[i + 2 + j * c_dim1] = f31;
1532 c[i + 3 + j * c_dim1] = f41;
1533 c[i + 2 + (j + 1) * c_dim1] = f32;
1534 c[i + 3 + (j + 1) * c_dim1] = f42;
1535 c[i + 2 + (j + 2) * c_dim1] = f33;
1536 c[i + 3 + (j + 2) * c_dim1] = f43;
1537 c[i + 2 + (j + 3) * c_dim1] = f34;
1538 c[i + 3 + (j + 3) * c_dim1] = f44;
1539 }
1540 if (uisec < isec)
1541 {
1542 i5 = ii + isec - 1;
1543 for (i = ii + uisec; i <= i5; ++i)
1544 {
1545 f11 = c[i + j * c_dim1];
1546 f12 = c[i + (j + 1) * c_dim1];
1547 f13 = c[i + (j + 2) * c_dim1];
1548 f14 = c[i + (j + 3) * c_dim1];
1549 i6 = ll + lsec - 1;
1550 for (l = ll; l <= i6; ++l)
1551 {
1552 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553 257] * b[l + j * b_dim1];
1554 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555 257] * b[l + (j + 1) * b_dim1];
1556 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557 257] * b[l + (j + 2) * b_dim1];
1558 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559 257] * b[l + (j + 3) * b_dim1];
1560 }
1561 c[i + j * c_dim1] = f11;
1562 c[i + (j + 1) * c_dim1] = f12;
1563 c[i + (j + 2) * c_dim1] = f13;
1564 c[i + (j + 3) * c_dim1] = f14;
1565 }
1566 }
1567 }
1568 if (ujsec < jsec)
1569 {
1570 i4 = jj + jsec - 1;
1571 for (j = jj + ujsec; j <= i4; ++j)
1572 {
1573 i5 = ii + uisec - 1;
1574 for (i = ii; i <= i5; i += 4)
1575 {
1576 f11 = c[i + j * c_dim1];
1577 f21 = c[i + 1 + j * c_dim1];
1578 f31 = c[i + 2 + j * c_dim1];
1579 f41 = c[i + 3 + j * c_dim1];
1580 i6 = ll + lsec - 1;
1581 for (l = ll; l <= i6; ++l)
1582 {
1583 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584 257] * b[l + j * b_dim1];
1585 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586 257] * b[l + j * b_dim1];
1587 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588 257] * b[l + j * b_dim1];
1589 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590 257] * b[l + j * b_dim1];
1591 }
1592 c[i + j * c_dim1] = f11;
1593 c[i + 1 + j * c_dim1] = f21;
1594 c[i + 2 + j * c_dim1] = f31;
1595 c[i + 3 + j * c_dim1] = f41;
1596 }
1597 i5 = ii + isec - 1;
1598 for (i = ii + uisec; i <= i5; ++i)
1599 {
1600 f11 = c[i + j * c_dim1];
1601 i6 = ll + lsec - 1;
1602 for (l = ll; l <= i6; ++l)
1603 {
1604 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605 257] * b[l + j * b_dim1];
1606 }
1607 c[i + j * c_dim1] = f11;
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 return;
1615 }
1616 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1617 {
1618 if (GFC_DESCRIPTOR_RANK (a) != 1)
1619 {
1620 const GFC_COMPLEX_8 *restrict abase_x;
1621 const GFC_COMPLEX_8 *restrict bbase_y;
1622 GFC_COMPLEX_8 *restrict dest_y;
1623 GFC_COMPLEX_8 s;
1624
1625 for (y = 0; y < ycount; y++)
1626 {
1627 bbase_y = &bbase[y*bystride];
1628 dest_y = &dest[y*rystride];
1629 for (x = 0; x < xcount; x++)
1630 {
1631 abase_x = &abase[x*axstride];
1632 s = (GFC_COMPLEX_8) 0;
1633 for (n = 0; n < count; n++)
1634 s += abase_x[n] * bbase_y[n];
1635 dest_y[x] = s;
1636 }
1637 }
1638 }
1639 else
1640 {
1641 const GFC_COMPLEX_8 *restrict bbase_y;
1642 GFC_COMPLEX_8 s;
1643
1644 for (y = 0; y < ycount; y++)
1645 {
1646 bbase_y = &bbase[y*bystride];
1647 s = (GFC_COMPLEX_8) 0;
1648 for (n = 0; n < count; n++)
1649 s += abase[n*axstride] * bbase_y[n];
1650 dest[y*rystride] = s;
1651 }
1652 }
1653 }
1654 else if (axstride < aystride)
1655 {
1656 for (y = 0; y < ycount; y++)
1657 for (x = 0; x < xcount; x++)
1658 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
1659
1660 for (y = 0; y < ycount; y++)
1661 for (n = 0; n < count; n++)
1662 for (x = 0; x < xcount; x++)
1663 /* dest[x,y] += a[x,n] * b[n,y] */
1664 dest[x*rxstride + y*rystride] +=
1665 abase[x*axstride + n*aystride] *
1666 bbase[n*bxstride + y*bystride];
1667 }
1668 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1669 {
1670 const GFC_COMPLEX_8 *restrict bbase_y;
1671 GFC_COMPLEX_8 s;
1672
1673 for (y = 0; y < ycount; y++)
1674 {
1675 bbase_y = &bbase[y*bystride];
1676 s = (GFC_COMPLEX_8) 0;
1677 for (n = 0; n < count; n++)
1678 s += abase[n*axstride] * bbase_y[n*bxstride];
1679 dest[y*rxstride] = s;
1680 }
1681 }
1682 else
1683 {
1684 const GFC_COMPLEX_8 *restrict abase_x;
1685 const GFC_COMPLEX_8 *restrict bbase_y;
1686 GFC_COMPLEX_8 *restrict dest_y;
1687 GFC_COMPLEX_8 s;
1688
1689 for (y = 0; y < ycount; y++)
1690 {
1691 bbase_y = &bbase[y*bystride];
1692 dest_y = &dest[y*rystride];
1693 for (x = 0; x < xcount; x++)
1694 {
1695 abase_x = &abase[x*axstride];
1696 s = (GFC_COMPLEX_8) 0;
1697 for (n = 0; n < count; n++)
1698 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699 dest_y[x*rxstride] = s;
1700 }
1701 }
1702 }
1703 }
1704 #undef POW3
1705 #undef min
1706 #undef max
1707
1708 #endif /* HAVE_AVX512F */
1709
1710 /* Function to fall back to if there is no special processor-specific version. */
1711 static void
1712 matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
1713 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
1714 int blas_limit, blas_call gemm)
1715 {
1716 const GFC_COMPLEX_8 * restrict abase;
1717 const GFC_COMPLEX_8 * restrict bbase;
1718 GFC_COMPLEX_8 * restrict dest;
1719
1720 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721 index_type x, y, n, count, xcount, ycount;
1722
1723 assert (GFC_DESCRIPTOR_RANK (a) == 2
1724 || GFC_DESCRIPTOR_RANK (b) == 2);
1725
1726 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1727
1728 Either A or B (but not both) can be rank 1:
1729
1730 o One-dimensional argument A is implicitly treated as a row matrix
1731 dimensioned [1,count], so xcount=1.
1732
1733 o One-dimensional argument B is implicitly treated as a column matrix
1734 dimensioned [count, 1], so ycount=1.
1735 */
1736
1737 if (retarray->base_addr == NULL)
1738 {
1739 if (GFC_DESCRIPTOR_RANK (a) == 1)
1740 {
1741 GFC_DIMENSION_SET(retarray->dim[0], 0,
1742 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1743 }
1744 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1745 {
1746 GFC_DIMENSION_SET(retarray->dim[0], 0,
1747 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1748 }
1749 else
1750 {
1751 GFC_DIMENSION_SET(retarray->dim[0], 0,
1752 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1753
1754 GFC_DIMENSION_SET(retarray->dim[1], 0,
1755 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756 GFC_DESCRIPTOR_EXTENT(retarray,0));
1757 }
1758
1759 retarray->base_addr
1760 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
1761 retarray->offset = 0;
1762 }
1763 else if (unlikely (compile_options.bounds_check))
1764 {
1765 index_type ret_extent, arg_extent;
1766
1767 if (GFC_DESCRIPTOR_RANK (a) == 1)
1768 {
1769 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771 if (arg_extent != ret_extent)
1772 runtime_error ("Incorrect extent in return array in"
1773 " MATMUL intrinsic: is %ld, should be %ld",
1774 (long int) ret_extent, (long int) arg_extent);
1775 }
1776 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1777 {
1778 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780 if (arg_extent != ret_extent)
1781 runtime_error ("Incorrect extent in return array in"
1782 " MATMUL intrinsic: is %ld, should be %ld",
1783 (long int) ret_extent, (long int) arg_extent);
1784 }
1785 else
1786 {
1787 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789 if (arg_extent != ret_extent)
1790 runtime_error ("Incorrect extent in return array in"
1791 " MATMUL intrinsic for dimension 1:"
1792 " is %ld, should be %ld",
1793 (long int) ret_extent, (long int) arg_extent);
1794
1795 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797 if (arg_extent != ret_extent)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 2:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1802 }
1803 }
1804
1805
1806 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1807 {
1808 /* One-dimensional result may be addressed in the code below
1809 either as a row or a column matrix. We want both cases to
1810 work. */
1811 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1812 }
1813 else
1814 {
1815 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1817 }
1818
1819
1820 if (GFC_DESCRIPTOR_RANK (a) == 1)
1821 {
1822 /* Treat it as a a row matrix A[1,count]. */
1823 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824 aystride = 1;
1825
1826 xcount = 1;
1827 count = GFC_DESCRIPTOR_EXTENT(a,0);
1828 }
1829 else
1830 {
1831 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1833
1834 count = GFC_DESCRIPTOR_EXTENT(a,1);
1835 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1836 }
1837
1838 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1839 {
1840 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1842 }
1843
1844 if (GFC_DESCRIPTOR_RANK (b) == 1)
1845 {
1846 /* Treat it as a column matrix B[count,1] */
1847 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1848
1849 /* bystride should never be used for 1-dimensional b.
1850 in case it is we want it to cause a segfault, rather than
1851 an incorrect result. */
1852 bystride = 0xDEADBEEF;
1853 ycount = 1;
1854 }
1855 else
1856 {
1857 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1860 }
1861
1862 abase = a->base_addr;
1863 bbase = b->base_addr;
1864 dest = retarray->base_addr;
1865
1866 /* Now that everything is set up, we perform the multiplication
1867 itself. */
1868
1869 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870 #define min(a,b) ((a) <= (b) ? (a) : (b))
1871 #define max(a,b) ((a) >= (b) ? (a) : (b))
1872
1873 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874 && (bxstride == 1 || bystride == 1)
1875 && (((float) xcount) * ((float) ycount) * ((float) count)
1876 > POW3(blas_limit)))
1877 {
1878 const int m = xcount, n = ycount, k = count, ldc = rystride;
1879 const GFC_COMPLEX_8 one = 1, zero = 0;
1880 const int lda = (axstride == 1) ? aystride : axstride,
1881 ldb = (bxstride == 1) ? bystride : bxstride;
1882
1883 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1884 {
1885 assert (gemm != NULL);
1886 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888 &ldc, 1, 1);
1889 return;
1890 }
1891 }
1892
1893 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1894 {
1895 /* This block of code implements a tuned matmul, derived from
1896 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1897
1898 Bo Kagstrom and Per Ling
1899 Department of Computing Science
1900 Umea University
1901 S-901 87 Umea, Sweden
1902
1903 from netlib.org, translated to C, and modified for matmul.m4. */
1904
1905 const GFC_COMPLEX_8 *a, *b;
1906 GFC_COMPLEX_8 *c;
1907 const index_type m = xcount, n = ycount, k = count;
1908
1909 /* System generated locals */
1910 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911 i1, i2, i3, i4, i5, i6;
1912
1913 /* Local variables */
1914 GFC_COMPLEX_8 t1[65536], /* was [256][256] */
1915 f11, f12, f21, f22, f31, f32, f41, f42,
1916 f13, f14, f23, f24, f33, f34, f43, f44;
1917 index_type i, j, l, ii, jj, ll;
1918 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1919
1920 a = abase;
1921 b = bbase;
1922 c = retarray->base_addr;
1923
1924 /* Parameter adjustments */
1925 c_dim1 = rystride;
1926 c_offset = 1 + c_dim1;
1927 c -= c_offset;
1928 a_dim1 = aystride;
1929 a_offset = 1 + a_dim1;
1930 a -= a_offset;
1931 b_dim1 = bystride;
1932 b_offset = 1 + b_dim1;
1933 b -= b_offset;
1934
1935 /* Early exit if possible */
1936 if (m == 0 || n == 0 || k == 0)
1937 return;
1938
1939 /* Empty c first. */
1940 for (j=1; j<=n; j++)
1941 for (i=1; i<=m; i++)
1942 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
1943
1944 /* Start turning the crank. */
1945 i1 = n;
1946 for (jj = 1; jj <= i1; jj += 512)
1947 {
1948 /* Computing MIN */
1949 i2 = 512;
1950 i3 = n - jj + 1;
1951 jsec = min(i2,i3);
1952 ujsec = jsec - jsec % 4;
1953 i2 = k;
1954 for (ll = 1; ll <= i2; ll += 256)
1955 {
1956 /* Computing MIN */
1957 i3 = 256;
1958 i4 = k - ll + 1;
1959 lsec = min(i3,i4);
1960 ulsec = lsec - lsec % 2;
1961
1962 i3 = m;
1963 for (ii = 1; ii <= i3; ii += 256)
1964 {
1965 /* Computing MIN */
1966 i4 = 256;
1967 i5 = m - ii + 1;
1968 isec = min(i4,i5);
1969 uisec = isec - isec % 2;
1970 i4 = ll + ulsec - 1;
1971 for (l = ll; l <= i4; l += 2)
1972 {
1973 i5 = ii + uisec - 1;
1974 for (i = ii; i <= i5; i += 2)
1975 {
1976 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977 a[i + l * a_dim1];
1978 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979 a[i + (l + 1) * a_dim1];
1980 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981 a[i + 1 + l * a_dim1];
1982 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983 a[i + 1 + (l + 1) * a_dim1];
1984 }
1985 if (uisec < isec)
1986 {
1987 t1[l - ll + 1 + (isec << 8) - 257] =
1988 a[ii + isec - 1 + l * a_dim1];
1989 t1[l - ll + 2 + (isec << 8) - 257] =
1990 a[ii + isec - 1 + (l + 1) * a_dim1];
1991 }
1992 }
1993 if (ulsec < lsec)
1994 {
1995 i4 = ii + isec - 1;
1996 for (i = ii; i<= i4; ++i)
1997 {
1998 t1[lsec + ((i - ii + 1) << 8) - 257] =
1999 a[i + (ll + lsec - 1) * a_dim1];
2000 }
2001 }
2002
2003 uisec = isec - isec % 4;
2004 i4 = jj + ujsec - 1;
2005 for (j = jj; j <= i4; j += 4)
2006 {
2007 i5 = ii + uisec - 1;
2008 for (i = ii; i <= i5; i += 4)
2009 {
2010 f11 = c[i + j * c_dim1];
2011 f21 = c[i + 1 + j * c_dim1];
2012 f12 = c[i + (j + 1) * c_dim1];
2013 f22 = c[i + 1 + (j + 1) * c_dim1];
2014 f13 = c[i + (j + 2) * c_dim1];
2015 f23 = c[i + 1 + (j + 2) * c_dim1];
2016 f14 = c[i + (j + 3) * c_dim1];
2017 f24 = c[i + 1 + (j + 3) * c_dim1];
2018 f31 = c[i + 2 + j * c_dim1];
2019 f41 = c[i + 3 + j * c_dim1];
2020 f32 = c[i + 2 + (j + 1) * c_dim1];
2021 f42 = c[i + 3 + (j + 1) * c_dim1];
2022 f33 = c[i + 2 + (j + 2) * c_dim1];
2023 f43 = c[i + 3 + (j + 2) * c_dim1];
2024 f34 = c[i + 2 + (j + 3) * c_dim1];
2025 f44 = c[i + 3 + (j + 3) * c_dim1];
2026 i6 = ll + lsec - 1;
2027 for (l = ll; l <= i6; ++l)
2028 {
2029 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030 * b[l + j * b_dim1];
2031 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032 * b[l + j * b_dim1];
2033 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034 * b[l + (j + 1) * b_dim1];
2035 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036 * b[l + (j + 1) * b_dim1];
2037 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038 * b[l + (j + 2) * b_dim1];
2039 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040 * b[l + (j + 2) * b_dim1];
2041 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042 * b[l + (j + 3) * b_dim1];
2043 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044 * b[l + (j + 3) * b_dim1];
2045 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046 * b[l + j * b_dim1];
2047 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048 * b[l + j * b_dim1];
2049 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050 * b[l + (j + 1) * b_dim1];
2051 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052 * b[l + (j + 1) * b_dim1];
2053 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054 * b[l + (j + 2) * b_dim1];
2055 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056 * b[l + (j + 2) * b_dim1];
2057 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058 * b[l + (j + 3) * b_dim1];
2059 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060 * b[l + (j + 3) * b_dim1];
2061 }
2062 c[i + j * c_dim1] = f11;
2063 c[i + 1 + j * c_dim1] = f21;
2064 c[i + (j + 1) * c_dim1] = f12;
2065 c[i + 1 + (j + 1) * c_dim1] = f22;
2066 c[i + (j + 2) * c_dim1] = f13;
2067 c[i + 1 + (j + 2) * c_dim1] = f23;
2068 c[i + (j + 3) * c_dim1] = f14;
2069 c[i + 1 + (j + 3) * c_dim1] = f24;
2070 c[i + 2 + j * c_dim1] = f31;
2071 c[i + 3 + j * c_dim1] = f41;
2072 c[i + 2 + (j + 1) * c_dim1] = f32;
2073 c[i + 3 + (j + 1) * c_dim1] = f42;
2074 c[i + 2 + (j + 2) * c_dim1] = f33;
2075 c[i + 3 + (j + 2) * c_dim1] = f43;
2076 c[i + 2 + (j + 3) * c_dim1] = f34;
2077 c[i + 3 + (j + 3) * c_dim1] = f44;
2078 }
2079 if (uisec < isec)
2080 {
2081 i5 = ii + isec - 1;
2082 for (i = ii + uisec; i <= i5; ++i)
2083 {
2084 f11 = c[i + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f13 = c[i + (j + 2) * c_dim1];
2087 f14 = c[i + (j + 3) * c_dim1];
2088 i6 = ll + lsec - 1;
2089 for (l = ll; l <= i6; ++l)
2090 {
2091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092 257] * b[l + j * b_dim1];
2093 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094 257] * b[l + (j + 1) * b_dim1];
2095 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096 257] * b[l + (j + 2) * b_dim1];
2097 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098 257] * b[l + (j + 3) * b_dim1];
2099 }
2100 c[i + j * c_dim1] = f11;
2101 c[i + (j + 1) * c_dim1] = f12;
2102 c[i + (j + 2) * c_dim1] = f13;
2103 c[i + (j + 3) * c_dim1] = f14;
2104 }
2105 }
2106 }
2107 if (ujsec < jsec)
2108 {
2109 i4 = jj + jsec - 1;
2110 for (j = jj + ujsec; j <= i4; ++j)
2111 {
2112 i5 = ii + uisec - 1;
2113 for (i = ii; i <= i5; i += 4)
2114 {
2115 f11 = c[i + j * c_dim1];
2116 f21 = c[i + 1 + j * c_dim1];
2117 f31 = c[i + 2 + j * c_dim1];
2118 f41 = c[i + 3 + j * c_dim1];
2119 i6 = ll + lsec - 1;
2120 for (l = ll; l <= i6; ++l)
2121 {
2122 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123 257] * b[l + j * b_dim1];
2124 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125 257] * b[l + j * b_dim1];
2126 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127 257] * b[l + j * b_dim1];
2128 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129 257] * b[l + j * b_dim1];
2130 }
2131 c[i + j * c_dim1] = f11;
2132 c[i + 1 + j * c_dim1] = f21;
2133 c[i + 2 + j * c_dim1] = f31;
2134 c[i + 3 + j * c_dim1] = f41;
2135 }
2136 i5 = ii + isec - 1;
2137 for (i = ii + uisec; i <= i5; ++i)
2138 {
2139 f11 = c[i + j * c_dim1];
2140 i6 = ll + lsec - 1;
2141 for (l = ll; l <= i6; ++l)
2142 {
2143 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144 257] * b[l + j * b_dim1];
2145 }
2146 c[i + j * c_dim1] = f11;
2147 }
2148 }
2149 }
2150 }
2151 }
2152 }
2153 return;
2154 }
2155 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2156 {
2157 if (GFC_DESCRIPTOR_RANK (a) != 1)
2158 {
2159 const GFC_COMPLEX_8 *restrict abase_x;
2160 const GFC_COMPLEX_8 *restrict bbase_y;
2161 GFC_COMPLEX_8 *restrict dest_y;
2162 GFC_COMPLEX_8 s;
2163
2164 for (y = 0; y < ycount; y++)
2165 {
2166 bbase_y = &bbase[y*bystride];
2167 dest_y = &dest[y*rystride];
2168 for (x = 0; x < xcount; x++)
2169 {
2170 abase_x = &abase[x*axstride];
2171 s = (GFC_COMPLEX_8) 0;
2172 for (n = 0; n < count; n++)
2173 s += abase_x[n] * bbase_y[n];
2174 dest_y[x] = s;
2175 }
2176 }
2177 }
2178 else
2179 {
2180 const GFC_COMPLEX_8 *restrict bbase_y;
2181 GFC_COMPLEX_8 s;
2182
2183 for (y = 0; y < ycount; y++)
2184 {
2185 bbase_y = &bbase[y*bystride];
2186 s = (GFC_COMPLEX_8) 0;
2187 for (n = 0; n < count; n++)
2188 s += abase[n*axstride] * bbase_y[n];
2189 dest[y*rystride] = s;
2190 }
2191 }
2192 }
2193 else if (axstride < aystride)
2194 {
2195 for (y = 0; y < ycount; y++)
2196 for (x = 0; x < xcount; x++)
2197 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2198
2199 for (y = 0; y < ycount; y++)
2200 for (n = 0; n < count; n++)
2201 for (x = 0; x < xcount; x++)
2202 /* dest[x,y] += a[x,n] * b[n,y] */
2203 dest[x*rxstride + y*rystride] +=
2204 abase[x*axstride + n*aystride] *
2205 bbase[n*bxstride + y*bystride];
2206 }
2207 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2208 {
2209 const GFC_COMPLEX_8 *restrict bbase_y;
2210 GFC_COMPLEX_8 s;
2211
2212 for (y = 0; y < ycount; y++)
2213 {
2214 bbase_y = &bbase[y*bystride];
2215 s = (GFC_COMPLEX_8) 0;
2216 for (n = 0; n < count; n++)
2217 s += abase[n*axstride] * bbase_y[n*bxstride];
2218 dest[y*rxstride] = s;
2219 }
2220 }
2221 else
2222 {
2223 const GFC_COMPLEX_8 *restrict abase_x;
2224 const GFC_COMPLEX_8 *restrict bbase_y;
2225 GFC_COMPLEX_8 *restrict dest_y;
2226 GFC_COMPLEX_8 s;
2227
2228 for (y = 0; y < ycount; y++)
2229 {
2230 bbase_y = &bbase[y*bystride];
2231 dest_y = &dest[y*rystride];
2232 for (x = 0; x < xcount; x++)
2233 {
2234 abase_x = &abase[x*axstride];
2235 s = (GFC_COMPLEX_8) 0;
2236 for (n = 0; n < count; n++)
2237 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238 dest_y[x*rxstride] = s;
2239 }
2240 }
2241 }
2242 }
2243 #undef POW3
2244 #undef min
2245 #undef max
2246
2247
2248 /* Compiling main function, with selection code for the processor. */
2249
2250 /* Currently, this is i386 only. Adjust for other architectures. */
2251
2252 #include <config/i386/cpuinfo.h>
2253 void matmul_c8 (gfc_array_c8 * const restrict retarray,
2254 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2255 int blas_limit, blas_call gemm)
2256 {
2257 static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
2258 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2259 int blas_limit, blas_call gemm) = NULL;
2260
2261 if (matmul_p == NULL)
2262 {
2263 matmul_p = matmul_c8_vanilla;
2264 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2265 {
2266 /* Run down the available processors in order of preference. */
2267 #ifdef HAVE_AVX512F
2268 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2269 {
2270 matmul_p = matmul_c8_avx512f;
2271 goto tailcall;
2272 }
2273
2274 #endif /* HAVE_AVX512F */
2275
2276 #ifdef HAVE_AVX2
2277 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2278 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2279 {
2280 matmul_p = matmul_c8_avx2;
2281 goto tailcall;
2282 }
2283
2284 #endif
2285
2286 #ifdef HAVE_AVX
2287 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2288 {
2289 matmul_p = matmul_c8_avx;
2290 goto tailcall;
2291 }
2292 #endif /* HAVE_AVX */
2293 }
2294 }
2295
2296 tailcall:
2297 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2298 }
2299
2300 #else /* Just the vanilla function. */
2301
2302 void
2303 matmul_c8 (gfc_array_c8 * const restrict retarray,
2304 gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
2305 int blas_limit, blas_call gemm)
2306 {
2307 const GFC_COMPLEX_8 * restrict abase;
2308 const GFC_COMPLEX_8 * restrict bbase;
2309 GFC_COMPLEX_8 * restrict dest;
2310
2311 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2312 index_type x, y, n, count, xcount, ycount;
2313
2314 assert (GFC_DESCRIPTOR_RANK (a) == 2
2315 || GFC_DESCRIPTOR_RANK (b) == 2);
2316
2317 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2318
2319 Either A or B (but not both) can be rank 1:
2320
2321 o One-dimensional argument A is implicitly treated as a row matrix
2322 dimensioned [1,count], so xcount=1.
2323
2324 o One-dimensional argument B is implicitly treated as a column matrix
2325 dimensioned [count, 1], so ycount=1.
2326 */
2327
2328 if (retarray->base_addr == NULL)
2329 {
2330 if (GFC_DESCRIPTOR_RANK (a) == 1)
2331 {
2332 GFC_DIMENSION_SET(retarray->dim[0], 0,
2333 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2334 }
2335 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2336 {
2337 GFC_DIMENSION_SET(retarray->dim[0], 0,
2338 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2339 }
2340 else
2341 {
2342 GFC_DIMENSION_SET(retarray->dim[0], 0,
2343 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2344
2345 GFC_DIMENSION_SET(retarray->dim[1], 0,
2346 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2347 GFC_DESCRIPTOR_EXTENT(retarray,0));
2348 }
2349
2350 retarray->base_addr
2351 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8));
2352 retarray->offset = 0;
2353 }
2354 else if (unlikely (compile_options.bounds_check))
2355 {
2356 index_type ret_extent, arg_extent;
2357
2358 if (GFC_DESCRIPTOR_RANK (a) == 1)
2359 {
2360 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2361 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2362 if (arg_extent != ret_extent)
2363 runtime_error ("Incorrect extent in return array in"
2364 " MATMUL intrinsic: is %ld, should be %ld",
2365 (long int) ret_extent, (long int) arg_extent);
2366 }
2367 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2368 {
2369 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2370 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2371 if (arg_extent != ret_extent)
2372 runtime_error ("Incorrect extent in return array in"
2373 " MATMUL intrinsic: is %ld, should be %ld",
2374 (long int) ret_extent, (long int) arg_extent);
2375 }
2376 else
2377 {
2378 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2379 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2380 if (arg_extent != ret_extent)
2381 runtime_error ("Incorrect extent in return array in"
2382 " MATMUL intrinsic for dimension 1:"
2383 " is %ld, should be %ld",
2384 (long int) ret_extent, (long int) arg_extent);
2385
2386 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2387 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2388 if (arg_extent != ret_extent)
2389 runtime_error ("Incorrect extent in return array in"
2390 " MATMUL intrinsic for dimension 2:"
2391 " is %ld, should be %ld",
2392 (long int) ret_extent, (long int) arg_extent);
2393 }
2394 }
2395
2396
2397 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2398 {
2399 /* One-dimensional result may be addressed in the code below
2400 either as a row or a column matrix. We want both cases to
2401 work. */
2402 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2403 }
2404 else
2405 {
2406 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2407 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2408 }
2409
2410
2411 if (GFC_DESCRIPTOR_RANK (a) == 1)
2412 {
2413 /* Treat it as a a row matrix A[1,count]. */
2414 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2415 aystride = 1;
2416
2417 xcount = 1;
2418 count = GFC_DESCRIPTOR_EXTENT(a,0);
2419 }
2420 else
2421 {
2422 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2423 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2424
2425 count = GFC_DESCRIPTOR_EXTENT(a,1);
2426 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2427 }
2428
2429 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2430 {
2431 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2432 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2433 }
2434
2435 if (GFC_DESCRIPTOR_RANK (b) == 1)
2436 {
2437 /* Treat it as a column matrix B[count,1] */
2438 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2439
2440 /* bystride should never be used for 1-dimensional b.
2441 in case it is we want it to cause a segfault, rather than
2442 an incorrect result. */
2443 bystride = 0xDEADBEEF;
2444 ycount = 1;
2445 }
2446 else
2447 {
2448 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2449 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2450 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2451 }
2452
2453 abase = a->base_addr;
2454 bbase = b->base_addr;
2455 dest = retarray->base_addr;
2456
2457 /* Now that everything is set up, we perform the multiplication
2458 itself. */
2459
2460 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2461 #define min(a,b) ((a) <= (b) ? (a) : (b))
2462 #define max(a,b) ((a) >= (b) ? (a) : (b))
2463
2464 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2465 && (bxstride == 1 || bystride == 1)
2466 && (((float) xcount) * ((float) ycount) * ((float) count)
2467 > POW3(blas_limit)))
2468 {
2469 const int m = xcount, n = ycount, k = count, ldc = rystride;
2470 const GFC_COMPLEX_8 one = 1, zero = 0;
2471 const int lda = (axstride == 1) ? aystride : axstride,
2472 ldb = (bxstride == 1) ? bystride : bxstride;
2473
2474 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2475 {
2476 assert (gemm != NULL);
2477 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2478 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2479 &ldc, 1, 1);
2480 return;
2481 }
2482 }
2483
2484 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2485 {
2486 /* This block of code implements a tuned matmul, derived from
2487 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2488
2489 Bo Kagstrom and Per Ling
2490 Department of Computing Science
2491 Umea University
2492 S-901 87 Umea, Sweden
2493
2494 from netlib.org, translated to C, and modified for matmul.m4. */
2495
2496 const GFC_COMPLEX_8 *a, *b;
2497 GFC_COMPLEX_8 *c;
2498 const index_type m = xcount, n = ycount, k = count;
2499
2500 /* System generated locals */
2501 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2502 i1, i2, i3, i4, i5, i6;
2503
2504 /* Local variables */
2505 GFC_COMPLEX_8 t1[65536], /* was [256][256] */
2506 f11, f12, f21, f22, f31, f32, f41, f42,
2507 f13, f14, f23, f24, f33, f34, f43, f44;
2508 index_type i, j, l, ii, jj, ll;
2509 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2510
2511 a = abase;
2512 b = bbase;
2513 c = retarray->base_addr;
2514
2515 /* Parameter adjustments */
2516 c_dim1 = rystride;
2517 c_offset = 1 + c_dim1;
2518 c -= c_offset;
2519 a_dim1 = aystride;
2520 a_offset = 1 + a_dim1;
2521 a -= a_offset;
2522 b_dim1 = bystride;
2523 b_offset = 1 + b_dim1;
2524 b -= b_offset;
2525
2526 /* Early exit if possible */
2527 if (m == 0 || n == 0 || k == 0)
2528 return;
2529
2530 /* Empty c first. */
2531 for (j=1; j<=n; j++)
2532 for (i=1; i<=m; i++)
2533 c[i + j * c_dim1] = (GFC_COMPLEX_8)0;
2534
2535 /* Start turning the crank. */
2536 i1 = n;
2537 for (jj = 1; jj <= i1; jj += 512)
2538 {
2539 /* Computing MIN */
2540 i2 = 512;
2541 i3 = n - jj + 1;
2542 jsec = min(i2,i3);
2543 ujsec = jsec - jsec % 4;
2544 i2 = k;
2545 for (ll = 1; ll <= i2; ll += 256)
2546 {
2547 /* Computing MIN */
2548 i3 = 256;
2549 i4 = k - ll + 1;
2550 lsec = min(i3,i4);
2551 ulsec = lsec - lsec % 2;
2552
2553 i3 = m;
2554 for (ii = 1; ii <= i3; ii += 256)
2555 {
2556 /* Computing MIN */
2557 i4 = 256;
2558 i5 = m - ii + 1;
2559 isec = min(i4,i5);
2560 uisec = isec - isec % 2;
2561 i4 = ll + ulsec - 1;
2562 for (l = ll; l <= i4; l += 2)
2563 {
2564 i5 = ii + uisec - 1;
2565 for (i = ii; i <= i5; i += 2)
2566 {
2567 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2568 a[i + l * a_dim1];
2569 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2570 a[i + (l + 1) * a_dim1];
2571 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2572 a[i + 1 + l * a_dim1];
2573 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2574 a[i + 1 + (l + 1) * a_dim1];
2575 }
2576 if (uisec < isec)
2577 {
2578 t1[l - ll + 1 + (isec << 8) - 257] =
2579 a[ii + isec - 1 + l * a_dim1];
2580 t1[l - ll + 2 + (isec << 8) - 257] =
2581 a[ii + isec - 1 + (l + 1) * a_dim1];
2582 }
2583 }
2584 if (ulsec < lsec)
2585 {
2586 i4 = ii + isec - 1;
2587 for (i = ii; i<= i4; ++i)
2588 {
2589 t1[lsec + ((i - ii + 1) << 8) - 257] =
2590 a[i + (ll + lsec - 1) * a_dim1];
2591 }
2592 }
2593
2594 uisec = isec - isec % 4;
2595 i4 = jj + ujsec - 1;
2596 for (j = jj; j <= i4; j += 4)
2597 {
2598 i5 = ii + uisec - 1;
2599 for (i = ii; i <= i5; i += 4)
2600 {
2601 f11 = c[i + j * c_dim1];
2602 f21 = c[i + 1 + j * c_dim1];
2603 f12 = c[i + (j + 1) * c_dim1];
2604 f22 = c[i + 1 + (j + 1) * c_dim1];
2605 f13 = c[i + (j + 2) * c_dim1];
2606 f23 = c[i + 1 + (j + 2) * c_dim1];
2607 f14 = c[i + (j + 3) * c_dim1];
2608 f24 = c[i + 1 + (j + 3) * c_dim1];
2609 f31 = c[i + 2 + j * c_dim1];
2610 f41 = c[i + 3 + j * c_dim1];
2611 f32 = c[i + 2 + (j + 1) * c_dim1];
2612 f42 = c[i + 3 + (j + 1) * c_dim1];
2613 f33 = c[i + 2 + (j + 2) * c_dim1];
2614 f43 = c[i + 3 + (j + 2) * c_dim1];
2615 f34 = c[i + 2 + (j + 3) * c_dim1];
2616 f44 = c[i + 3 + (j + 3) * c_dim1];
2617 i6 = ll + lsec - 1;
2618 for (l = ll; l <= i6; ++l)
2619 {
2620 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2621 * b[l + j * b_dim1];
2622 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2623 * b[l + j * b_dim1];
2624 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2625 * b[l + (j + 1) * b_dim1];
2626 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2627 * b[l + (j + 1) * b_dim1];
2628 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2629 * b[l + (j + 2) * b_dim1];
2630 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2631 * b[l + (j + 2) * b_dim1];
2632 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2633 * b[l + (j + 3) * b_dim1];
2634 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2635 * b[l + (j + 3) * b_dim1];
2636 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2637 * b[l + j * b_dim1];
2638 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2639 * b[l + j * b_dim1];
2640 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2641 * b[l + (j + 1) * b_dim1];
2642 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2643 * b[l + (j + 1) * b_dim1];
2644 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2645 * b[l + (j + 2) * b_dim1];
2646 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2647 * b[l + (j + 2) * b_dim1];
2648 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2649 * b[l + (j + 3) * b_dim1];
2650 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2651 * b[l + (j + 3) * b_dim1];
2652 }
2653 c[i + j * c_dim1] = f11;
2654 c[i + 1 + j * c_dim1] = f21;
2655 c[i + (j + 1) * c_dim1] = f12;
2656 c[i + 1 + (j + 1) * c_dim1] = f22;
2657 c[i + (j + 2) * c_dim1] = f13;
2658 c[i + 1 + (j + 2) * c_dim1] = f23;
2659 c[i + (j + 3) * c_dim1] = f14;
2660 c[i + 1 + (j + 3) * c_dim1] = f24;
2661 c[i + 2 + j * c_dim1] = f31;
2662 c[i + 3 + j * c_dim1] = f41;
2663 c[i + 2 + (j + 1) * c_dim1] = f32;
2664 c[i + 3 + (j + 1) * c_dim1] = f42;
2665 c[i + 2 + (j + 2) * c_dim1] = f33;
2666 c[i + 3 + (j + 2) * c_dim1] = f43;
2667 c[i + 2 + (j + 3) * c_dim1] = f34;
2668 c[i + 3 + (j + 3) * c_dim1] = f44;
2669 }
2670 if (uisec < isec)
2671 {
2672 i5 = ii + isec - 1;
2673 for (i = ii + uisec; i <= i5; ++i)
2674 {
2675 f11 = c[i + j * c_dim1];
2676 f12 = c[i + (j + 1) * c_dim1];
2677 f13 = c[i + (j + 2) * c_dim1];
2678 f14 = c[i + (j + 3) * c_dim1];
2679 i6 = ll + lsec - 1;
2680 for (l = ll; l <= i6; ++l)
2681 {
2682 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2683 257] * b[l + j * b_dim1];
2684 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685 257] * b[l + (j + 1) * b_dim1];
2686 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687 257] * b[l + (j + 2) * b_dim1];
2688 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + (j + 3) * b_dim1];
2690 }
2691 c[i + j * c_dim1] = f11;
2692 c[i + (j + 1) * c_dim1] = f12;
2693 c[i + (j + 2) * c_dim1] = f13;
2694 c[i + (j + 3) * c_dim1] = f14;
2695 }
2696 }
2697 }
2698 if (ujsec < jsec)
2699 {
2700 i4 = jj + jsec - 1;
2701 for (j = jj + ujsec; j <= i4; ++j)
2702 {
2703 i5 = ii + uisec - 1;
2704 for (i = ii; i <= i5; i += 4)
2705 {
2706 f11 = c[i + j * c_dim1];
2707 f21 = c[i + 1 + j * c_dim1];
2708 f31 = c[i + 2 + j * c_dim1];
2709 f41 = c[i + 3 + j * c_dim1];
2710 i6 = ll + lsec - 1;
2711 for (l = ll; l <= i6; ++l)
2712 {
2713 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2714 257] * b[l + j * b_dim1];
2715 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2716 257] * b[l + j * b_dim1];
2717 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2718 257] * b[l + j * b_dim1];
2719 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2720 257] * b[l + j * b_dim1];
2721 }
2722 c[i + j * c_dim1] = f11;
2723 c[i + 1 + j * c_dim1] = f21;
2724 c[i + 2 + j * c_dim1] = f31;
2725 c[i + 3 + j * c_dim1] = f41;
2726 }
2727 i5 = ii + isec - 1;
2728 for (i = ii + uisec; i <= i5; ++i)
2729 {
2730 f11 = c[i + j * c_dim1];
2731 i6 = ll + lsec - 1;
2732 for (l = ll; l <= i6; ++l)
2733 {
2734 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2735 257] * b[l + j * b_dim1];
2736 }
2737 c[i + j * c_dim1] = f11;
2738 }
2739 }
2740 }
2741 }
2742 }
2743 }
2744 return;
2745 }
2746 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2747 {
2748 if (GFC_DESCRIPTOR_RANK (a) != 1)
2749 {
2750 const GFC_COMPLEX_8 *restrict abase_x;
2751 const GFC_COMPLEX_8 *restrict bbase_y;
2752 GFC_COMPLEX_8 *restrict dest_y;
2753 GFC_COMPLEX_8 s;
2754
2755 for (y = 0; y < ycount; y++)
2756 {
2757 bbase_y = &bbase[y*bystride];
2758 dest_y = &dest[y*rystride];
2759 for (x = 0; x < xcount; x++)
2760 {
2761 abase_x = &abase[x*axstride];
2762 s = (GFC_COMPLEX_8) 0;
2763 for (n = 0; n < count; n++)
2764 s += abase_x[n] * bbase_y[n];
2765 dest_y[x] = s;
2766 }
2767 }
2768 }
2769 else
2770 {
2771 const GFC_COMPLEX_8 *restrict bbase_y;
2772 GFC_COMPLEX_8 s;
2773
2774 for (y = 0; y < ycount; y++)
2775 {
2776 bbase_y = &bbase[y*bystride];
2777 s = (GFC_COMPLEX_8) 0;
2778 for (n = 0; n < count; n++)
2779 s += abase[n*axstride] * bbase_y[n];
2780 dest[y*rystride] = s;
2781 }
2782 }
2783 }
2784 else if (axstride < aystride)
2785 {
2786 for (y = 0; y < ycount; y++)
2787 for (x = 0; x < xcount; x++)
2788 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0;
2789
2790 for (y = 0; y < ycount; y++)
2791 for (n = 0; n < count; n++)
2792 for (x = 0; x < xcount; x++)
2793 /* dest[x,y] += a[x,n] * b[n,y] */
2794 dest[x*rxstride + y*rystride] +=
2795 abase[x*axstride + n*aystride] *
2796 bbase[n*bxstride + y*bystride];
2797 }
2798 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2799 {
2800 const GFC_COMPLEX_8 *restrict bbase_y;
2801 GFC_COMPLEX_8 s;
2802
2803 for (y = 0; y < ycount; y++)
2804 {
2805 bbase_y = &bbase[y*bystride];
2806 s = (GFC_COMPLEX_8) 0;
2807 for (n = 0; n < count; n++)
2808 s += abase[n*axstride] * bbase_y[n*bxstride];
2809 dest[y*rxstride] = s;
2810 }
2811 }
2812 else
2813 {
2814 const GFC_COMPLEX_8 *restrict abase_x;
2815 const GFC_COMPLEX_8 *restrict bbase_y;
2816 GFC_COMPLEX_8 *restrict dest_y;
2817 GFC_COMPLEX_8 s;
2818
2819 for (y = 0; y < ycount; y++)
2820 {
2821 bbase_y = &bbase[y*bystride];
2822 dest_y = &dest[y*rystride];
2823 for (x = 0; x < xcount; x++)
2824 {
2825 abase_x = &abase[x*axstride];
2826 s = (GFC_COMPLEX_8) 0;
2827 for (n = 0; n < count; n++)
2828 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2829 dest_y[x*rxstride] = s;
2830 }
2831 }
2832 }
2833 }
2834 #undef POW3
2835 #undef min
2836 #undef max
2837
2838 #endif
2839 #endif
2840