]>
git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_c8.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
31 #if defined (HAVE_GFC_COMPLEX_8)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
37 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_8
*, const GFC_COMPLEX_8
*,
39 const int *, const GFC_COMPLEX_8
*, const int *,
40 const GFC_COMPLEX_8
*, GFC_COMPLEX_8
*, const int *,
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_c8 (gfc_array_c8
* const restrict retarray
,
73 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
74 int blas_limit
, blas_call gemm
);
75 export_proto(matmul_c8
);
77 /* Put exhaustive list of possible architectures here here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83 matmul_c8_avx (gfc_array_c8
* const restrict retarray
,
84 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
85 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
87 matmul_c8_avx (gfc_array_c8
* const restrict retarray
,
88 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
89 int blas_limit
, blas_call gemm
)
91 const GFC_COMPLEX_8
* restrict abase
;
92 const GFC_COMPLEX_8
* restrict bbase
;
93 GFC_COMPLEX_8
* restrict dest
;
95 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
96 index_type x
, y
, n
, count
, xcount
, ycount
;
98 assert (GFC_DESCRIPTOR_RANK (a
) == 2
99 || GFC_DESCRIPTOR_RANK (b
) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray
->base_addr
== NULL
)
114 if (GFC_DESCRIPTOR_RANK (a
) == 1)
116 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
121 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
126 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray
,0));
135 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
136 retarray
->offset
= 0;
138 else if (unlikely (compile_options
.bounds_check
))
140 index_type ret_extent
, arg_extent
;
142 if (GFC_DESCRIPTOR_RANK (a
) == 1)
144 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
145 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
146 if (arg_extent
!= ret_extent
)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent
, (long int) arg_extent
);
151 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
153 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
154 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
155 if (arg_extent
!= ret_extent
)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent
, (long int) arg_extent
);
162 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
163 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
164 if (arg_extent
!= ret_extent
)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent
, (long int) arg_extent
);
170 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
171 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
172 if (arg_extent
!= ret_extent
)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent
, (long int) arg_extent
);
181 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
186 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
190 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
191 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
195 if (GFC_DESCRIPTOR_RANK (a
) == 1)
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
202 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
206 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
207 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
209 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
210 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
213 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
215 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b
) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride
= 0xDEADBEEF;
232 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
233 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
234 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
237 abase
= a
->base_addr
;
238 bbase
= b
->base_addr
;
239 dest
= retarray
->base_addr
;
241 /* Now that everything is set up, we perform the multiplication
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
249 && (bxstride
== 1 || bystride
== 1)
250 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
253 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
254 const GFC_COMPLEX_8 one
= 1, zero
= 0;
255 const int lda
= (axstride
== 1) ? aystride
: axstride
,
256 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
258 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
260 assert (gemm
!= NULL
);
261 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
262 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
268 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_COMPLEX_8
*a
, *b
;
282 const index_type m
= xcount
, n
= ycount
, k
= count
;
284 /* System generated locals */
285 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
286 i1
, i2
, i3
, i4
, i5
, i6
;
288 /* Local variables */
289 GFC_COMPLEX_8 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
290 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
291 index_type i
, j
, l
, ii
, jj
, ll
;
292 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
296 c
= retarray
->base_addr
;
298 /* Parameter adjustments */
300 c_offset
= 1 + c_dim1
;
303 a_offset
= 1 + a_dim1
;
306 b_offset
= 1 + b_dim1
;
309 /* Early exit if possible */
310 if (m
== 0 || n
== 0 || k
== 0)
313 /* Adjust size of t1 to what is needed. */
315 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
319 #pragma GCC diagnostic push
320 #pragma GCC diagnostic ignored "-Wvla"
321 GFC_COMPLEX_8 t1
[t1_dim
]; /* was [256][256] */
322 #pragma GCC diagnostic pop
327 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
329 /* Start turning the crank. */
331 for (jj
= 1; jj
<= i1
; jj
+= 512)
337 ujsec
= jsec
- jsec
% 4;
339 for (ll
= 1; ll
<= i2
; ll
+= 256)
345 ulsec
= lsec
- lsec
% 2;
348 for (ii
= 1; ii
<= i3
; ii
+= 256)
354 uisec
= isec
- isec
% 2;
356 for (l
= ll
; l
<= i4
; l
+= 2)
359 for (i
= ii
; i
<= i5
; i
+= 2)
361 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
363 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
364 a
[i
+ (l
+ 1) * a_dim1
];
365 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
366 a
[i
+ 1 + l
* a_dim1
];
367 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
368 a
[i
+ 1 + (l
+ 1) * a_dim1
];
372 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
373 a
[ii
+ isec
- 1 + l
* a_dim1
];
374 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
375 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
381 for (i
= ii
; i
<= i4
; ++i
)
383 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
384 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
388 uisec
= isec
- isec
% 4;
390 for (j
= jj
; j
<= i4
; j
+= 4)
393 for (i
= ii
; i
<= i5
; i
+= 4)
395 f11
= c
[i
+ j
* c_dim1
];
396 f21
= c
[i
+ 1 + j
* c_dim1
];
397 f12
= c
[i
+ (j
+ 1) * c_dim1
];
398 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
399 f13
= c
[i
+ (j
+ 2) * c_dim1
];
400 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
401 f14
= c
[i
+ (j
+ 3) * c_dim1
];
402 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
403 f31
= c
[i
+ 2 + j
* c_dim1
];
404 f41
= c
[i
+ 3 + j
* c_dim1
];
405 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
406 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
407 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
408 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
409 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
410 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
412 for (l
= ll
; l
<= i6
; ++l
)
414 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
416 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
418 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
419 * b
[l
+ (j
+ 1) * b_dim1
];
420 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
421 * b
[l
+ (j
+ 1) * b_dim1
];
422 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
423 * b
[l
+ (j
+ 2) * b_dim1
];
424 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
425 * b
[l
+ (j
+ 2) * b_dim1
];
426 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
427 * b
[l
+ (j
+ 3) * b_dim1
];
428 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
429 * b
[l
+ (j
+ 3) * b_dim1
];
430 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
432 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
434 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
435 * b
[l
+ (j
+ 1) * b_dim1
];
436 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
437 * b
[l
+ (j
+ 1) * b_dim1
];
438 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
439 * b
[l
+ (j
+ 2) * b_dim1
];
440 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
441 * b
[l
+ (j
+ 2) * b_dim1
];
442 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
443 * b
[l
+ (j
+ 3) * b_dim1
];
444 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
445 * b
[l
+ (j
+ 3) * b_dim1
];
447 c
[i
+ j
* c_dim1
] = f11
;
448 c
[i
+ 1 + j
* c_dim1
] = f21
;
449 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
450 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
451 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
452 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
453 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
454 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
455 c
[i
+ 2 + j
* c_dim1
] = f31
;
456 c
[i
+ 3 + j
* c_dim1
] = f41
;
457 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
458 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
459 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
460 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
461 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
462 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
467 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
469 f11
= c
[i
+ j
* c_dim1
];
470 f12
= c
[i
+ (j
+ 1) * c_dim1
];
471 f13
= c
[i
+ (j
+ 2) * c_dim1
];
472 f14
= c
[i
+ (j
+ 3) * c_dim1
];
474 for (l
= ll
; l
<= i6
; ++l
)
476 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
477 257] * b
[l
+ j
* b_dim1
];
478 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
479 257] * b
[l
+ (j
+ 1) * b_dim1
];
480 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
481 257] * b
[l
+ (j
+ 2) * b_dim1
];
482 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
483 257] * b
[l
+ (j
+ 3) * b_dim1
];
485 c
[i
+ j
* c_dim1
] = f11
;
486 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
487 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
488 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
495 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
498 for (i
= ii
; i
<= i5
; i
+= 4)
500 f11
= c
[i
+ j
* c_dim1
];
501 f21
= c
[i
+ 1 + j
* c_dim1
];
502 f31
= c
[i
+ 2 + j
* c_dim1
];
503 f41
= c
[i
+ 3 + j
* c_dim1
];
505 for (l
= ll
; l
<= i6
; ++l
)
507 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
508 257] * b
[l
+ j
* b_dim1
];
509 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
510 257] * b
[l
+ j
* b_dim1
];
511 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
512 257] * b
[l
+ j
* b_dim1
];
513 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
514 257] * b
[l
+ j
* b_dim1
];
516 c
[i
+ j
* c_dim1
] = f11
;
517 c
[i
+ 1 + j
* c_dim1
] = f21
;
518 c
[i
+ 2 + j
* c_dim1
] = f31
;
519 c
[i
+ 3 + j
* c_dim1
] = f41
;
522 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
524 f11
= c
[i
+ j
* c_dim1
];
526 for (l
= ll
; l
<= i6
; ++l
)
528 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
529 257] * b
[l
+ j
* b_dim1
];
531 c
[i
+ j
* c_dim1
] = f11
;
540 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
542 if (GFC_DESCRIPTOR_RANK (a
) != 1)
544 const GFC_COMPLEX_8
*restrict abase_x
;
545 const GFC_COMPLEX_8
*restrict bbase_y
;
546 GFC_COMPLEX_8
*restrict dest_y
;
549 for (y
= 0; y
< ycount
; y
++)
551 bbase_y
= &bbase
[y
*bystride
];
552 dest_y
= &dest
[y
*rystride
];
553 for (x
= 0; x
< xcount
; x
++)
555 abase_x
= &abase
[x
*axstride
];
556 s
= (GFC_COMPLEX_8
) 0;
557 for (n
= 0; n
< count
; n
++)
558 s
+= abase_x
[n
] * bbase_y
[n
];
565 const GFC_COMPLEX_8
*restrict bbase_y
;
568 for (y
= 0; y
< ycount
; y
++)
570 bbase_y
= &bbase
[y
*bystride
];
571 s
= (GFC_COMPLEX_8
) 0;
572 for (n
= 0; n
< count
; n
++)
573 s
+= abase
[n
*axstride
] * bbase_y
[n
];
574 dest
[y
*rystride
] = s
;
578 else if (axstride
< aystride
)
580 for (y
= 0; y
< ycount
; y
++)
581 for (x
= 0; x
< xcount
; x
++)
582 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
584 for (y
= 0; y
< ycount
; y
++)
585 for (n
= 0; n
< count
; n
++)
586 for (x
= 0; x
< xcount
; x
++)
587 /* dest[x,y] += a[x,n] * b[n,y] */
588 dest
[x
*rxstride
+ y
*rystride
] +=
589 abase
[x
*axstride
+ n
*aystride
] *
590 bbase
[n
*bxstride
+ y
*bystride
];
592 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
594 const GFC_COMPLEX_8
*restrict bbase_y
;
597 for (y
= 0; y
< ycount
; y
++)
599 bbase_y
= &bbase
[y
*bystride
];
600 s
= (GFC_COMPLEX_8
) 0;
601 for (n
= 0; n
< count
; n
++)
602 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
603 dest
[y
*rxstride
] = s
;
608 const GFC_COMPLEX_8
*restrict abase_x
;
609 const GFC_COMPLEX_8
*restrict bbase_y
;
610 GFC_COMPLEX_8
*restrict dest_y
;
613 for (y
= 0; y
< ycount
; y
++)
615 bbase_y
= &bbase
[y
*bystride
];
616 dest_y
= &dest
[y
*rystride
];
617 for (x
= 0; x
< xcount
; x
++)
619 abase_x
= &abase
[x
*axstride
];
620 s
= (GFC_COMPLEX_8
) 0;
621 for (n
= 0; n
< count
; n
++)
622 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
623 dest_y
[x
*rxstride
] = s
;
632 #endif /* HAVE_AVX */
636 matmul_c8_avx2 (gfc_array_c8
* const restrict retarray
,
637 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
638 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2,fma")));
640 matmul_c8_avx2 (gfc_array_c8
* const restrict retarray
,
641 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
642 int blas_limit
, blas_call gemm
)
644 const GFC_COMPLEX_8
* restrict abase
;
645 const GFC_COMPLEX_8
* restrict bbase
;
646 GFC_COMPLEX_8
* restrict dest
;
648 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
649 index_type x
, y
, n
, count
, xcount
, ycount
;
651 assert (GFC_DESCRIPTOR_RANK (a
) == 2
652 || GFC_DESCRIPTOR_RANK (b
) == 2);
654 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
656 Either A or B (but not both) can be rank 1:
658 o One-dimensional argument A is implicitly treated as a row matrix
659 dimensioned [1,count], so xcount=1.
661 o One-dimensional argument B is implicitly treated as a column matrix
662 dimensioned [count, 1], so ycount=1.
665 if (retarray
->base_addr
== NULL
)
667 if (GFC_DESCRIPTOR_RANK (a
) == 1)
669 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
670 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
672 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
674 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
675 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
679 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
680 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
682 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
683 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
684 GFC_DESCRIPTOR_EXTENT(retarray
,0));
688 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
689 retarray
->offset
= 0;
691 else if (unlikely (compile_options
.bounds_check
))
693 index_type ret_extent
, arg_extent
;
695 if (GFC_DESCRIPTOR_RANK (a
) == 1)
697 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
698 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
699 if (arg_extent
!= ret_extent
)
700 runtime_error ("Incorrect extent in return array in"
701 " MATMUL intrinsic: is %ld, should be %ld",
702 (long int) ret_extent
, (long int) arg_extent
);
704 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
706 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
707 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
708 if (arg_extent
!= ret_extent
)
709 runtime_error ("Incorrect extent in return array in"
710 " MATMUL intrinsic: is %ld, should be %ld",
711 (long int) ret_extent
, (long int) arg_extent
);
715 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
716 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
717 if (arg_extent
!= ret_extent
)
718 runtime_error ("Incorrect extent in return array in"
719 " MATMUL intrinsic for dimension 1:"
720 " is %ld, should be %ld",
721 (long int) ret_extent
, (long int) arg_extent
);
723 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
724 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
725 if (arg_extent
!= ret_extent
)
726 runtime_error ("Incorrect extent in return array in"
727 " MATMUL intrinsic for dimension 2:"
728 " is %ld, should be %ld",
729 (long int) ret_extent
, (long int) arg_extent
);
734 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
736 /* One-dimensional result may be addressed in the code below
737 either as a row or a column matrix. We want both cases to
739 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
743 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
744 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
748 if (GFC_DESCRIPTOR_RANK (a
) == 1)
750 /* Treat it as a a row matrix A[1,count]. */
751 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
755 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
759 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
760 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
762 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
763 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
766 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
768 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
769 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
772 if (GFC_DESCRIPTOR_RANK (b
) == 1)
774 /* Treat it as a column matrix B[count,1] */
775 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
777 /* bystride should never be used for 1-dimensional b.
778 in case it is we want it to cause a segfault, rather than
779 an incorrect result. */
780 bystride
= 0xDEADBEEF;
785 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
786 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
787 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
790 abase
= a
->base_addr
;
791 bbase
= b
->base_addr
;
792 dest
= retarray
->base_addr
;
794 /* Now that everything is set up, we perform the multiplication
797 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
798 #define min(a,b) ((a) <= (b) ? (a) : (b))
799 #define max(a,b) ((a) >= (b) ? (a) : (b))
801 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
802 && (bxstride
== 1 || bystride
== 1)
803 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
806 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
807 const GFC_COMPLEX_8 one
= 1, zero
= 0;
808 const int lda
= (axstride
== 1) ? aystride
: axstride
,
809 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
811 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
813 assert (gemm
!= NULL
);
814 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
815 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
821 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
823 /* This block of code implements a tuned matmul, derived from
824 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
826 Bo Kagstrom and Per Ling
827 Department of Computing Science
829 S-901 87 Umea, Sweden
831 from netlib.org, translated to C, and modified for matmul.m4. */
833 const GFC_COMPLEX_8
*a
, *b
;
835 const index_type m
= xcount
, n
= ycount
, k
= count
;
837 /* System generated locals */
838 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
839 i1
, i2
, i3
, i4
, i5
, i6
;
841 /* Local variables */
842 GFC_COMPLEX_8 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
843 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
844 index_type i
, j
, l
, ii
, jj
, ll
;
845 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
849 c
= retarray
->base_addr
;
851 /* Parameter adjustments */
853 c_offset
= 1 + c_dim1
;
856 a_offset
= 1 + a_dim1
;
859 b_offset
= 1 + b_dim1
;
862 /* Early exit if possible */
863 if (m
== 0 || n
== 0 || k
== 0)
866 /* Adjust size of t1 to what is needed. */
868 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
872 #pragma GCC diagnostic push
873 #pragma GCC diagnostic ignored "-Wvla"
874 GFC_COMPLEX_8 t1
[t1_dim
]; /* was [256][256] */
875 #pragma GCC diagnostic pop
880 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
882 /* Start turning the crank. */
884 for (jj
= 1; jj
<= i1
; jj
+= 512)
890 ujsec
= jsec
- jsec
% 4;
892 for (ll
= 1; ll
<= i2
; ll
+= 256)
898 ulsec
= lsec
- lsec
% 2;
901 for (ii
= 1; ii
<= i3
; ii
+= 256)
907 uisec
= isec
- isec
% 2;
909 for (l
= ll
; l
<= i4
; l
+= 2)
912 for (i
= ii
; i
<= i5
; i
+= 2)
914 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
916 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
917 a
[i
+ (l
+ 1) * a_dim1
];
918 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
919 a
[i
+ 1 + l
* a_dim1
];
920 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
921 a
[i
+ 1 + (l
+ 1) * a_dim1
];
925 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
926 a
[ii
+ isec
- 1 + l
* a_dim1
];
927 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
928 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
934 for (i
= ii
; i
<= i4
; ++i
)
936 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
937 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
941 uisec
= isec
- isec
% 4;
943 for (j
= jj
; j
<= i4
; j
+= 4)
946 for (i
= ii
; i
<= i5
; i
+= 4)
948 f11
= c
[i
+ j
* c_dim1
];
949 f21
= c
[i
+ 1 + j
* c_dim1
];
950 f12
= c
[i
+ (j
+ 1) * c_dim1
];
951 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
952 f13
= c
[i
+ (j
+ 2) * c_dim1
];
953 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
954 f14
= c
[i
+ (j
+ 3) * c_dim1
];
955 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
956 f31
= c
[i
+ 2 + j
* c_dim1
];
957 f41
= c
[i
+ 3 + j
* c_dim1
];
958 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
959 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
960 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
961 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
962 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
963 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
965 for (l
= ll
; l
<= i6
; ++l
)
967 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
969 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
971 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
972 * b
[l
+ (j
+ 1) * b_dim1
];
973 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
974 * b
[l
+ (j
+ 1) * b_dim1
];
975 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
976 * b
[l
+ (j
+ 2) * b_dim1
];
977 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
978 * b
[l
+ (j
+ 2) * b_dim1
];
979 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
980 * b
[l
+ (j
+ 3) * b_dim1
];
981 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
982 * b
[l
+ (j
+ 3) * b_dim1
];
983 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
985 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
987 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
988 * b
[l
+ (j
+ 1) * b_dim1
];
989 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
990 * b
[l
+ (j
+ 1) * b_dim1
];
991 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
992 * b
[l
+ (j
+ 2) * b_dim1
];
993 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
994 * b
[l
+ (j
+ 2) * b_dim1
];
995 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
996 * b
[l
+ (j
+ 3) * b_dim1
];
997 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
998 * b
[l
+ (j
+ 3) * b_dim1
];
1000 c
[i
+ j
* c_dim1
] = f11
;
1001 c
[i
+ 1 + j
* c_dim1
] = f21
;
1002 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1003 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1004 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1005 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1006 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1007 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1008 c
[i
+ 2 + j
* c_dim1
] = f31
;
1009 c
[i
+ 3 + j
* c_dim1
] = f41
;
1010 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1011 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1012 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1013 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1014 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1015 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1020 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1022 f11
= c
[i
+ j
* c_dim1
];
1023 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1024 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1025 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1027 for (l
= ll
; l
<= i6
; ++l
)
1029 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1030 257] * b
[l
+ j
* b_dim1
];
1031 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1032 257] * b
[l
+ (j
+ 1) * b_dim1
];
1033 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1034 257] * b
[l
+ (j
+ 2) * b_dim1
];
1035 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1036 257] * b
[l
+ (j
+ 3) * b_dim1
];
1038 c
[i
+ j
* c_dim1
] = f11
;
1039 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1040 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1041 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1048 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1050 i5
= ii
+ uisec
- 1;
1051 for (i
= ii
; i
<= i5
; i
+= 4)
1053 f11
= c
[i
+ j
* c_dim1
];
1054 f21
= c
[i
+ 1 + j
* c_dim1
];
1055 f31
= c
[i
+ 2 + j
* c_dim1
];
1056 f41
= c
[i
+ 3 + j
* c_dim1
];
1058 for (l
= ll
; l
<= i6
; ++l
)
1060 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1061 257] * b
[l
+ j
* b_dim1
];
1062 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1063 257] * b
[l
+ j
* b_dim1
];
1064 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1065 257] * b
[l
+ j
* b_dim1
];
1066 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1067 257] * b
[l
+ j
* b_dim1
];
1069 c
[i
+ j
* c_dim1
] = f11
;
1070 c
[i
+ 1 + j
* c_dim1
] = f21
;
1071 c
[i
+ 2 + j
* c_dim1
] = f31
;
1072 c
[i
+ 3 + j
* c_dim1
] = f41
;
1075 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1077 f11
= c
[i
+ j
* c_dim1
];
1079 for (l
= ll
; l
<= i6
; ++l
)
1081 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1082 257] * b
[l
+ j
* b_dim1
];
1084 c
[i
+ j
* c_dim1
] = f11
;
1093 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1095 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1097 const GFC_COMPLEX_8
*restrict abase_x
;
1098 const GFC_COMPLEX_8
*restrict bbase_y
;
1099 GFC_COMPLEX_8
*restrict dest_y
;
1102 for (y
= 0; y
< ycount
; y
++)
1104 bbase_y
= &bbase
[y
*bystride
];
1105 dest_y
= &dest
[y
*rystride
];
1106 for (x
= 0; x
< xcount
; x
++)
1108 abase_x
= &abase
[x
*axstride
];
1109 s
= (GFC_COMPLEX_8
) 0;
1110 for (n
= 0; n
< count
; n
++)
1111 s
+= abase_x
[n
] * bbase_y
[n
];
1118 const GFC_COMPLEX_8
*restrict bbase_y
;
1121 for (y
= 0; y
< ycount
; y
++)
1123 bbase_y
= &bbase
[y
*bystride
];
1124 s
= (GFC_COMPLEX_8
) 0;
1125 for (n
= 0; n
< count
; n
++)
1126 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1127 dest
[y
*rystride
] = s
;
1131 else if (axstride
< aystride
)
1133 for (y
= 0; y
< ycount
; y
++)
1134 for (x
= 0; x
< xcount
; x
++)
1135 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
1137 for (y
= 0; y
< ycount
; y
++)
1138 for (n
= 0; n
< count
; n
++)
1139 for (x
= 0; x
< xcount
; x
++)
1140 /* dest[x,y] += a[x,n] * b[n,y] */
1141 dest
[x
*rxstride
+ y
*rystride
] +=
1142 abase
[x
*axstride
+ n
*aystride
] *
1143 bbase
[n
*bxstride
+ y
*bystride
];
1145 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1147 const GFC_COMPLEX_8
*restrict bbase_y
;
1150 for (y
= 0; y
< ycount
; y
++)
1152 bbase_y
= &bbase
[y
*bystride
];
1153 s
= (GFC_COMPLEX_8
) 0;
1154 for (n
= 0; n
< count
; n
++)
1155 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1156 dest
[y
*rxstride
] = s
;
1161 const GFC_COMPLEX_8
*restrict abase_x
;
1162 const GFC_COMPLEX_8
*restrict bbase_y
;
1163 GFC_COMPLEX_8
*restrict dest_y
;
1166 for (y
= 0; y
< ycount
; y
++)
1168 bbase_y
= &bbase
[y
*bystride
];
1169 dest_y
= &dest
[y
*rystride
];
1170 for (x
= 0; x
< xcount
; x
++)
1172 abase_x
= &abase
[x
*axstride
];
1173 s
= (GFC_COMPLEX_8
) 0;
1174 for (n
= 0; n
< count
; n
++)
1175 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1176 dest_y
[x
*rxstride
] = s
;
1185 #endif /* HAVE_AVX2 */
1189 matmul_c8_avx512f (gfc_array_c8
* const restrict retarray
,
1190 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1191 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1193 matmul_c8_avx512f (gfc_array_c8
* const restrict retarray
,
1194 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1195 int blas_limit
, blas_call gemm
)
1197 const GFC_COMPLEX_8
* restrict abase
;
1198 const GFC_COMPLEX_8
* restrict bbase
;
1199 GFC_COMPLEX_8
* restrict dest
;
1201 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1202 index_type x
, y
, n
, count
, xcount
, ycount
;
1204 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1205 || GFC_DESCRIPTOR_RANK (b
) == 2);
1207 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1209 Either A or B (but not both) can be rank 1:
1211 o One-dimensional argument A is implicitly treated as a row matrix
1212 dimensioned [1,count], so xcount=1.
1214 o One-dimensional argument B is implicitly treated as a column matrix
1215 dimensioned [count, 1], so ycount=1.
1218 if (retarray
->base_addr
== NULL
)
1220 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1222 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1223 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1225 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1227 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1228 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1232 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1233 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1235 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1236 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1237 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1241 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
1242 retarray
->offset
= 0;
1244 else if (unlikely (compile_options
.bounds_check
))
1246 index_type ret_extent
, arg_extent
;
1248 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1250 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1251 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1252 if (arg_extent
!= ret_extent
)
1253 runtime_error ("Incorrect extent in return array in"
1254 " MATMUL intrinsic: is %ld, should be %ld",
1255 (long int) ret_extent
, (long int) arg_extent
);
1257 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1259 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1260 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1261 if (arg_extent
!= ret_extent
)
1262 runtime_error ("Incorrect extent in return array in"
1263 " MATMUL intrinsic: is %ld, should be %ld",
1264 (long int) ret_extent
, (long int) arg_extent
);
1268 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1269 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1270 if (arg_extent
!= ret_extent
)
1271 runtime_error ("Incorrect extent in return array in"
1272 " MATMUL intrinsic for dimension 1:"
1273 " is %ld, should be %ld",
1274 (long int) ret_extent
, (long int) arg_extent
);
1276 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1277 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1278 if (arg_extent
!= ret_extent
)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 2:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent
, (long int) arg_extent
);
1287 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1289 /* One-dimensional result may be addressed in the code below
1290 either as a row or a column matrix. We want both cases to
1292 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1296 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1297 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1301 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1303 /* Treat it as a a row matrix A[1,count]. */
1304 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1308 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1312 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1313 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1315 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1316 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1319 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1321 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1322 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1325 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1327 /* Treat it as a column matrix B[count,1] */
1328 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1330 /* bystride should never be used for 1-dimensional b.
1331 in case it is we want it to cause a segfault, rather than
1332 an incorrect result. */
1333 bystride
= 0xDEADBEEF;
1338 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1339 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1340 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1343 abase
= a
->base_addr
;
1344 bbase
= b
->base_addr
;
1345 dest
= retarray
->base_addr
;
1347 /* Now that everything is set up, we perform the multiplication
1350 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1351 #define min(a,b) ((a) <= (b) ? (a) : (b))
1352 #define max(a,b) ((a) >= (b) ? (a) : (b))
1354 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1355 && (bxstride
== 1 || bystride
== 1)
1356 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1357 > POW3(blas_limit
)))
1359 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1360 const GFC_COMPLEX_8 one
= 1, zero
= 0;
1361 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1362 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1364 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1366 assert (gemm
!= NULL
);
1367 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1368 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1374 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1376 /* This block of code implements a tuned matmul, derived from
1377 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1379 Bo Kagstrom and Per Ling
1380 Department of Computing Science
1382 S-901 87 Umea, Sweden
1384 from netlib.org, translated to C, and modified for matmul.m4. */
1386 const GFC_COMPLEX_8
*a
, *b
;
1388 const index_type m
= xcount
, n
= ycount
, k
= count
;
1390 /* System generated locals */
1391 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1392 i1
, i2
, i3
, i4
, i5
, i6
;
1394 /* Local variables */
1395 GFC_COMPLEX_8 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1396 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1397 index_type i
, j
, l
, ii
, jj
, ll
;
1398 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1402 c
= retarray
->base_addr
;
1404 /* Parameter adjustments */
1406 c_offset
= 1 + c_dim1
;
1409 a_offset
= 1 + a_dim1
;
1412 b_offset
= 1 + b_dim1
;
1415 /* Early exit if possible */
1416 if (m
== 0 || n
== 0 || k
== 0)
1419 /* Adjust size of t1 to what is needed. */
1421 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1425 #pragma GCC diagnostic push
1426 #pragma GCC diagnostic ignored "-Wvla"
1427 GFC_COMPLEX_8 t1
[t1_dim
]; /* was [256][256] */
1428 #pragma GCC diagnostic pop
1430 /* Empty c first. */
1431 for (j
=1; j
<=n
; j
++)
1432 for (i
=1; i
<=m
; i
++)
1433 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
1435 /* Start turning the crank. */
1437 for (jj
= 1; jj
<= i1
; jj
+= 512)
1443 ujsec
= jsec
- jsec
% 4;
1445 for (ll
= 1; ll
<= i2
; ll
+= 256)
1451 ulsec
= lsec
- lsec
% 2;
1454 for (ii
= 1; ii
<= i3
; ii
+= 256)
1460 uisec
= isec
- isec
% 2;
1461 i4
= ll
+ ulsec
- 1;
1462 for (l
= ll
; l
<= i4
; l
+= 2)
1464 i5
= ii
+ uisec
- 1;
1465 for (i
= ii
; i
<= i5
; i
+= 2)
1467 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1469 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1470 a
[i
+ (l
+ 1) * a_dim1
];
1471 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1472 a
[i
+ 1 + l
* a_dim1
];
1473 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1474 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1478 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1479 a
[ii
+ isec
- 1 + l
* a_dim1
];
1480 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1481 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1487 for (i
= ii
; i
<= i4
; ++i
)
1489 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1490 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1494 uisec
= isec
- isec
% 4;
1495 i4
= jj
+ ujsec
- 1;
1496 for (j
= jj
; j
<= i4
; j
+= 4)
1498 i5
= ii
+ uisec
- 1;
1499 for (i
= ii
; i
<= i5
; i
+= 4)
1501 f11
= c
[i
+ j
* c_dim1
];
1502 f21
= c
[i
+ 1 + j
* c_dim1
];
1503 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1504 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1505 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1506 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1507 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1508 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1509 f31
= c
[i
+ 2 + j
* c_dim1
];
1510 f41
= c
[i
+ 3 + j
* c_dim1
];
1511 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1512 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1513 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1514 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1515 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1516 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1518 for (l
= ll
; l
<= i6
; ++l
)
1520 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1521 * b
[l
+ j
* b_dim1
];
1522 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1523 * b
[l
+ j
* b_dim1
];
1524 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1525 * b
[l
+ (j
+ 1) * b_dim1
];
1526 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1527 * b
[l
+ (j
+ 1) * b_dim1
];
1528 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1529 * b
[l
+ (j
+ 2) * b_dim1
];
1530 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1531 * b
[l
+ (j
+ 2) * b_dim1
];
1532 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1533 * b
[l
+ (j
+ 3) * b_dim1
];
1534 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1535 * b
[l
+ (j
+ 3) * b_dim1
];
1536 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1537 * b
[l
+ j
* b_dim1
];
1538 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1539 * b
[l
+ j
* b_dim1
];
1540 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1541 * b
[l
+ (j
+ 1) * b_dim1
];
1542 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1543 * b
[l
+ (j
+ 1) * b_dim1
];
1544 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1545 * b
[l
+ (j
+ 2) * b_dim1
];
1546 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1547 * b
[l
+ (j
+ 2) * b_dim1
];
1548 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1549 * b
[l
+ (j
+ 3) * b_dim1
];
1550 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1551 * b
[l
+ (j
+ 3) * b_dim1
];
1553 c
[i
+ j
* c_dim1
] = f11
;
1554 c
[i
+ 1 + j
* c_dim1
] = f21
;
1555 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1556 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1557 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1558 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1559 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1560 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1561 c
[i
+ 2 + j
* c_dim1
] = f31
;
1562 c
[i
+ 3 + j
* c_dim1
] = f41
;
1563 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1564 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1565 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1566 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1567 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1568 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1573 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1575 f11
= c
[i
+ j
* c_dim1
];
1576 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1577 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1578 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1580 for (l
= ll
; l
<= i6
; ++l
)
1582 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1583 257] * b
[l
+ j
* b_dim1
];
1584 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1585 257] * b
[l
+ (j
+ 1) * b_dim1
];
1586 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1587 257] * b
[l
+ (j
+ 2) * b_dim1
];
1588 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1589 257] * b
[l
+ (j
+ 3) * b_dim1
];
1591 c
[i
+ j
* c_dim1
] = f11
;
1592 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1593 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1594 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1601 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1603 i5
= ii
+ uisec
- 1;
1604 for (i
= ii
; i
<= i5
; i
+= 4)
1606 f11
= c
[i
+ j
* c_dim1
];
1607 f21
= c
[i
+ 1 + j
* c_dim1
];
1608 f31
= c
[i
+ 2 + j
* c_dim1
];
1609 f41
= c
[i
+ 3 + j
* c_dim1
];
1611 for (l
= ll
; l
<= i6
; ++l
)
1613 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1614 257] * b
[l
+ j
* b_dim1
];
1615 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1616 257] * b
[l
+ j
* b_dim1
];
1617 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1618 257] * b
[l
+ j
* b_dim1
];
1619 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1620 257] * b
[l
+ j
* b_dim1
];
1622 c
[i
+ j
* c_dim1
] = f11
;
1623 c
[i
+ 1 + j
* c_dim1
] = f21
;
1624 c
[i
+ 2 + j
* c_dim1
] = f31
;
1625 c
[i
+ 3 + j
* c_dim1
] = f41
;
1628 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1630 f11
= c
[i
+ j
* c_dim1
];
1632 for (l
= ll
; l
<= i6
; ++l
)
1634 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1635 257] * b
[l
+ j
* b_dim1
];
1637 c
[i
+ j
* c_dim1
] = f11
;
1646 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1648 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1650 const GFC_COMPLEX_8
*restrict abase_x
;
1651 const GFC_COMPLEX_8
*restrict bbase_y
;
1652 GFC_COMPLEX_8
*restrict dest_y
;
1655 for (y
= 0; y
< ycount
; y
++)
1657 bbase_y
= &bbase
[y
*bystride
];
1658 dest_y
= &dest
[y
*rystride
];
1659 for (x
= 0; x
< xcount
; x
++)
1661 abase_x
= &abase
[x
*axstride
];
1662 s
= (GFC_COMPLEX_8
) 0;
1663 for (n
= 0; n
< count
; n
++)
1664 s
+= abase_x
[n
] * bbase_y
[n
];
1671 const GFC_COMPLEX_8
*restrict bbase_y
;
1674 for (y
= 0; y
< ycount
; y
++)
1676 bbase_y
= &bbase
[y
*bystride
];
1677 s
= (GFC_COMPLEX_8
) 0;
1678 for (n
= 0; n
< count
; n
++)
1679 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1680 dest
[y
*rystride
] = s
;
1684 else if (axstride
< aystride
)
1686 for (y
= 0; y
< ycount
; y
++)
1687 for (x
= 0; x
< xcount
; x
++)
1688 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
1690 for (y
= 0; y
< ycount
; y
++)
1691 for (n
= 0; n
< count
; n
++)
1692 for (x
= 0; x
< xcount
; x
++)
1693 /* dest[x,y] += a[x,n] * b[n,y] */
1694 dest
[x
*rxstride
+ y
*rystride
] +=
1695 abase
[x
*axstride
+ n
*aystride
] *
1696 bbase
[n
*bxstride
+ y
*bystride
];
1698 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1700 const GFC_COMPLEX_8
*restrict bbase_y
;
1703 for (y
= 0; y
< ycount
; y
++)
1705 bbase_y
= &bbase
[y
*bystride
];
1706 s
= (GFC_COMPLEX_8
) 0;
1707 for (n
= 0; n
< count
; n
++)
1708 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1709 dest
[y
*rxstride
] = s
;
1714 const GFC_COMPLEX_8
*restrict abase_x
;
1715 const GFC_COMPLEX_8
*restrict bbase_y
;
1716 GFC_COMPLEX_8
*restrict dest_y
;
1719 for (y
= 0; y
< ycount
; y
++)
1721 bbase_y
= &bbase
[y
*bystride
];
1722 dest_y
= &dest
[y
*rystride
];
1723 for (x
= 0; x
< xcount
; x
++)
1725 abase_x
= &abase
[x
*axstride
];
1726 s
= (GFC_COMPLEX_8
) 0;
1727 for (n
= 0; n
< count
; n
++)
1728 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1729 dest_y
[x
*rxstride
] = s
;
1738 #endif /* HAVE_AVX512F */
1740 /* Function to fall back to if there is no special processor-specific version. */
1742 matmul_c8_vanilla (gfc_array_c8
* const restrict retarray
,
1743 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1744 int blas_limit
, blas_call gemm
)
1746 const GFC_COMPLEX_8
* restrict abase
;
1747 const GFC_COMPLEX_8
* restrict bbase
;
1748 GFC_COMPLEX_8
* restrict dest
;
1750 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1751 index_type x
, y
, n
, count
, xcount
, ycount
;
1753 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1754 || GFC_DESCRIPTOR_RANK (b
) == 2);
1756 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1758 Either A or B (but not both) can be rank 1:
1760 o One-dimensional argument A is implicitly treated as a row matrix
1761 dimensioned [1,count], so xcount=1.
1763 o One-dimensional argument B is implicitly treated as a column matrix
1764 dimensioned [count, 1], so ycount=1.
1767 if (retarray
->base_addr
== NULL
)
1769 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1771 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1772 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1774 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1776 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1777 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1781 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1782 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1784 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1785 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1786 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1790 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
1791 retarray
->offset
= 0;
1793 else if (unlikely (compile_options
.bounds_check
))
1795 index_type ret_extent
, arg_extent
;
1797 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1799 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1800 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1801 if (arg_extent
!= ret_extent
)
1802 runtime_error ("Incorrect extent in return array in"
1803 " MATMUL intrinsic: is %ld, should be %ld",
1804 (long int) ret_extent
, (long int) arg_extent
);
1806 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1808 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1809 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1810 if (arg_extent
!= ret_extent
)
1811 runtime_error ("Incorrect extent in return array in"
1812 " MATMUL intrinsic: is %ld, should be %ld",
1813 (long int) ret_extent
, (long int) arg_extent
);
1817 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1818 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1819 if (arg_extent
!= ret_extent
)
1820 runtime_error ("Incorrect extent in return array in"
1821 " MATMUL intrinsic for dimension 1:"
1822 " is %ld, should be %ld",
1823 (long int) ret_extent
, (long int) arg_extent
);
1825 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1826 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1827 if (arg_extent
!= ret_extent
)
1828 runtime_error ("Incorrect extent in return array in"
1829 " MATMUL intrinsic for dimension 2:"
1830 " is %ld, should be %ld",
1831 (long int) ret_extent
, (long int) arg_extent
);
1836 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1838 /* One-dimensional result may be addressed in the code below
1839 either as a row or a column matrix. We want both cases to
1841 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1845 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1846 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1850 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1852 /* Treat it as a a row matrix A[1,count]. */
1853 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1857 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1861 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1862 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1864 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1865 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1868 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1870 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1871 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1874 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1876 /* Treat it as a column matrix B[count,1] */
1877 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1879 /* bystride should never be used for 1-dimensional b.
1880 in case it is we want it to cause a segfault, rather than
1881 an incorrect result. */
1882 bystride
= 0xDEADBEEF;
1887 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1888 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1889 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1892 abase
= a
->base_addr
;
1893 bbase
= b
->base_addr
;
1894 dest
= retarray
->base_addr
;
1896 /* Now that everything is set up, we perform the multiplication
1899 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1900 #define min(a,b) ((a) <= (b) ? (a) : (b))
1901 #define max(a,b) ((a) >= (b) ? (a) : (b))
1903 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1904 && (bxstride
== 1 || bystride
== 1)
1905 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1906 > POW3(blas_limit
)))
1908 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1909 const GFC_COMPLEX_8 one
= 1, zero
= 0;
1910 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1911 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1913 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1915 assert (gemm
!= NULL
);
1916 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1917 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1923 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1925 /* This block of code implements a tuned matmul, derived from
1926 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1928 Bo Kagstrom and Per Ling
1929 Department of Computing Science
1931 S-901 87 Umea, Sweden
1933 from netlib.org, translated to C, and modified for matmul.m4. */
1935 const GFC_COMPLEX_8
*a
, *b
;
1937 const index_type m
= xcount
, n
= ycount
, k
= count
;
1939 /* System generated locals */
1940 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1941 i1
, i2
, i3
, i4
, i5
, i6
;
1943 /* Local variables */
1944 GFC_COMPLEX_8 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1945 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1946 index_type i
, j
, l
, ii
, jj
, ll
;
1947 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1951 c
= retarray
->base_addr
;
1953 /* Parameter adjustments */
1955 c_offset
= 1 + c_dim1
;
1958 a_offset
= 1 + a_dim1
;
1961 b_offset
= 1 + b_dim1
;
1964 /* Early exit if possible */
1965 if (m
== 0 || n
== 0 || k
== 0)
1968 /* Adjust size of t1 to what is needed. */
1970 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
1974 #pragma GCC diagnostic push
1975 #pragma GCC diagnostic ignored "-Wvla"
1976 GFC_COMPLEX_8 t1
[t1_dim
]; /* was [256][256] */
1977 #pragma GCC diagnostic pop
1979 /* Empty c first. */
1980 for (j
=1; j
<=n
; j
++)
1981 for (i
=1; i
<=m
; i
++)
1982 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
1984 /* Start turning the crank. */
1986 for (jj
= 1; jj
<= i1
; jj
+= 512)
1992 ujsec
= jsec
- jsec
% 4;
1994 for (ll
= 1; ll
<= i2
; ll
+= 256)
2000 ulsec
= lsec
- lsec
% 2;
2003 for (ii
= 1; ii
<= i3
; ii
+= 256)
2009 uisec
= isec
- isec
% 2;
2010 i4
= ll
+ ulsec
- 1;
2011 for (l
= ll
; l
<= i4
; l
+= 2)
2013 i5
= ii
+ uisec
- 1;
2014 for (i
= ii
; i
<= i5
; i
+= 2)
2016 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2018 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2019 a
[i
+ (l
+ 1) * a_dim1
];
2020 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2021 a
[i
+ 1 + l
* a_dim1
];
2022 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2023 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2027 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2028 a
[ii
+ isec
- 1 + l
* a_dim1
];
2029 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2030 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2036 for (i
= ii
; i
<= i4
; ++i
)
2038 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2039 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2043 uisec
= isec
- isec
% 4;
2044 i4
= jj
+ ujsec
- 1;
2045 for (j
= jj
; j
<= i4
; j
+= 4)
2047 i5
= ii
+ uisec
- 1;
2048 for (i
= ii
; i
<= i5
; i
+= 4)
2050 f11
= c
[i
+ j
* c_dim1
];
2051 f21
= c
[i
+ 1 + j
* c_dim1
];
2052 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2053 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2054 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2055 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2056 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2057 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2058 f31
= c
[i
+ 2 + j
* c_dim1
];
2059 f41
= c
[i
+ 3 + j
* c_dim1
];
2060 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2061 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2062 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2063 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2064 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2065 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2067 for (l
= ll
; l
<= i6
; ++l
)
2069 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2070 * b
[l
+ j
* b_dim1
];
2071 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2072 * b
[l
+ j
* b_dim1
];
2073 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2074 * b
[l
+ (j
+ 1) * b_dim1
];
2075 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2076 * b
[l
+ (j
+ 1) * b_dim1
];
2077 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2078 * b
[l
+ (j
+ 2) * b_dim1
];
2079 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2080 * b
[l
+ (j
+ 2) * b_dim1
];
2081 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2082 * b
[l
+ (j
+ 3) * b_dim1
];
2083 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2084 * b
[l
+ (j
+ 3) * b_dim1
];
2085 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2086 * b
[l
+ j
* b_dim1
];
2087 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2088 * b
[l
+ j
* b_dim1
];
2089 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2090 * b
[l
+ (j
+ 1) * b_dim1
];
2091 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2092 * b
[l
+ (j
+ 1) * b_dim1
];
2093 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2094 * b
[l
+ (j
+ 2) * b_dim1
];
2095 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2096 * b
[l
+ (j
+ 2) * b_dim1
];
2097 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2098 * b
[l
+ (j
+ 3) * b_dim1
];
2099 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2100 * b
[l
+ (j
+ 3) * b_dim1
];
2102 c
[i
+ j
* c_dim1
] = f11
;
2103 c
[i
+ 1 + j
* c_dim1
] = f21
;
2104 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2105 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2106 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2107 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2108 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2109 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2110 c
[i
+ 2 + j
* c_dim1
] = f31
;
2111 c
[i
+ 3 + j
* c_dim1
] = f41
;
2112 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2113 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2114 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2115 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2116 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2117 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2122 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2124 f11
= c
[i
+ j
* c_dim1
];
2125 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2126 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2127 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2129 for (l
= ll
; l
<= i6
; ++l
)
2131 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2132 257] * b
[l
+ j
* b_dim1
];
2133 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2134 257] * b
[l
+ (j
+ 1) * b_dim1
];
2135 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2136 257] * b
[l
+ (j
+ 2) * b_dim1
];
2137 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2138 257] * b
[l
+ (j
+ 3) * b_dim1
];
2140 c
[i
+ j
* c_dim1
] = f11
;
2141 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2142 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2143 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2150 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2152 i5
= ii
+ uisec
- 1;
2153 for (i
= ii
; i
<= i5
; i
+= 4)
2155 f11
= c
[i
+ j
* c_dim1
];
2156 f21
= c
[i
+ 1 + j
* c_dim1
];
2157 f31
= c
[i
+ 2 + j
* c_dim1
];
2158 f41
= c
[i
+ 3 + j
* c_dim1
];
2160 for (l
= ll
; l
<= i6
; ++l
)
2162 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2163 257] * b
[l
+ j
* b_dim1
];
2164 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2165 257] * b
[l
+ j
* b_dim1
];
2166 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2167 257] * b
[l
+ j
* b_dim1
];
2168 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2169 257] * b
[l
+ j
* b_dim1
];
2171 c
[i
+ j
* c_dim1
] = f11
;
2172 c
[i
+ 1 + j
* c_dim1
] = f21
;
2173 c
[i
+ 2 + j
* c_dim1
] = f31
;
2174 c
[i
+ 3 + j
* c_dim1
] = f41
;
2177 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2179 f11
= c
[i
+ j
* c_dim1
];
2181 for (l
= ll
; l
<= i6
; ++l
)
2183 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2184 257] * b
[l
+ j
* b_dim1
];
2186 c
[i
+ j
* c_dim1
] = f11
;
2195 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2197 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2199 const GFC_COMPLEX_8
*restrict abase_x
;
2200 const GFC_COMPLEX_8
*restrict bbase_y
;
2201 GFC_COMPLEX_8
*restrict dest_y
;
2204 for (y
= 0; y
< ycount
; y
++)
2206 bbase_y
= &bbase
[y
*bystride
];
2207 dest_y
= &dest
[y
*rystride
];
2208 for (x
= 0; x
< xcount
; x
++)
2210 abase_x
= &abase
[x
*axstride
];
2211 s
= (GFC_COMPLEX_8
) 0;
2212 for (n
= 0; n
< count
; n
++)
2213 s
+= abase_x
[n
] * bbase_y
[n
];
2220 const GFC_COMPLEX_8
*restrict bbase_y
;
2223 for (y
= 0; y
< ycount
; y
++)
2225 bbase_y
= &bbase
[y
*bystride
];
2226 s
= (GFC_COMPLEX_8
) 0;
2227 for (n
= 0; n
< count
; n
++)
2228 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2229 dest
[y
*rystride
] = s
;
2233 else if (axstride
< aystride
)
2235 for (y
= 0; y
< ycount
; y
++)
2236 for (x
= 0; x
< xcount
; x
++)
2237 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
2239 for (y
= 0; y
< ycount
; y
++)
2240 for (n
= 0; n
< count
; n
++)
2241 for (x
= 0; x
< xcount
; x
++)
2242 /* dest[x,y] += a[x,n] * b[n,y] */
2243 dest
[x
*rxstride
+ y
*rystride
] +=
2244 abase
[x
*axstride
+ n
*aystride
] *
2245 bbase
[n
*bxstride
+ y
*bystride
];
2247 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2249 const GFC_COMPLEX_8
*restrict bbase_y
;
2252 for (y
= 0; y
< ycount
; y
++)
2254 bbase_y
= &bbase
[y
*bystride
];
2255 s
= (GFC_COMPLEX_8
) 0;
2256 for (n
= 0; n
< count
; n
++)
2257 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2258 dest
[y
*rxstride
] = s
;
2263 const GFC_COMPLEX_8
*restrict abase_x
;
2264 const GFC_COMPLEX_8
*restrict bbase_y
;
2265 GFC_COMPLEX_8
*restrict dest_y
;
2268 for (y
= 0; y
< ycount
; y
++)
2270 bbase_y
= &bbase
[y
*bystride
];
2271 dest_y
= &dest
[y
*rystride
];
2272 for (x
= 0; x
< xcount
; x
++)
2274 abase_x
= &abase
[x
*axstride
];
2275 s
= (GFC_COMPLEX_8
) 0;
2276 for (n
= 0; n
< count
; n
++)
2277 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2278 dest_y
[x
*rxstride
] = s
;
2288 /* Compiling main function, with selection code for the processor. */
2290 /* Currently, this is i386 only. Adjust for other architectures. */
2292 #include <config/i386/cpuinfo.h>
2293 void matmul_c8 (gfc_array_c8
* const restrict retarray
,
2294 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2295 int blas_limit
, blas_call gemm
)
2297 static void (*matmul_p
) (gfc_array_c8
* const restrict retarray
,
2298 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2299 int blas_limit
, blas_call gemm
);
2301 void (*matmul_fn
) (gfc_array_c8
* const restrict retarray
,
2302 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2303 int blas_limit
, blas_call gemm
);
2305 matmul_fn
= __atomic_load_n (&matmul_p
, __ATOMIC_RELAXED
);
2306 if (matmul_fn
== NULL
)
2308 matmul_fn
= matmul_c8_vanilla
;
2309 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2311 /* Run down the available processors in order of preference. */
2313 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2315 matmul_fn
= matmul_c8_avx512f
;
2319 #endif /* HAVE_AVX512F */
2322 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2323 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2325 matmul_fn
= matmul_c8_avx2
;
2332 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2334 matmul_fn
= matmul_c8_avx
;
2337 #endif /* HAVE_AVX */
2340 __atomic_store_n (&matmul_p
, matmul_fn
, __ATOMIC_RELAXED
);
2343 (*matmul_fn
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2346 #else /* Just the vanilla function. */
2349 matmul_c8 (gfc_array_c8
* const restrict retarray
,
2350 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2351 int blas_limit
, blas_call gemm
)
2353 const GFC_COMPLEX_8
* restrict abase
;
2354 const GFC_COMPLEX_8
* restrict bbase
;
2355 GFC_COMPLEX_8
* restrict dest
;
2357 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2358 index_type x
, y
, n
, count
, xcount
, ycount
;
2360 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2361 || GFC_DESCRIPTOR_RANK (b
) == 2);
2363 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2365 Either A or B (but not both) can be rank 1:
2367 o One-dimensional argument A is implicitly treated as a row matrix
2368 dimensioned [1,count], so xcount=1.
2370 o One-dimensional argument B is implicitly treated as a column matrix
2371 dimensioned [count, 1], so ycount=1.
2374 if (retarray
->base_addr
== NULL
)
2376 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2378 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2379 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2381 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2383 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2384 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2388 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2389 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2391 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2392 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2393 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2397 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
2398 retarray
->offset
= 0;
2400 else if (unlikely (compile_options
.bounds_check
))
2402 index_type ret_extent
, arg_extent
;
2404 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2406 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2407 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2408 if (arg_extent
!= ret_extent
)
2409 runtime_error ("Incorrect extent in return array in"
2410 " MATMUL intrinsic: is %ld, should be %ld",
2411 (long int) ret_extent
, (long int) arg_extent
);
2413 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2415 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2416 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2417 if (arg_extent
!= ret_extent
)
2418 runtime_error ("Incorrect extent in return array in"
2419 " MATMUL intrinsic: is %ld, should be %ld",
2420 (long int) ret_extent
, (long int) arg_extent
);
2424 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2425 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2426 if (arg_extent
!= ret_extent
)
2427 runtime_error ("Incorrect extent in return array in"
2428 " MATMUL intrinsic for dimension 1:"
2429 " is %ld, should be %ld",
2430 (long int) ret_extent
, (long int) arg_extent
);
2432 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2433 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2434 if (arg_extent
!= ret_extent
)
2435 runtime_error ("Incorrect extent in return array in"
2436 " MATMUL intrinsic for dimension 2:"
2437 " is %ld, should be %ld",
2438 (long int) ret_extent
, (long int) arg_extent
);
2443 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2445 /* One-dimensional result may be addressed in the code below
2446 either as a row or a column matrix. We want both cases to
2448 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2452 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2453 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2457 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2459 /* Treat it as a a row matrix A[1,count]. */
2460 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2464 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2468 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2469 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2471 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2472 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2475 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2477 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2478 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2481 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2483 /* Treat it as a column matrix B[count,1] */
2484 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2486 /* bystride should never be used for 1-dimensional b.
2487 in case it is we want it to cause a segfault, rather than
2488 an incorrect result. */
2489 bystride
= 0xDEADBEEF;
2494 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2495 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2496 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2499 abase
= a
->base_addr
;
2500 bbase
= b
->base_addr
;
2501 dest
= retarray
->base_addr
;
2503 /* Now that everything is set up, we perform the multiplication
2506 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2507 #define min(a,b) ((a) <= (b) ? (a) : (b))
2508 #define max(a,b) ((a) >= (b) ? (a) : (b))
2510 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2511 && (bxstride
== 1 || bystride
== 1)
2512 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2513 > POW3(blas_limit
)))
2515 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2516 const GFC_COMPLEX_8 one
= 1, zero
= 0;
2517 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2518 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2520 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2522 assert (gemm
!= NULL
);
2523 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2524 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2530 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2532 /* This block of code implements a tuned matmul, derived from
2533 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2535 Bo Kagstrom and Per Ling
2536 Department of Computing Science
2538 S-901 87 Umea, Sweden
2540 from netlib.org, translated to C, and modified for matmul.m4. */
2542 const GFC_COMPLEX_8
*a
, *b
;
2544 const index_type m
= xcount
, n
= ycount
, k
= count
;
2546 /* System generated locals */
2547 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2548 i1
, i2
, i3
, i4
, i5
, i6
;
2550 /* Local variables */
2551 GFC_COMPLEX_8 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2552 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2553 index_type i
, j
, l
, ii
, jj
, ll
;
2554 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2558 c
= retarray
->base_addr
;
2560 /* Parameter adjustments */
2562 c_offset
= 1 + c_dim1
;
2565 a_offset
= 1 + a_dim1
;
2568 b_offset
= 1 + b_dim1
;
2571 /* Early exit if possible */
2572 if (m
== 0 || n
== 0 || k
== 0)
2575 /* Adjust size of t1 to what is needed. */
2577 t1_dim
= (a_dim1
-1) * 256 + b_dim1
;
2581 #pragma GCC diagnostic push
2582 #pragma GCC diagnostic ignored "-Wvla"
2583 GFC_COMPLEX_8 t1
[t1_dim
]; /* was [256][256] */
2584 #pragma GCC diagnostic pop
2586 /* Empty c first. */
2587 for (j
=1; j
<=n
; j
++)
2588 for (i
=1; i
<=m
; i
++)
2589 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
2591 /* Start turning the crank. */
2593 for (jj
= 1; jj
<= i1
; jj
+= 512)
2599 ujsec
= jsec
- jsec
% 4;
2601 for (ll
= 1; ll
<= i2
; ll
+= 256)
2607 ulsec
= lsec
- lsec
% 2;
2610 for (ii
= 1; ii
<= i3
; ii
+= 256)
2616 uisec
= isec
- isec
% 2;
2617 i4
= ll
+ ulsec
- 1;
2618 for (l
= ll
; l
<= i4
; l
+= 2)
2620 i5
= ii
+ uisec
- 1;
2621 for (i
= ii
; i
<= i5
; i
+= 2)
2623 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2625 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2626 a
[i
+ (l
+ 1) * a_dim1
];
2627 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2628 a
[i
+ 1 + l
* a_dim1
];
2629 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2630 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2634 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2635 a
[ii
+ isec
- 1 + l
* a_dim1
];
2636 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2637 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2643 for (i
= ii
; i
<= i4
; ++i
)
2645 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2646 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2650 uisec
= isec
- isec
% 4;
2651 i4
= jj
+ ujsec
- 1;
2652 for (j
= jj
; j
<= i4
; j
+= 4)
2654 i5
= ii
+ uisec
- 1;
2655 for (i
= ii
; i
<= i5
; i
+= 4)
2657 f11
= c
[i
+ j
* c_dim1
];
2658 f21
= c
[i
+ 1 + j
* c_dim1
];
2659 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2660 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2661 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2662 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2663 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2664 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2665 f31
= c
[i
+ 2 + j
* c_dim1
];
2666 f41
= c
[i
+ 3 + j
* c_dim1
];
2667 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2668 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2669 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2670 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2671 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2672 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2674 for (l
= ll
; l
<= i6
; ++l
)
2676 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2677 * b
[l
+ j
* b_dim1
];
2678 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2679 * b
[l
+ j
* b_dim1
];
2680 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2681 * b
[l
+ (j
+ 1) * b_dim1
];
2682 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2683 * b
[l
+ (j
+ 1) * b_dim1
];
2684 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2685 * b
[l
+ (j
+ 2) * b_dim1
];
2686 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2687 * b
[l
+ (j
+ 2) * b_dim1
];
2688 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2689 * b
[l
+ (j
+ 3) * b_dim1
];
2690 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2691 * b
[l
+ (j
+ 3) * b_dim1
];
2692 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2693 * b
[l
+ j
* b_dim1
];
2694 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2695 * b
[l
+ j
* b_dim1
];
2696 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2697 * b
[l
+ (j
+ 1) * b_dim1
];
2698 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2699 * b
[l
+ (j
+ 1) * b_dim1
];
2700 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2701 * b
[l
+ (j
+ 2) * b_dim1
];
2702 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2703 * b
[l
+ (j
+ 2) * b_dim1
];
2704 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2705 * b
[l
+ (j
+ 3) * b_dim1
];
2706 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2707 * b
[l
+ (j
+ 3) * b_dim1
];
2709 c
[i
+ j
* c_dim1
] = f11
;
2710 c
[i
+ 1 + j
* c_dim1
] = f21
;
2711 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2712 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2713 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2714 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2715 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2716 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2717 c
[i
+ 2 + j
* c_dim1
] = f31
;
2718 c
[i
+ 3 + j
* c_dim1
] = f41
;
2719 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2720 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2721 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2722 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2723 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2724 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2729 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2731 f11
= c
[i
+ j
* c_dim1
];
2732 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2733 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2734 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2736 for (l
= ll
; l
<= i6
; ++l
)
2738 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2739 257] * b
[l
+ j
* b_dim1
];
2740 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2741 257] * b
[l
+ (j
+ 1) * b_dim1
];
2742 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2743 257] * b
[l
+ (j
+ 2) * b_dim1
];
2744 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2745 257] * b
[l
+ (j
+ 3) * b_dim1
];
2747 c
[i
+ j
* c_dim1
] = f11
;
2748 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2749 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2750 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2757 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2759 i5
= ii
+ uisec
- 1;
2760 for (i
= ii
; i
<= i5
; i
+= 4)
2762 f11
= c
[i
+ j
* c_dim1
];
2763 f21
= c
[i
+ 1 + j
* c_dim1
];
2764 f31
= c
[i
+ 2 + j
* c_dim1
];
2765 f41
= c
[i
+ 3 + j
* c_dim1
];
2767 for (l
= ll
; l
<= i6
; ++l
)
2769 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2770 257] * b
[l
+ j
* b_dim1
];
2771 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2772 257] * b
[l
+ j
* b_dim1
];
2773 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2774 257] * b
[l
+ j
* b_dim1
];
2775 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2776 257] * b
[l
+ j
* b_dim1
];
2778 c
[i
+ j
* c_dim1
] = f11
;
2779 c
[i
+ 1 + j
* c_dim1
] = f21
;
2780 c
[i
+ 2 + j
* c_dim1
] = f31
;
2781 c
[i
+ 3 + j
* c_dim1
] = f41
;
2784 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2786 f11
= c
[i
+ j
* c_dim1
];
2788 for (l
= ll
; l
<= i6
; ++l
)
2790 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2791 257] * b
[l
+ j
* b_dim1
];
2793 c
[i
+ j
* c_dim1
] = f11
;
2802 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2804 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2806 const GFC_COMPLEX_8
*restrict abase_x
;
2807 const GFC_COMPLEX_8
*restrict bbase_y
;
2808 GFC_COMPLEX_8
*restrict dest_y
;
2811 for (y
= 0; y
< ycount
; y
++)
2813 bbase_y
= &bbase
[y
*bystride
];
2814 dest_y
= &dest
[y
*rystride
];
2815 for (x
= 0; x
< xcount
; x
++)
2817 abase_x
= &abase
[x
*axstride
];
2818 s
= (GFC_COMPLEX_8
) 0;
2819 for (n
= 0; n
< count
; n
++)
2820 s
+= abase_x
[n
] * bbase_y
[n
];
2827 const GFC_COMPLEX_8
*restrict bbase_y
;
2830 for (y
= 0; y
< ycount
; y
++)
2832 bbase_y
= &bbase
[y
*bystride
];
2833 s
= (GFC_COMPLEX_8
) 0;
2834 for (n
= 0; n
< count
; n
++)
2835 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2836 dest
[y
*rystride
] = s
;
2840 else if (axstride
< aystride
)
2842 for (y
= 0; y
< ycount
; y
++)
2843 for (x
= 0; x
< xcount
; x
++)
2844 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
2846 for (y
= 0; y
< ycount
; y
++)
2847 for (n
= 0; n
< count
; n
++)
2848 for (x
= 0; x
< xcount
; x
++)
2849 /* dest[x,y] += a[x,n] * b[n,y] */
2850 dest
[x
*rxstride
+ y
*rystride
] +=
2851 abase
[x
*axstride
+ n
*aystride
] *
2852 bbase
[n
*bxstride
+ y
*bystride
];
2854 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2856 const GFC_COMPLEX_8
*restrict bbase_y
;
2859 for (y
= 0; y
< ycount
; y
++)
2861 bbase_y
= &bbase
[y
*bystride
];
2862 s
= (GFC_COMPLEX_8
) 0;
2863 for (n
= 0; n
< count
; n
++)
2864 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2865 dest
[y
*rxstride
] = s
;
2870 const GFC_COMPLEX_8
*restrict abase_x
;
2871 const GFC_COMPLEX_8
*restrict bbase_y
;
2872 GFC_COMPLEX_8
*restrict dest_y
;
2875 for (y
= 0; y
< ycount
; y
++)
2877 bbase_y
= &bbase
[y
*bystride
];
2878 dest_y
= &dest
[y
*rystride
];
2879 for (x
= 0; x
< xcount
; x
++)
2881 abase_x
= &abase
[x
*axstride
];
2882 s
= (GFC_COMPLEX_8
) 0;
2883 for (n
= 0; n
< count
; n
++)
2884 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2885 dest_y
[x
*rxstride
] = s
;