]>
git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_c8.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
31 #if defined (HAVE_GFC_COMPLEX_8)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
37 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_8
*, const GFC_COMPLEX_8
*,
39 const int *, const GFC_COMPLEX_8
*, const int *,
40 const GFC_COMPLEX_8
*, GFC_COMPLEX_8
*, const int *,
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_c8 (gfc_array_c8
* const restrict retarray
,
73 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
74 int blas_limit
, blas_call gemm
);
75 export_proto(matmul_c8
);
80 /* Put exhaustive list of possible architectures here here, ORed together. */
82 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
86 matmul_c8_avx (gfc_array_c8
* const restrict retarray
,
87 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
88 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
90 matmul_c8_avx (gfc_array_c8
* const restrict retarray
,
91 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
92 int blas_limit
, blas_call gemm
)
94 const GFC_COMPLEX_8
* restrict abase
;
95 const GFC_COMPLEX_8
* restrict bbase
;
96 GFC_COMPLEX_8
* restrict dest
;
98 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
99 index_type x
, y
, n
, count
, xcount
, ycount
;
101 assert (GFC_DESCRIPTOR_RANK (a
) == 2
102 || GFC_DESCRIPTOR_RANK (b
) == 2);
104 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
106 Either A or B (but not both) can be rank 1:
108 o One-dimensional argument A is implicitly treated as a row matrix
109 dimensioned [1,count], so xcount=1.
111 o One-dimensional argument B is implicitly treated as a column matrix
112 dimensioned [count, 1], so ycount=1.
115 if (retarray
->base_addr
== NULL
)
117 if (GFC_DESCRIPTOR_RANK (a
) == 1)
119 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
120 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
122 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
124 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
125 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
130 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
132 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
133 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
134 GFC_DESCRIPTOR_EXTENT(retarray
,0));
138 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
139 retarray
->offset
= 0;
141 else if (unlikely (compile_options
.bounds_check
))
143 index_type ret_extent
, arg_extent
;
145 if (GFC_DESCRIPTOR_RANK (a
) == 1)
147 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
148 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
149 if (arg_extent
!= ret_extent
)
150 runtime_error ("Incorrect extent in return array in"
151 " MATMUL intrinsic: is %ld, should be %ld",
152 (long int) ret_extent
, (long int) arg_extent
);
154 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
156 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
157 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
158 if (arg_extent
!= ret_extent
)
159 runtime_error ("Incorrect extent in return array in"
160 " MATMUL intrinsic: is %ld, should be %ld",
161 (long int) ret_extent
, (long int) arg_extent
);
165 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
166 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
167 if (arg_extent
!= ret_extent
)
168 runtime_error ("Incorrect extent in return array in"
169 " MATMUL intrinsic for dimension 1:"
170 " is %ld, should be %ld",
171 (long int) ret_extent
, (long int) arg_extent
);
173 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
174 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
175 if (arg_extent
!= ret_extent
)
176 runtime_error ("Incorrect extent in return array in"
177 " MATMUL intrinsic for dimension 2:"
178 " is %ld, should be %ld",
179 (long int) ret_extent
, (long int) arg_extent
);
184 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
186 /* One-dimensional result may be addressed in the code below
187 either as a row or a column matrix. We want both cases to
189 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
193 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
194 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
198 if (GFC_DESCRIPTOR_RANK (a
) == 1)
200 /* Treat it as a a row matrix A[1,count]. */
201 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
205 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
209 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
210 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
212 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
213 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
216 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
218 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
219 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
222 if (GFC_DESCRIPTOR_RANK (b
) == 1)
224 /* Treat it as a column matrix B[count,1] */
225 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
227 /* bystride should never be used for 1-dimensional b.
228 in case it is we want it to cause a segfault, rather than
229 an incorrect result. */
230 bystride
= 0xDEADBEEF;
235 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
236 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
237 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
240 abase
= a
->base_addr
;
241 bbase
= b
->base_addr
;
242 dest
= retarray
->base_addr
;
244 /* Now that everything is set up, we perform the multiplication
247 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
248 #define min(a,b) ((a) <= (b) ? (a) : (b))
249 #define max(a,b) ((a) >= (b) ? (a) : (b))
251 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
252 && (bxstride
== 1 || bystride
== 1)
253 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
256 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
257 const GFC_COMPLEX_8 one
= 1, zero
= 0;
258 const int lda
= (axstride
== 1) ? aystride
: axstride
,
259 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
261 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
263 assert (gemm
!= NULL
);
264 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
265 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
271 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
273 /* This block of code implements a tuned matmul, derived from
274 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
276 Bo Kagstrom and Per Ling
277 Department of Computing Science
279 S-901 87 Umea, Sweden
281 from netlib.org, translated to C, and modified for matmul.m4. */
283 const GFC_COMPLEX_8
*a
, *b
;
285 const index_type m
= xcount
, n
= ycount
, k
= count
;
287 /* System generated locals */
288 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
289 i1
, i2
, i3
, i4
, i5
, i6
;
291 /* Local variables */
292 GFC_COMPLEX_8 t1
[65536], /* was [256][256] */
293 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
294 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
295 index_type i
, j
, l
, ii
, jj
, ll
;
296 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
300 c
= retarray
->base_addr
;
302 /* Parameter adjustments */
304 c_offset
= 1 + c_dim1
;
307 a_offset
= 1 + a_dim1
;
310 b_offset
= 1 + b_dim1
;
313 /* Early exit if possible */
314 if (m
== 0 || n
== 0 || k
== 0)
320 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
322 /* Start turning the crank. */
324 for (jj
= 1; jj
<= i1
; jj
+= 512)
330 ujsec
= jsec
- jsec
% 4;
332 for (ll
= 1; ll
<= i2
; ll
+= 256)
338 ulsec
= lsec
- lsec
% 2;
341 for (ii
= 1; ii
<= i3
; ii
+= 256)
347 uisec
= isec
- isec
% 2;
349 for (l
= ll
; l
<= i4
; l
+= 2)
352 for (i
= ii
; i
<= i5
; i
+= 2)
354 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
356 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
357 a
[i
+ (l
+ 1) * a_dim1
];
358 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
359 a
[i
+ 1 + l
* a_dim1
];
360 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
361 a
[i
+ 1 + (l
+ 1) * a_dim1
];
365 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
366 a
[ii
+ isec
- 1 + l
* a_dim1
];
367 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
368 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
374 for (i
= ii
; i
<= i4
; ++i
)
376 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
377 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
381 uisec
= isec
- isec
% 4;
383 for (j
= jj
; j
<= i4
; j
+= 4)
386 for (i
= ii
; i
<= i5
; i
+= 4)
388 f11
= c
[i
+ j
* c_dim1
];
389 f21
= c
[i
+ 1 + j
* c_dim1
];
390 f12
= c
[i
+ (j
+ 1) * c_dim1
];
391 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
392 f13
= c
[i
+ (j
+ 2) * c_dim1
];
393 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
394 f14
= c
[i
+ (j
+ 3) * c_dim1
];
395 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
396 f31
= c
[i
+ 2 + j
* c_dim1
];
397 f41
= c
[i
+ 3 + j
* c_dim1
];
398 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
399 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
400 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
401 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
402 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
403 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
405 for (l
= ll
; l
<= i6
; ++l
)
407 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
409 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
411 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
412 * b
[l
+ (j
+ 1) * b_dim1
];
413 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
414 * b
[l
+ (j
+ 1) * b_dim1
];
415 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
416 * b
[l
+ (j
+ 2) * b_dim1
];
417 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
418 * b
[l
+ (j
+ 2) * b_dim1
];
419 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
420 * b
[l
+ (j
+ 3) * b_dim1
];
421 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
422 * b
[l
+ (j
+ 3) * b_dim1
];
423 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
425 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
427 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
428 * b
[l
+ (j
+ 1) * b_dim1
];
429 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
430 * b
[l
+ (j
+ 1) * b_dim1
];
431 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
432 * b
[l
+ (j
+ 2) * b_dim1
];
433 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
434 * b
[l
+ (j
+ 2) * b_dim1
];
435 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
436 * b
[l
+ (j
+ 3) * b_dim1
];
437 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
438 * b
[l
+ (j
+ 3) * b_dim1
];
440 c
[i
+ j
* c_dim1
] = f11
;
441 c
[i
+ 1 + j
* c_dim1
] = f21
;
442 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
443 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
444 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
445 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
446 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
447 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
448 c
[i
+ 2 + j
* c_dim1
] = f31
;
449 c
[i
+ 3 + j
* c_dim1
] = f41
;
450 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
451 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
452 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
453 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
454 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
455 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
460 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
462 f11
= c
[i
+ j
* c_dim1
];
463 f12
= c
[i
+ (j
+ 1) * c_dim1
];
464 f13
= c
[i
+ (j
+ 2) * c_dim1
];
465 f14
= c
[i
+ (j
+ 3) * c_dim1
];
467 for (l
= ll
; l
<= i6
; ++l
)
469 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
470 257] * b
[l
+ j
* b_dim1
];
471 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
472 257] * b
[l
+ (j
+ 1) * b_dim1
];
473 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
474 257] * b
[l
+ (j
+ 2) * b_dim1
];
475 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
476 257] * b
[l
+ (j
+ 3) * b_dim1
];
478 c
[i
+ j
* c_dim1
] = f11
;
479 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
480 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
481 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
488 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
491 for (i
= ii
; i
<= i5
; i
+= 4)
493 f11
= c
[i
+ j
* c_dim1
];
494 f21
= c
[i
+ 1 + j
* c_dim1
];
495 f31
= c
[i
+ 2 + j
* c_dim1
];
496 f41
= c
[i
+ 3 + j
* c_dim1
];
498 for (l
= ll
; l
<= i6
; ++l
)
500 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
501 257] * b
[l
+ j
* b_dim1
];
502 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
503 257] * b
[l
+ j
* b_dim1
];
504 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
505 257] * b
[l
+ j
* b_dim1
];
506 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
507 257] * b
[l
+ j
* b_dim1
];
509 c
[i
+ j
* c_dim1
] = f11
;
510 c
[i
+ 1 + j
* c_dim1
] = f21
;
511 c
[i
+ 2 + j
* c_dim1
] = f31
;
512 c
[i
+ 3 + j
* c_dim1
] = f41
;
515 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
517 f11
= c
[i
+ j
* c_dim1
];
519 for (l
= ll
; l
<= i6
; ++l
)
521 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
522 257] * b
[l
+ j
* b_dim1
];
524 c
[i
+ j
* c_dim1
] = f11
;
533 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
535 if (GFC_DESCRIPTOR_RANK (a
) != 1)
537 const GFC_COMPLEX_8
*restrict abase_x
;
538 const GFC_COMPLEX_8
*restrict bbase_y
;
539 GFC_COMPLEX_8
*restrict dest_y
;
542 for (y
= 0; y
< ycount
; y
++)
544 bbase_y
= &bbase
[y
*bystride
];
545 dest_y
= &dest
[y
*rystride
];
546 for (x
= 0; x
< xcount
; x
++)
548 abase_x
= &abase
[x
*axstride
];
549 s
= (GFC_COMPLEX_8
) 0;
550 for (n
= 0; n
< count
; n
++)
551 s
+= abase_x
[n
] * bbase_y
[n
];
558 const GFC_COMPLEX_8
*restrict bbase_y
;
561 for (y
= 0; y
< ycount
; y
++)
563 bbase_y
= &bbase
[y
*bystride
];
564 s
= (GFC_COMPLEX_8
) 0;
565 for (n
= 0; n
< count
; n
++)
566 s
+= abase
[n
*axstride
] * bbase_y
[n
];
567 dest
[y
*rystride
] = s
;
571 else if (axstride
< aystride
)
573 for (y
= 0; y
< ycount
; y
++)
574 for (x
= 0; x
< xcount
; x
++)
575 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
577 for (y
= 0; y
< ycount
; y
++)
578 for (n
= 0; n
< count
; n
++)
579 for (x
= 0; x
< xcount
; x
++)
580 /* dest[x,y] += a[x,n] * b[n,y] */
581 dest
[x
*rxstride
+ y
*rystride
] +=
582 abase
[x
*axstride
+ n
*aystride
] *
583 bbase
[n
*bxstride
+ y
*bystride
];
585 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
587 const GFC_COMPLEX_8
*restrict bbase_y
;
590 for (y
= 0; y
< ycount
; y
++)
592 bbase_y
= &bbase
[y
*bystride
];
593 s
= (GFC_COMPLEX_8
) 0;
594 for (n
= 0; n
< count
; n
++)
595 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
596 dest
[y
*rxstride
] = s
;
601 const GFC_COMPLEX_8
*restrict abase_x
;
602 const GFC_COMPLEX_8
*restrict bbase_y
;
603 GFC_COMPLEX_8
*restrict dest_y
;
606 for (y
= 0; y
< ycount
; y
++)
608 bbase_y
= &bbase
[y
*bystride
];
609 dest_y
= &dest
[y
*rystride
];
610 for (x
= 0; x
< xcount
; x
++)
612 abase_x
= &abase
[x
*axstride
];
613 s
= (GFC_COMPLEX_8
) 0;
614 for (n
= 0; n
< count
; n
++)
615 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
616 dest_y
[x
*rxstride
] = s
;
625 #endif /* HAVE_AVX */
629 matmul_c8_avx2 (gfc_array_c8
* const restrict retarray
,
630 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
631 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2")));
633 matmul_c8_avx2 (gfc_array_c8
* const restrict retarray
,
634 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
635 int blas_limit
, blas_call gemm
)
637 const GFC_COMPLEX_8
* restrict abase
;
638 const GFC_COMPLEX_8
* restrict bbase
;
639 GFC_COMPLEX_8
* restrict dest
;
641 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
642 index_type x
, y
, n
, count
, xcount
, ycount
;
644 assert (GFC_DESCRIPTOR_RANK (a
) == 2
645 || GFC_DESCRIPTOR_RANK (b
) == 2);
647 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
649 Either A or B (but not both) can be rank 1:
651 o One-dimensional argument A is implicitly treated as a row matrix
652 dimensioned [1,count], so xcount=1.
654 o One-dimensional argument B is implicitly treated as a column matrix
655 dimensioned [count, 1], so ycount=1.
658 if (retarray
->base_addr
== NULL
)
660 if (GFC_DESCRIPTOR_RANK (a
) == 1)
662 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
663 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
665 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
667 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
668 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
672 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
673 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
675 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
676 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
677 GFC_DESCRIPTOR_EXTENT(retarray
,0));
681 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
682 retarray
->offset
= 0;
684 else if (unlikely (compile_options
.bounds_check
))
686 index_type ret_extent
, arg_extent
;
688 if (GFC_DESCRIPTOR_RANK (a
) == 1)
690 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
691 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
692 if (arg_extent
!= ret_extent
)
693 runtime_error ("Incorrect extent in return array in"
694 " MATMUL intrinsic: is %ld, should be %ld",
695 (long int) ret_extent
, (long int) arg_extent
);
697 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
699 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
700 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
701 if (arg_extent
!= ret_extent
)
702 runtime_error ("Incorrect extent in return array in"
703 " MATMUL intrinsic: is %ld, should be %ld",
704 (long int) ret_extent
, (long int) arg_extent
);
708 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
709 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
710 if (arg_extent
!= ret_extent
)
711 runtime_error ("Incorrect extent in return array in"
712 " MATMUL intrinsic for dimension 1:"
713 " is %ld, should be %ld",
714 (long int) ret_extent
, (long int) arg_extent
);
716 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
717 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
718 if (arg_extent
!= ret_extent
)
719 runtime_error ("Incorrect extent in return array in"
720 " MATMUL intrinsic for dimension 2:"
721 " is %ld, should be %ld",
722 (long int) ret_extent
, (long int) arg_extent
);
727 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
729 /* One-dimensional result may be addressed in the code below
730 either as a row or a column matrix. We want both cases to
732 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
736 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
737 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
741 if (GFC_DESCRIPTOR_RANK (a
) == 1)
743 /* Treat it as a a row matrix A[1,count]. */
744 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
748 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
752 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
753 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
755 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
756 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
759 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
761 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
762 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
765 if (GFC_DESCRIPTOR_RANK (b
) == 1)
767 /* Treat it as a column matrix B[count,1] */
768 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
770 /* bystride should never be used for 1-dimensional b.
771 in case it is we want it to cause a segfault, rather than
772 an incorrect result. */
773 bystride
= 0xDEADBEEF;
778 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
779 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
780 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
783 abase
= a
->base_addr
;
784 bbase
= b
->base_addr
;
785 dest
= retarray
->base_addr
;
787 /* Now that everything is set up, we perform the multiplication
790 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
791 #define min(a,b) ((a) <= (b) ? (a) : (b))
792 #define max(a,b) ((a) >= (b) ? (a) : (b))
794 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
795 && (bxstride
== 1 || bystride
== 1)
796 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
799 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
800 const GFC_COMPLEX_8 one
= 1, zero
= 0;
801 const int lda
= (axstride
== 1) ? aystride
: axstride
,
802 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
804 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
806 assert (gemm
!= NULL
);
807 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
808 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
814 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
816 /* This block of code implements a tuned matmul, derived from
817 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
819 Bo Kagstrom and Per Ling
820 Department of Computing Science
822 S-901 87 Umea, Sweden
824 from netlib.org, translated to C, and modified for matmul.m4. */
826 const GFC_COMPLEX_8
*a
, *b
;
828 const index_type m
= xcount
, n
= ycount
, k
= count
;
830 /* System generated locals */
831 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
832 i1
, i2
, i3
, i4
, i5
, i6
;
834 /* Local variables */
835 GFC_COMPLEX_8 t1
[65536], /* was [256][256] */
836 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
837 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
838 index_type i
, j
, l
, ii
, jj
, ll
;
839 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
843 c
= retarray
->base_addr
;
845 /* Parameter adjustments */
847 c_offset
= 1 + c_dim1
;
850 a_offset
= 1 + a_dim1
;
853 b_offset
= 1 + b_dim1
;
856 /* Early exit if possible */
857 if (m
== 0 || n
== 0 || k
== 0)
863 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
865 /* Start turning the crank. */
867 for (jj
= 1; jj
<= i1
; jj
+= 512)
873 ujsec
= jsec
- jsec
% 4;
875 for (ll
= 1; ll
<= i2
; ll
+= 256)
881 ulsec
= lsec
- lsec
% 2;
884 for (ii
= 1; ii
<= i3
; ii
+= 256)
890 uisec
= isec
- isec
% 2;
892 for (l
= ll
; l
<= i4
; l
+= 2)
895 for (i
= ii
; i
<= i5
; i
+= 2)
897 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
899 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
900 a
[i
+ (l
+ 1) * a_dim1
];
901 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
902 a
[i
+ 1 + l
* a_dim1
];
903 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
904 a
[i
+ 1 + (l
+ 1) * a_dim1
];
908 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
909 a
[ii
+ isec
- 1 + l
* a_dim1
];
910 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
911 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
917 for (i
= ii
; i
<= i4
; ++i
)
919 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
920 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
924 uisec
= isec
- isec
% 4;
926 for (j
= jj
; j
<= i4
; j
+= 4)
929 for (i
= ii
; i
<= i5
; i
+= 4)
931 f11
= c
[i
+ j
* c_dim1
];
932 f21
= c
[i
+ 1 + j
* c_dim1
];
933 f12
= c
[i
+ (j
+ 1) * c_dim1
];
934 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
935 f13
= c
[i
+ (j
+ 2) * c_dim1
];
936 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
937 f14
= c
[i
+ (j
+ 3) * c_dim1
];
938 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
939 f31
= c
[i
+ 2 + j
* c_dim1
];
940 f41
= c
[i
+ 3 + j
* c_dim1
];
941 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
942 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
943 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
944 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
945 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
946 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
948 for (l
= ll
; l
<= i6
; ++l
)
950 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
952 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
954 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
955 * b
[l
+ (j
+ 1) * b_dim1
];
956 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
957 * b
[l
+ (j
+ 1) * b_dim1
];
958 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
959 * b
[l
+ (j
+ 2) * b_dim1
];
960 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
961 * b
[l
+ (j
+ 2) * b_dim1
];
962 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
963 * b
[l
+ (j
+ 3) * b_dim1
];
964 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
965 * b
[l
+ (j
+ 3) * b_dim1
];
966 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
968 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
970 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
971 * b
[l
+ (j
+ 1) * b_dim1
];
972 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
973 * b
[l
+ (j
+ 1) * b_dim1
];
974 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
975 * b
[l
+ (j
+ 2) * b_dim1
];
976 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
977 * b
[l
+ (j
+ 2) * b_dim1
];
978 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
979 * b
[l
+ (j
+ 3) * b_dim1
];
980 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
981 * b
[l
+ (j
+ 3) * b_dim1
];
983 c
[i
+ j
* c_dim1
] = f11
;
984 c
[i
+ 1 + j
* c_dim1
] = f21
;
985 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
986 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
987 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
988 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
989 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
990 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
991 c
[i
+ 2 + j
* c_dim1
] = f31
;
992 c
[i
+ 3 + j
* c_dim1
] = f41
;
993 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
994 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
995 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
996 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
997 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
998 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1003 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1005 f11
= c
[i
+ j
* c_dim1
];
1006 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1007 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1008 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1010 for (l
= ll
; l
<= i6
; ++l
)
1012 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1013 257] * b
[l
+ j
* b_dim1
];
1014 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1015 257] * b
[l
+ (j
+ 1) * b_dim1
];
1016 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1017 257] * b
[l
+ (j
+ 2) * b_dim1
];
1018 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1019 257] * b
[l
+ (j
+ 3) * b_dim1
];
1021 c
[i
+ j
* c_dim1
] = f11
;
1022 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1023 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1024 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1031 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1033 i5
= ii
+ uisec
- 1;
1034 for (i
= ii
; i
<= i5
; i
+= 4)
1036 f11
= c
[i
+ j
* c_dim1
];
1037 f21
= c
[i
+ 1 + j
* c_dim1
];
1038 f31
= c
[i
+ 2 + j
* c_dim1
];
1039 f41
= c
[i
+ 3 + j
* c_dim1
];
1041 for (l
= ll
; l
<= i6
; ++l
)
1043 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1044 257] * b
[l
+ j
* b_dim1
];
1045 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1046 257] * b
[l
+ j
* b_dim1
];
1047 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1048 257] * b
[l
+ j
* b_dim1
];
1049 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1050 257] * b
[l
+ j
* b_dim1
];
1052 c
[i
+ j
* c_dim1
] = f11
;
1053 c
[i
+ 1 + j
* c_dim1
] = f21
;
1054 c
[i
+ 2 + j
* c_dim1
] = f31
;
1055 c
[i
+ 3 + j
* c_dim1
] = f41
;
1058 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1060 f11
= c
[i
+ j
* c_dim1
];
1062 for (l
= ll
; l
<= i6
; ++l
)
1064 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1065 257] * b
[l
+ j
* b_dim1
];
1067 c
[i
+ j
* c_dim1
] = f11
;
1076 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1078 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1080 const GFC_COMPLEX_8
*restrict abase_x
;
1081 const GFC_COMPLEX_8
*restrict bbase_y
;
1082 GFC_COMPLEX_8
*restrict dest_y
;
1085 for (y
= 0; y
< ycount
; y
++)
1087 bbase_y
= &bbase
[y
*bystride
];
1088 dest_y
= &dest
[y
*rystride
];
1089 for (x
= 0; x
< xcount
; x
++)
1091 abase_x
= &abase
[x
*axstride
];
1092 s
= (GFC_COMPLEX_8
) 0;
1093 for (n
= 0; n
< count
; n
++)
1094 s
+= abase_x
[n
] * bbase_y
[n
];
1101 const GFC_COMPLEX_8
*restrict bbase_y
;
1104 for (y
= 0; y
< ycount
; y
++)
1106 bbase_y
= &bbase
[y
*bystride
];
1107 s
= (GFC_COMPLEX_8
) 0;
1108 for (n
= 0; n
< count
; n
++)
1109 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1110 dest
[y
*rystride
] = s
;
1114 else if (axstride
< aystride
)
1116 for (y
= 0; y
< ycount
; y
++)
1117 for (x
= 0; x
< xcount
; x
++)
1118 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
1120 for (y
= 0; y
< ycount
; y
++)
1121 for (n
= 0; n
< count
; n
++)
1122 for (x
= 0; x
< xcount
; x
++)
1123 /* dest[x,y] += a[x,n] * b[n,y] */
1124 dest
[x
*rxstride
+ y
*rystride
] +=
1125 abase
[x
*axstride
+ n
*aystride
] *
1126 bbase
[n
*bxstride
+ y
*bystride
];
1128 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1130 const GFC_COMPLEX_8
*restrict bbase_y
;
1133 for (y
= 0; y
< ycount
; y
++)
1135 bbase_y
= &bbase
[y
*bystride
];
1136 s
= (GFC_COMPLEX_8
) 0;
1137 for (n
= 0; n
< count
; n
++)
1138 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1139 dest
[y
*rxstride
] = s
;
1144 const GFC_COMPLEX_8
*restrict abase_x
;
1145 const GFC_COMPLEX_8
*restrict bbase_y
;
1146 GFC_COMPLEX_8
*restrict dest_y
;
1149 for (y
= 0; y
< ycount
; y
++)
1151 bbase_y
= &bbase
[y
*bystride
];
1152 dest_y
= &dest
[y
*rystride
];
1153 for (x
= 0; x
< xcount
; x
++)
1155 abase_x
= &abase
[x
*axstride
];
1156 s
= (GFC_COMPLEX_8
) 0;
1157 for (n
= 0; n
< count
; n
++)
1158 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1159 dest_y
[x
*rxstride
] = s
;
1168 #endif /* HAVE_AVX2 */
1172 matmul_c8_avx512f (gfc_array_c8
* const restrict retarray
,
1173 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1174 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1176 matmul_c8_avx512f (gfc_array_c8
* const restrict retarray
,
1177 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1178 int blas_limit
, blas_call gemm
)
1180 const GFC_COMPLEX_8
* restrict abase
;
1181 const GFC_COMPLEX_8
* restrict bbase
;
1182 GFC_COMPLEX_8
* restrict dest
;
1184 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1185 index_type x
, y
, n
, count
, xcount
, ycount
;
1187 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1188 || GFC_DESCRIPTOR_RANK (b
) == 2);
1190 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1192 Either A or B (but not both) can be rank 1:
1194 o One-dimensional argument A is implicitly treated as a row matrix
1195 dimensioned [1,count], so xcount=1.
1197 o One-dimensional argument B is implicitly treated as a column matrix
1198 dimensioned [count, 1], so ycount=1.
1201 if (retarray
->base_addr
== NULL
)
1203 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1205 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1206 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1208 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1210 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1211 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1215 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1216 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1218 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1219 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1220 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1224 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
1225 retarray
->offset
= 0;
1227 else if (unlikely (compile_options
.bounds_check
))
1229 index_type ret_extent
, arg_extent
;
1231 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1233 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1234 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1235 if (arg_extent
!= ret_extent
)
1236 runtime_error ("Incorrect extent in return array in"
1237 " MATMUL intrinsic: is %ld, should be %ld",
1238 (long int) ret_extent
, (long int) arg_extent
);
1240 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1242 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1243 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1244 if (arg_extent
!= ret_extent
)
1245 runtime_error ("Incorrect extent in return array in"
1246 " MATMUL intrinsic: is %ld, should be %ld",
1247 (long int) ret_extent
, (long int) arg_extent
);
1251 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1252 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1253 if (arg_extent
!= ret_extent
)
1254 runtime_error ("Incorrect extent in return array in"
1255 " MATMUL intrinsic for dimension 1:"
1256 " is %ld, should be %ld",
1257 (long int) ret_extent
, (long int) arg_extent
);
1259 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1260 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1261 if (arg_extent
!= ret_extent
)
1262 runtime_error ("Incorrect extent in return array in"
1263 " MATMUL intrinsic for dimension 2:"
1264 " is %ld, should be %ld",
1265 (long int) ret_extent
, (long int) arg_extent
);
1270 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1272 /* One-dimensional result may be addressed in the code below
1273 either as a row or a column matrix. We want both cases to
1275 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1279 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1280 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1284 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1286 /* Treat it as a a row matrix A[1,count]. */
1287 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1291 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1295 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1296 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1298 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1299 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1302 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1304 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1305 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1308 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1310 /* Treat it as a column matrix B[count,1] */
1311 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1313 /* bystride should never be used for 1-dimensional b.
1314 in case it is we want it to cause a segfault, rather than
1315 an incorrect result. */
1316 bystride
= 0xDEADBEEF;
1321 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1322 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1323 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1326 abase
= a
->base_addr
;
1327 bbase
= b
->base_addr
;
1328 dest
= retarray
->base_addr
;
1330 /* Now that everything is set up, we perform the multiplication
1333 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1334 #define min(a,b) ((a) <= (b) ? (a) : (b))
1335 #define max(a,b) ((a) >= (b) ? (a) : (b))
1337 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1338 && (bxstride
== 1 || bystride
== 1)
1339 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1340 > POW3(blas_limit
)))
1342 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1343 const GFC_COMPLEX_8 one
= 1, zero
= 0;
1344 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1345 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1347 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1349 assert (gemm
!= NULL
);
1350 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1351 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1357 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1359 /* This block of code implements a tuned matmul, derived from
1360 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1362 Bo Kagstrom and Per Ling
1363 Department of Computing Science
1365 S-901 87 Umea, Sweden
1367 from netlib.org, translated to C, and modified for matmul.m4. */
1369 const GFC_COMPLEX_8
*a
, *b
;
1371 const index_type m
= xcount
, n
= ycount
, k
= count
;
1373 /* System generated locals */
1374 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1375 i1
, i2
, i3
, i4
, i5
, i6
;
1377 /* Local variables */
1378 GFC_COMPLEX_8 t1
[65536], /* was [256][256] */
1379 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1380 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1381 index_type i
, j
, l
, ii
, jj
, ll
;
1382 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1386 c
= retarray
->base_addr
;
1388 /* Parameter adjustments */
1390 c_offset
= 1 + c_dim1
;
1393 a_offset
= 1 + a_dim1
;
1396 b_offset
= 1 + b_dim1
;
1399 /* Early exit if possible */
1400 if (m
== 0 || n
== 0 || k
== 0)
1403 /* Empty c first. */
1404 for (j
=1; j
<=n
; j
++)
1405 for (i
=1; i
<=m
; i
++)
1406 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
1408 /* Start turning the crank. */
1410 for (jj
= 1; jj
<= i1
; jj
+= 512)
1416 ujsec
= jsec
- jsec
% 4;
1418 for (ll
= 1; ll
<= i2
; ll
+= 256)
1424 ulsec
= lsec
- lsec
% 2;
1427 for (ii
= 1; ii
<= i3
; ii
+= 256)
1433 uisec
= isec
- isec
% 2;
1434 i4
= ll
+ ulsec
- 1;
1435 for (l
= ll
; l
<= i4
; l
+= 2)
1437 i5
= ii
+ uisec
- 1;
1438 for (i
= ii
; i
<= i5
; i
+= 2)
1440 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1442 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1443 a
[i
+ (l
+ 1) * a_dim1
];
1444 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1445 a
[i
+ 1 + l
* a_dim1
];
1446 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1447 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1451 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1452 a
[ii
+ isec
- 1 + l
* a_dim1
];
1453 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1454 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1460 for (i
= ii
; i
<= i4
; ++i
)
1462 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1463 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1467 uisec
= isec
- isec
% 4;
1468 i4
= jj
+ ujsec
- 1;
1469 for (j
= jj
; j
<= i4
; j
+= 4)
1471 i5
= ii
+ uisec
- 1;
1472 for (i
= ii
; i
<= i5
; i
+= 4)
1474 f11
= c
[i
+ j
* c_dim1
];
1475 f21
= c
[i
+ 1 + j
* c_dim1
];
1476 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1477 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1478 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1479 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1480 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1481 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1482 f31
= c
[i
+ 2 + j
* c_dim1
];
1483 f41
= c
[i
+ 3 + j
* c_dim1
];
1484 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1485 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1486 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1487 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1488 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1489 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1491 for (l
= ll
; l
<= i6
; ++l
)
1493 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1494 * b
[l
+ j
* b_dim1
];
1495 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1496 * b
[l
+ j
* b_dim1
];
1497 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1498 * b
[l
+ (j
+ 1) * b_dim1
];
1499 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1500 * b
[l
+ (j
+ 1) * b_dim1
];
1501 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1502 * b
[l
+ (j
+ 2) * b_dim1
];
1503 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1504 * b
[l
+ (j
+ 2) * b_dim1
];
1505 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1506 * b
[l
+ (j
+ 3) * b_dim1
];
1507 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1508 * b
[l
+ (j
+ 3) * b_dim1
];
1509 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1510 * b
[l
+ j
* b_dim1
];
1511 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1512 * b
[l
+ j
* b_dim1
];
1513 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1514 * b
[l
+ (j
+ 1) * b_dim1
];
1515 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1516 * b
[l
+ (j
+ 1) * b_dim1
];
1517 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1518 * b
[l
+ (j
+ 2) * b_dim1
];
1519 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1520 * b
[l
+ (j
+ 2) * b_dim1
];
1521 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1522 * b
[l
+ (j
+ 3) * b_dim1
];
1523 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1524 * b
[l
+ (j
+ 3) * b_dim1
];
1526 c
[i
+ j
* c_dim1
] = f11
;
1527 c
[i
+ 1 + j
* c_dim1
] = f21
;
1528 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1529 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1530 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1531 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1532 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1533 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1534 c
[i
+ 2 + j
* c_dim1
] = f31
;
1535 c
[i
+ 3 + j
* c_dim1
] = f41
;
1536 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1537 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1538 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1539 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1540 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1541 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1546 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1548 f11
= c
[i
+ j
* c_dim1
];
1549 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1550 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1551 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1553 for (l
= ll
; l
<= i6
; ++l
)
1555 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1556 257] * b
[l
+ j
* b_dim1
];
1557 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1558 257] * b
[l
+ (j
+ 1) * b_dim1
];
1559 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1560 257] * b
[l
+ (j
+ 2) * b_dim1
];
1561 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1562 257] * b
[l
+ (j
+ 3) * b_dim1
];
1564 c
[i
+ j
* c_dim1
] = f11
;
1565 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1566 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1567 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1574 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1576 i5
= ii
+ uisec
- 1;
1577 for (i
= ii
; i
<= i5
; i
+= 4)
1579 f11
= c
[i
+ j
* c_dim1
];
1580 f21
= c
[i
+ 1 + j
* c_dim1
];
1581 f31
= c
[i
+ 2 + j
* c_dim1
];
1582 f41
= c
[i
+ 3 + j
* c_dim1
];
1584 for (l
= ll
; l
<= i6
; ++l
)
1586 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1587 257] * b
[l
+ j
* b_dim1
];
1588 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1589 257] * b
[l
+ j
* b_dim1
];
1590 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1591 257] * b
[l
+ j
* b_dim1
];
1592 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1593 257] * b
[l
+ j
* b_dim1
];
1595 c
[i
+ j
* c_dim1
] = f11
;
1596 c
[i
+ 1 + j
* c_dim1
] = f21
;
1597 c
[i
+ 2 + j
* c_dim1
] = f31
;
1598 c
[i
+ 3 + j
* c_dim1
] = f41
;
1601 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1603 f11
= c
[i
+ j
* c_dim1
];
1605 for (l
= ll
; l
<= i6
; ++l
)
1607 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1608 257] * b
[l
+ j
* b_dim1
];
1610 c
[i
+ j
* c_dim1
] = f11
;
1619 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1621 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1623 const GFC_COMPLEX_8
*restrict abase_x
;
1624 const GFC_COMPLEX_8
*restrict bbase_y
;
1625 GFC_COMPLEX_8
*restrict dest_y
;
1628 for (y
= 0; y
< ycount
; y
++)
1630 bbase_y
= &bbase
[y
*bystride
];
1631 dest_y
= &dest
[y
*rystride
];
1632 for (x
= 0; x
< xcount
; x
++)
1634 abase_x
= &abase
[x
*axstride
];
1635 s
= (GFC_COMPLEX_8
) 0;
1636 for (n
= 0; n
< count
; n
++)
1637 s
+= abase_x
[n
] * bbase_y
[n
];
1644 const GFC_COMPLEX_8
*restrict bbase_y
;
1647 for (y
= 0; y
< ycount
; y
++)
1649 bbase_y
= &bbase
[y
*bystride
];
1650 s
= (GFC_COMPLEX_8
) 0;
1651 for (n
= 0; n
< count
; n
++)
1652 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1653 dest
[y
*rystride
] = s
;
1657 else if (axstride
< aystride
)
1659 for (y
= 0; y
< ycount
; y
++)
1660 for (x
= 0; x
< xcount
; x
++)
1661 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
1663 for (y
= 0; y
< ycount
; y
++)
1664 for (n
= 0; n
< count
; n
++)
1665 for (x
= 0; x
< xcount
; x
++)
1666 /* dest[x,y] += a[x,n] * b[n,y] */
1667 dest
[x
*rxstride
+ y
*rystride
] +=
1668 abase
[x
*axstride
+ n
*aystride
] *
1669 bbase
[n
*bxstride
+ y
*bystride
];
1671 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1673 const GFC_COMPLEX_8
*restrict bbase_y
;
1676 for (y
= 0; y
< ycount
; y
++)
1678 bbase_y
= &bbase
[y
*bystride
];
1679 s
= (GFC_COMPLEX_8
) 0;
1680 for (n
= 0; n
< count
; n
++)
1681 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1682 dest
[y
*rxstride
] = s
;
1687 const GFC_COMPLEX_8
*restrict abase_x
;
1688 const GFC_COMPLEX_8
*restrict bbase_y
;
1689 GFC_COMPLEX_8
*restrict dest_y
;
1692 for (y
= 0; y
< ycount
; y
++)
1694 bbase_y
= &bbase
[y
*bystride
];
1695 dest_y
= &dest
[y
*rystride
];
1696 for (x
= 0; x
< xcount
; x
++)
1698 abase_x
= &abase
[x
*axstride
];
1699 s
= (GFC_COMPLEX_8
) 0;
1700 for (n
= 0; n
< count
; n
++)
1701 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1702 dest_y
[x
*rxstride
] = s
;
1711 #endif /* HAVE_AVX512F */
1713 /* Function to fall back to if there is no special processor-specific version. */
1715 matmul_c8_vanilla (gfc_array_c8
* const restrict retarray
,
1716 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
1717 int blas_limit
, blas_call gemm
)
1719 const GFC_COMPLEX_8
* restrict abase
;
1720 const GFC_COMPLEX_8
* restrict bbase
;
1721 GFC_COMPLEX_8
* restrict dest
;
1723 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1724 index_type x
, y
, n
, count
, xcount
, ycount
;
1726 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1727 || GFC_DESCRIPTOR_RANK (b
) == 2);
1729 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1731 Either A or B (but not both) can be rank 1:
1733 o One-dimensional argument A is implicitly treated as a row matrix
1734 dimensioned [1,count], so xcount=1.
1736 o One-dimensional argument B is implicitly treated as a column matrix
1737 dimensioned [count, 1], so ycount=1.
1740 if (retarray
->base_addr
== NULL
)
1742 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1744 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1745 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1747 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1749 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1750 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1754 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1755 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1757 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1758 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1759 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1763 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
1764 retarray
->offset
= 0;
1766 else if (unlikely (compile_options
.bounds_check
))
1768 index_type ret_extent
, arg_extent
;
1770 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1772 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1773 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1774 if (arg_extent
!= ret_extent
)
1775 runtime_error ("Incorrect extent in return array in"
1776 " MATMUL intrinsic: is %ld, should be %ld",
1777 (long int) ret_extent
, (long int) arg_extent
);
1779 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1781 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1782 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1783 if (arg_extent
!= ret_extent
)
1784 runtime_error ("Incorrect extent in return array in"
1785 " MATMUL intrinsic: is %ld, should be %ld",
1786 (long int) ret_extent
, (long int) arg_extent
);
1790 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1791 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1792 if (arg_extent
!= ret_extent
)
1793 runtime_error ("Incorrect extent in return array in"
1794 " MATMUL intrinsic for dimension 1:"
1795 " is %ld, should be %ld",
1796 (long int) ret_extent
, (long int) arg_extent
);
1798 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1799 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1800 if (arg_extent
!= ret_extent
)
1801 runtime_error ("Incorrect extent in return array in"
1802 " MATMUL intrinsic for dimension 2:"
1803 " is %ld, should be %ld",
1804 (long int) ret_extent
, (long int) arg_extent
);
1809 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1811 /* One-dimensional result may be addressed in the code below
1812 either as a row or a column matrix. We want both cases to
1814 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1818 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1819 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1823 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1825 /* Treat it as a a row matrix A[1,count]. */
1826 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1830 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1834 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1835 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1837 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1838 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1841 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1843 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1844 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1847 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1849 /* Treat it as a column matrix B[count,1] */
1850 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1852 /* bystride should never be used for 1-dimensional b.
1853 in case it is we want it to cause a segfault, rather than
1854 an incorrect result. */
1855 bystride
= 0xDEADBEEF;
1860 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1861 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1862 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1865 abase
= a
->base_addr
;
1866 bbase
= b
->base_addr
;
1867 dest
= retarray
->base_addr
;
1869 /* Now that everything is set up, we perform the multiplication
1872 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1873 #define min(a,b) ((a) <= (b) ? (a) : (b))
1874 #define max(a,b) ((a) >= (b) ? (a) : (b))
1876 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1877 && (bxstride
== 1 || bystride
== 1)
1878 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1879 > POW3(blas_limit
)))
1881 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1882 const GFC_COMPLEX_8 one
= 1, zero
= 0;
1883 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1884 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1886 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1888 assert (gemm
!= NULL
);
1889 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1890 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1896 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1898 /* This block of code implements a tuned matmul, derived from
1899 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1901 Bo Kagstrom and Per Ling
1902 Department of Computing Science
1904 S-901 87 Umea, Sweden
1906 from netlib.org, translated to C, and modified for matmul.m4. */
1908 const GFC_COMPLEX_8
*a
, *b
;
1910 const index_type m
= xcount
, n
= ycount
, k
= count
;
1912 /* System generated locals */
1913 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1914 i1
, i2
, i3
, i4
, i5
, i6
;
1916 /* Local variables */
1917 GFC_COMPLEX_8 t1
[65536], /* was [256][256] */
1918 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1919 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1920 index_type i
, j
, l
, ii
, jj
, ll
;
1921 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1925 c
= retarray
->base_addr
;
1927 /* Parameter adjustments */
1929 c_offset
= 1 + c_dim1
;
1932 a_offset
= 1 + a_dim1
;
1935 b_offset
= 1 + b_dim1
;
1938 /* Early exit if possible */
1939 if (m
== 0 || n
== 0 || k
== 0)
1942 /* Empty c first. */
1943 for (j
=1; j
<=n
; j
++)
1944 for (i
=1; i
<=m
; i
++)
1945 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
1947 /* Start turning the crank. */
1949 for (jj
= 1; jj
<= i1
; jj
+= 512)
1955 ujsec
= jsec
- jsec
% 4;
1957 for (ll
= 1; ll
<= i2
; ll
+= 256)
1963 ulsec
= lsec
- lsec
% 2;
1966 for (ii
= 1; ii
<= i3
; ii
+= 256)
1972 uisec
= isec
- isec
% 2;
1973 i4
= ll
+ ulsec
- 1;
1974 for (l
= ll
; l
<= i4
; l
+= 2)
1976 i5
= ii
+ uisec
- 1;
1977 for (i
= ii
; i
<= i5
; i
+= 2)
1979 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1981 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1982 a
[i
+ (l
+ 1) * a_dim1
];
1983 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1984 a
[i
+ 1 + l
* a_dim1
];
1985 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1986 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1990 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1991 a
[ii
+ isec
- 1 + l
* a_dim1
];
1992 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1993 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1999 for (i
= ii
; i
<= i4
; ++i
)
2001 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2002 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2006 uisec
= isec
- isec
% 4;
2007 i4
= jj
+ ujsec
- 1;
2008 for (j
= jj
; j
<= i4
; j
+= 4)
2010 i5
= ii
+ uisec
- 1;
2011 for (i
= ii
; i
<= i5
; i
+= 4)
2013 f11
= c
[i
+ j
* c_dim1
];
2014 f21
= c
[i
+ 1 + j
* c_dim1
];
2015 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2016 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2017 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2018 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2019 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2020 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2021 f31
= c
[i
+ 2 + j
* c_dim1
];
2022 f41
= c
[i
+ 3 + j
* c_dim1
];
2023 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2024 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2025 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2026 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2027 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2028 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2030 for (l
= ll
; l
<= i6
; ++l
)
2032 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2033 * b
[l
+ j
* b_dim1
];
2034 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2035 * b
[l
+ j
* b_dim1
];
2036 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2037 * b
[l
+ (j
+ 1) * b_dim1
];
2038 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2039 * b
[l
+ (j
+ 1) * b_dim1
];
2040 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2041 * b
[l
+ (j
+ 2) * b_dim1
];
2042 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2043 * b
[l
+ (j
+ 2) * b_dim1
];
2044 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2045 * b
[l
+ (j
+ 3) * b_dim1
];
2046 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2047 * b
[l
+ (j
+ 3) * b_dim1
];
2048 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2049 * b
[l
+ j
* b_dim1
];
2050 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2051 * b
[l
+ j
* b_dim1
];
2052 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2053 * b
[l
+ (j
+ 1) * b_dim1
];
2054 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2055 * b
[l
+ (j
+ 1) * b_dim1
];
2056 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2057 * b
[l
+ (j
+ 2) * b_dim1
];
2058 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2059 * b
[l
+ (j
+ 2) * b_dim1
];
2060 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2061 * b
[l
+ (j
+ 3) * b_dim1
];
2062 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2063 * b
[l
+ (j
+ 3) * b_dim1
];
2065 c
[i
+ j
* c_dim1
] = f11
;
2066 c
[i
+ 1 + j
* c_dim1
] = f21
;
2067 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2068 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2069 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2070 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2071 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2072 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2073 c
[i
+ 2 + j
* c_dim1
] = f31
;
2074 c
[i
+ 3 + j
* c_dim1
] = f41
;
2075 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2076 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2077 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2078 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2079 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2080 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2085 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2087 f11
= c
[i
+ j
* c_dim1
];
2088 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2089 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2090 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2092 for (l
= ll
; l
<= i6
; ++l
)
2094 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2095 257] * b
[l
+ j
* b_dim1
];
2096 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2097 257] * b
[l
+ (j
+ 1) * b_dim1
];
2098 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2099 257] * b
[l
+ (j
+ 2) * b_dim1
];
2100 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2101 257] * b
[l
+ (j
+ 3) * b_dim1
];
2103 c
[i
+ j
* c_dim1
] = f11
;
2104 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2105 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2106 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2113 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2115 i5
= ii
+ uisec
- 1;
2116 for (i
= ii
; i
<= i5
; i
+= 4)
2118 f11
= c
[i
+ j
* c_dim1
];
2119 f21
= c
[i
+ 1 + j
* c_dim1
];
2120 f31
= c
[i
+ 2 + j
* c_dim1
];
2121 f41
= c
[i
+ 3 + j
* c_dim1
];
2123 for (l
= ll
; l
<= i6
; ++l
)
2125 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2126 257] * b
[l
+ j
* b_dim1
];
2127 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2128 257] * b
[l
+ j
* b_dim1
];
2129 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2130 257] * b
[l
+ j
* b_dim1
];
2131 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2132 257] * b
[l
+ j
* b_dim1
];
2134 c
[i
+ j
* c_dim1
] = f11
;
2135 c
[i
+ 1 + j
* c_dim1
] = f21
;
2136 c
[i
+ 2 + j
* c_dim1
] = f31
;
2137 c
[i
+ 3 + j
* c_dim1
] = f41
;
2140 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2142 f11
= c
[i
+ j
* c_dim1
];
2144 for (l
= ll
; l
<= i6
; ++l
)
2146 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2147 257] * b
[l
+ j
* b_dim1
];
2149 c
[i
+ j
* c_dim1
] = f11
;
2158 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2160 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2162 const GFC_COMPLEX_8
*restrict abase_x
;
2163 const GFC_COMPLEX_8
*restrict bbase_y
;
2164 GFC_COMPLEX_8
*restrict dest_y
;
2167 for (y
= 0; y
< ycount
; y
++)
2169 bbase_y
= &bbase
[y
*bystride
];
2170 dest_y
= &dest
[y
*rystride
];
2171 for (x
= 0; x
< xcount
; x
++)
2173 abase_x
= &abase
[x
*axstride
];
2174 s
= (GFC_COMPLEX_8
) 0;
2175 for (n
= 0; n
< count
; n
++)
2176 s
+= abase_x
[n
] * bbase_y
[n
];
2183 const GFC_COMPLEX_8
*restrict bbase_y
;
2186 for (y
= 0; y
< ycount
; y
++)
2188 bbase_y
= &bbase
[y
*bystride
];
2189 s
= (GFC_COMPLEX_8
) 0;
2190 for (n
= 0; n
< count
; n
++)
2191 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2192 dest
[y
*rystride
] = s
;
2196 else if (axstride
< aystride
)
2198 for (y
= 0; y
< ycount
; y
++)
2199 for (x
= 0; x
< xcount
; x
++)
2200 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
2202 for (y
= 0; y
< ycount
; y
++)
2203 for (n
= 0; n
< count
; n
++)
2204 for (x
= 0; x
< xcount
; x
++)
2205 /* dest[x,y] += a[x,n] * b[n,y] */
2206 dest
[x
*rxstride
+ y
*rystride
] +=
2207 abase
[x
*axstride
+ n
*aystride
] *
2208 bbase
[n
*bxstride
+ y
*bystride
];
2210 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2212 const GFC_COMPLEX_8
*restrict bbase_y
;
2215 for (y
= 0; y
< ycount
; y
++)
2217 bbase_y
= &bbase
[y
*bystride
];
2218 s
= (GFC_COMPLEX_8
) 0;
2219 for (n
= 0; n
< count
; n
++)
2220 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2221 dest
[y
*rxstride
] = s
;
2226 const GFC_COMPLEX_8
*restrict abase_x
;
2227 const GFC_COMPLEX_8
*restrict bbase_y
;
2228 GFC_COMPLEX_8
*restrict dest_y
;
2231 for (y
= 0; y
< ycount
; y
++)
2233 bbase_y
= &bbase
[y
*bystride
];
2234 dest_y
= &dest
[y
*rystride
];
2235 for (x
= 0; x
< xcount
; x
++)
2237 abase_x
= &abase
[x
*axstride
];
2238 s
= (GFC_COMPLEX_8
) 0;
2239 for (n
= 0; n
< count
; n
++)
2240 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2241 dest_y
[x
*rxstride
] = s
;
2251 /* Compiling main function, with selection code for the processor. */
2253 /* Currently, this is i386 only. Adjust for other architectures. */
2255 #include <config/i386/cpuinfo.h>
2256 void matmul_c8 (gfc_array_c8
* const restrict retarray
,
2257 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2258 int blas_limit
, blas_call gemm
)
2260 static void (*matmul_p
) (gfc_array_c8
* const restrict retarray
,
2261 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2262 int blas_limit
, blas_call gemm
) = NULL
;
2264 if (matmul_p
== NULL
)
2266 matmul_p
= matmul_c8_vanilla
;
2267 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2269 /* Run down the available processors in order of preference. */
2271 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2273 matmul_p
= matmul_c8_avx512f
;
2277 #endif /* HAVE_AVX512F */
2280 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2282 matmul_p
= matmul_c8_avx2
;
2289 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2291 matmul_p
= matmul_c8_avx
;
2294 #endif /* HAVE_AVX */
2299 (*matmul_p
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2302 #else /* Just the vanilla function. */
2305 matmul_c8 (gfc_array_c8
* const restrict retarray
,
2306 gfc_array_c8
* const restrict a
, gfc_array_c8
* const restrict b
, int try_blas
,
2307 int blas_limit
, blas_call gemm
)
2309 const GFC_COMPLEX_8
* restrict abase
;
2310 const GFC_COMPLEX_8
* restrict bbase
;
2311 GFC_COMPLEX_8
* restrict dest
;
2313 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2314 index_type x
, y
, n
, count
, xcount
, ycount
;
2316 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2317 || GFC_DESCRIPTOR_RANK (b
) == 2);
2319 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2321 Either A or B (but not both) can be rank 1:
2323 o One-dimensional argument A is implicitly treated as a row matrix
2324 dimensioned [1,count], so xcount=1.
2326 o One-dimensional argument B is implicitly treated as a column matrix
2327 dimensioned [count, 1], so ycount=1.
2330 if (retarray
->base_addr
== NULL
)
2332 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2334 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2335 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2337 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2339 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2340 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2344 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2345 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2347 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2348 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2349 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2353 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_COMPLEX_8
));
2354 retarray
->offset
= 0;
2356 else if (unlikely (compile_options
.bounds_check
))
2358 index_type ret_extent
, arg_extent
;
2360 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2362 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2363 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2364 if (arg_extent
!= ret_extent
)
2365 runtime_error ("Incorrect extent in return array in"
2366 " MATMUL intrinsic: is %ld, should be %ld",
2367 (long int) ret_extent
, (long int) arg_extent
);
2369 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2371 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2372 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2373 if (arg_extent
!= ret_extent
)
2374 runtime_error ("Incorrect extent in return array in"
2375 " MATMUL intrinsic: is %ld, should be %ld",
2376 (long int) ret_extent
, (long int) arg_extent
);
2380 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2381 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2382 if (arg_extent
!= ret_extent
)
2383 runtime_error ("Incorrect extent in return array in"
2384 " MATMUL intrinsic for dimension 1:"
2385 " is %ld, should be %ld",
2386 (long int) ret_extent
, (long int) arg_extent
);
2388 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2389 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2390 if (arg_extent
!= ret_extent
)
2391 runtime_error ("Incorrect extent in return array in"
2392 " MATMUL intrinsic for dimension 2:"
2393 " is %ld, should be %ld",
2394 (long int) ret_extent
, (long int) arg_extent
);
2399 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2401 /* One-dimensional result may be addressed in the code below
2402 either as a row or a column matrix. We want both cases to
2404 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2408 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2409 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2413 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2415 /* Treat it as a a row matrix A[1,count]. */
2416 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2420 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2424 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2425 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2427 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2428 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2431 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2433 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2434 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2437 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2439 /* Treat it as a column matrix B[count,1] */
2440 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2442 /* bystride should never be used for 1-dimensional b.
2443 in case it is we want it to cause a segfault, rather than
2444 an incorrect result. */
2445 bystride
= 0xDEADBEEF;
2450 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2451 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2452 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2455 abase
= a
->base_addr
;
2456 bbase
= b
->base_addr
;
2457 dest
= retarray
->base_addr
;
2459 /* Now that everything is set up, we perform the multiplication
2462 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2463 #define min(a,b) ((a) <= (b) ? (a) : (b))
2464 #define max(a,b) ((a) >= (b) ? (a) : (b))
2466 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2467 && (bxstride
== 1 || bystride
== 1)
2468 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2469 > POW3(blas_limit
)))
2471 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2472 const GFC_COMPLEX_8 one
= 1, zero
= 0;
2473 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2474 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2476 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2478 assert (gemm
!= NULL
);
2479 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2480 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2486 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2488 /* This block of code implements a tuned matmul, derived from
2489 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2491 Bo Kagstrom and Per Ling
2492 Department of Computing Science
2494 S-901 87 Umea, Sweden
2496 from netlib.org, translated to C, and modified for matmul.m4. */
2498 const GFC_COMPLEX_8
*a
, *b
;
2500 const index_type m
= xcount
, n
= ycount
, k
= count
;
2502 /* System generated locals */
2503 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2504 i1
, i2
, i3
, i4
, i5
, i6
;
2506 /* Local variables */
2507 GFC_COMPLEX_8 t1
[65536], /* was [256][256] */
2508 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2509 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2510 index_type i
, j
, l
, ii
, jj
, ll
;
2511 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2515 c
= retarray
->base_addr
;
2517 /* Parameter adjustments */
2519 c_offset
= 1 + c_dim1
;
2522 a_offset
= 1 + a_dim1
;
2525 b_offset
= 1 + b_dim1
;
2528 /* Early exit if possible */
2529 if (m
== 0 || n
== 0 || k
== 0)
2532 /* Empty c first. */
2533 for (j
=1; j
<=n
; j
++)
2534 for (i
=1; i
<=m
; i
++)
2535 c
[i
+ j
* c_dim1
] = (GFC_COMPLEX_8
)0;
2537 /* Start turning the crank. */
2539 for (jj
= 1; jj
<= i1
; jj
+= 512)
2545 ujsec
= jsec
- jsec
% 4;
2547 for (ll
= 1; ll
<= i2
; ll
+= 256)
2553 ulsec
= lsec
- lsec
% 2;
2556 for (ii
= 1; ii
<= i3
; ii
+= 256)
2562 uisec
= isec
- isec
% 2;
2563 i4
= ll
+ ulsec
- 1;
2564 for (l
= ll
; l
<= i4
; l
+= 2)
2566 i5
= ii
+ uisec
- 1;
2567 for (i
= ii
; i
<= i5
; i
+= 2)
2569 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2571 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2572 a
[i
+ (l
+ 1) * a_dim1
];
2573 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2574 a
[i
+ 1 + l
* a_dim1
];
2575 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2576 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2580 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2581 a
[ii
+ isec
- 1 + l
* a_dim1
];
2582 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2583 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2589 for (i
= ii
; i
<= i4
; ++i
)
2591 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2592 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2596 uisec
= isec
- isec
% 4;
2597 i4
= jj
+ ujsec
- 1;
2598 for (j
= jj
; j
<= i4
; j
+= 4)
2600 i5
= ii
+ uisec
- 1;
2601 for (i
= ii
; i
<= i5
; i
+= 4)
2603 f11
= c
[i
+ j
* c_dim1
];
2604 f21
= c
[i
+ 1 + j
* c_dim1
];
2605 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2606 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2607 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2608 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2609 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2610 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2611 f31
= c
[i
+ 2 + j
* c_dim1
];
2612 f41
= c
[i
+ 3 + j
* c_dim1
];
2613 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2614 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2615 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2616 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2617 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2618 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2620 for (l
= ll
; l
<= i6
; ++l
)
2622 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2623 * b
[l
+ j
* b_dim1
];
2624 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2625 * b
[l
+ j
* b_dim1
];
2626 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2627 * b
[l
+ (j
+ 1) * b_dim1
];
2628 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2629 * b
[l
+ (j
+ 1) * b_dim1
];
2630 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2631 * b
[l
+ (j
+ 2) * b_dim1
];
2632 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2633 * b
[l
+ (j
+ 2) * b_dim1
];
2634 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2635 * b
[l
+ (j
+ 3) * b_dim1
];
2636 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2637 * b
[l
+ (j
+ 3) * b_dim1
];
2638 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2639 * b
[l
+ j
* b_dim1
];
2640 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2641 * b
[l
+ j
* b_dim1
];
2642 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2643 * b
[l
+ (j
+ 1) * b_dim1
];
2644 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2645 * b
[l
+ (j
+ 1) * b_dim1
];
2646 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2647 * b
[l
+ (j
+ 2) * b_dim1
];
2648 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2649 * b
[l
+ (j
+ 2) * b_dim1
];
2650 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2651 * b
[l
+ (j
+ 3) * b_dim1
];
2652 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2653 * b
[l
+ (j
+ 3) * b_dim1
];
2655 c
[i
+ j
* c_dim1
] = f11
;
2656 c
[i
+ 1 + j
* c_dim1
] = f21
;
2657 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2658 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2659 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2660 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2661 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2662 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2663 c
[i
+ 2 + j
* c_dim1
] = f31
;
2664 c
[i
+ 3 + j
* c_dim1
] = f41
;
2665 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2666 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2667 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2668 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2669 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2670 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2675 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2677 f11
= c
[i
+ j
* c_dim1
];
2678 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2679 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2680 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2682 for (l
= ll
; l
<= i6
; ++l
)
2684 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2685 257] * b
[l
+ j
* b_dim1
];
2686 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2687 257] * b
[l
+ (j
+ 1) * b_dim1
];
2688 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2689 257] * b
[l
+ (j
+ 2) * b_dim1
];
2690 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2691 257] * b
[l
+ (j
+ 3) * b_dim1
];
2693 c
[i
+ j
* c_dim1
] = f11
;
2694 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2695 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2696 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2703 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2705 i5
= ii
+ uisec
- 1;
2706 for (i
= ii
; i
<= i5
; i
+= 4)
2708 f11
= c
[i
+ j
* c_dim1
];
2709 f21
= c
[i
+ 1 + j
* c_dim1
];
2710 f31
= c
[i
+ 2 + j
* c_dim1
];
2711 f41
= c
[i
+ 3 + j
* c_dim1
];
2713 for (l
= ll
; l
<= i6
; ++l
)
2715 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2716 257] * b
[l
+ j
* b_dim1
];
2717 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2718 257] * b
[l
+ j
* b_dim1
];
2719 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2720 257] * b
[l
+ j
* b_dim1
];
2721 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2722 257] * b
[l
+ j
* b_dim1
];
2724 c
[i
+ j
* c_dim1
] = f11
;
2725 c
[i
+ 1 + j
* c_dim1
] = f21
;
2726 c
[i
+ 2 + j
* c_dim1
] = f31
;
2727 c
[i
+ 3 + j
* c_dim1
] = f41
;
2730 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2732 f11
= c
[i
+ j
* c_dim1
];
2734 for (l
= ll
; l
<= i6
; ++l
)
2736 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2737 257] * b
[l
+ j
* b_dim1
];
2739 c
[i
+ j
* c_dim1
] = f11
;
2748 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2750 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2752 const GFC_COMPLEX_8
*restrict abase_x
;
2753 const GFC_COMPLEX_8
*restrict bbase_y
;
2754 GFC_COMPLEX_8
*restrict dest_y
;
2757 for (y
= 0; y
< ycount
; y
++)
2759 bbase_y
= &bbase
[y
*bystride
];
2760 dest_y
= &dest
[y
*rystride
];
2761 for (x
= 0; x
< xcount
; x
++)
2763 abase_x
= &abase
[x
*axstride
];
2764 s
= (GFC_COMPLEX_8
) 0;
2765 for (n
= 0; n
< count
; n
++)
2766 s
+= abase_x
[n
] * bbase_y
[n
];
2773 const GFC_COMPLEX_8
*restrict bbase_y
;
2776 for (y
= 0; y
< ycount
; y
++)
2778 bbase_y
= &bbase
[y
*bystride
];
2779 s
= (GFC_COMPLEX_8
) 0;
2780 for (n
= 0; n
< count
; n
++)
2781 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2782 dest
[y
*rystride
] = s
;
2786 else if (axstride
< aystride
)
2788 for (y
= 0; y
< ycount
; y
++)
2789 for (x
= 0; x
< xcount
; x
++)
2790 dest
[x
*rxstride
+ y
*rystride
] = (GFC_COMPLEX_8
)0;
2792 for (y
= 0; y
< ycount
; y
++)
2793 for (n
= 0; n
< count
; n
++)
2794 for (x
= 0; x
< xcount
; x
++)
2795 /* dest[x,y] += a[x,n] * b[n,y] */
2796 dest
[x
*rxstride
+ y
*rystride
] +=
2797 abase
[x
*axstride
+ n
*aystride
] *
2798 bbase
[n
*bxstride
+ y
*bystride
];
2800 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2802 const GFC_COMPLEX_8
*restrict bbase_y
;
2805 for (y
= 0; y
< ycount
; y
++)
2807 bbase_y
= &bbase
[y
*bystride
];
2808 s
= (GFC_COMPLEX_8
) 0;
2809 for (n
= 0; n
< count
; n
++)
2810 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2811 dest
[y
*rxstride
] = s
;
2816 const GFC_COMPLEX_8
*restrict abase_x
;
2817 const GFC_COMPLEX_8
*restrict bbase_y
;
2818 GFC_COMPLEX_8
*restrict dest_y
;
2821 for (y
= 0; y
< ycount
; y
++)
2823 bbase_y
= &bbase
[y
*bystride
];
2824 dest_y
= &dest
[y
*rystride
];
2825 for (x
= 0; x
< xcount
; x
++)
2827 abase_x
= &abase
[x
*axstride
];
2828 s
= (GFC_COMPLEX_8
) 0;
2829 for (n
= 0; n
< count
; n
++)
2830 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2831 dest_y
[x
*rxstride
] = s
;