]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmulavx128_c10.c
Update copyright years.
[thirdparty/gcc.git] / libgfortran / generated / matmulavx128_c10.c
CommitLineData
1d5cf7fc 1/* Implementation of the MATMUL intrinsic
8d9254fc 2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
1d5cf7fc
TK
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31/* These are the specific versions of matmul with -mprefer-avx128. */
32
33#if defined (HAVE_GFC_COMPLEX_10)
34
35/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36 passed to us by the front-end, in which case we call it for large
37 matrices. */
38
39typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40 const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
41 const int *, const GFC_COMPLEX_10 *, const int *,
42 const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
43 int, int);
44
45#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46void
47matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
48 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50internal_proto(matmul_c10_avx128_fma3);
51void
52matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
53 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
54 int blas_limit, blas_call gemm)
55{
56 const GFC_COMPLEX_10 * restrict abase;
57 const GFC_COMPLEX_10 * restrict bbase;
58 GFC_COMPLEX_10 * restrict dest;
59
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61 index_type x, y, n, count, xcount, ycount;
62
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
64 || GFC_DESCRIPTOR_RANK (b) == 2);
65
66/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68 Either A or B (but not both) can be rank 1:
69
70 o One-dimensional argument A is implicitly treated as a row matrix
71 dimensioned [1,count], so xcount=1.
72
73 o One-dimensional argument B is implicitly treated as a column matrix
74 dimensioned [count, 1], so ycount=1.
75*/
76
77 if (retarray->base_addr == NULL)
78 {
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
80 {
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83 }
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
85 {
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88 }
89 else
90 {
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
97 }
98
99 retarray->base_addr
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
101 retarray->offset = 0;
102 }
103 else if (unlikely (compile_options.bounds_check))
104 {
105 index_type ret_extent, arg_extent;
106
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 {
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111 if (arg_extent != ret_extent)
ed33417a
TK
112 runtime_error ("Array bound mismatch for dimension 1 of "
113 "array (%ld/%ld) ",
1d5cf7fc
TK
114 (long int) ret_extent, (long int) arg_extent);
115 }
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
117 {
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120 if (arg_extent != ret_extent)
ed33417a
TK
121 runtime_error ("Array bound mismatch for dimension 1 of "
122 "array (%ld/%ld) ",
1d5cf7fc
TK
123 (long int) ret_extent, (long int) arg_extent);
124 }
125 else
126 {
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129 if (arg_extent != ret_extent)
ed33417a
TK
130 runtime_error ("Array bound mismatch for dimension 1 of "
131 "array (%ld/%ld) ",
1d5cf7fc
TK
132 (long int) ret_extent, (long int) arg_extent);
133
134 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136 if (arg_extent != ret_extent)
ed33417a
TK
137 runtime_error ("Array bound mismatch for dimension 2 of "
138 "array (%ld/%ld) ",
1d5cf7fc
TK
139 (long int) ret_extent, (long int) arg_extent);
140 }
141 }
142
143
144 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145 {
146 /* One-dimensional result may be addressed in the code below
147 either as a row or a column matrix. We want both cases to
148 work. */
149 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150 }
151 else
152 {
153 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155 }
156
157
158 if (GFC_DESCRIPTOR_RANK (a) == 1)
159 {
160 /* Treat it as a a row matrix A[1,count]. */
161 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162 aystride = 1;
163
164 xcount = 1;
165 count = GFC_DESCRIPTOR_EXTENT(a,0);
166 }
167 else
168 {
169 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171
172 count = GFC_DESCRIPTOR_EXTENT(a,1);
173 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174 }
175
176 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177 {
178 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
ed33417a
TK
179 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180 "in dimension 1: is %ld, should be %ld",
181 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1d5cf7fc
TK
182 }
183
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
185 {
186 /* Treat it as a column matrix B[count,1] */
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189 /* bystride should never be used for 1-dimensional b.
190 The value is only used for calculation of the
191 memory by the buffer. */
192 bystride = 256;
193 ycount = 1;
194 }
195 else
196 {
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200 }
201
202 abase = a->base_addr;
203 bbase = b->base_addr;
204 dest = retarray->base_addr;
205
206 /* Now that everything is set up, we perform the multiplication
207 itself. */
208
209#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210#define min(a,b) ((a) <= (b) ? (a) : (b))
211#define max(a,b) ((a) >= (b) ? (a) : (b))
212
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214 && (bxstride == 1 || bystride == 1)
215 && (((float) xcount) * ((float) ycount) * ((float) count)
216 > POW3(blas_limit)))
217 {
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
219 const GFC_COMPLEX_10 one = 1, zero = 0;
220 const int lda = (axstride == 1) ? aystride : axstride,
221 ldb = (bxstride == 1) ? bystride : bxstride;
222
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224 {
225 assert (gemm != NULL);
ed33417a
TK
226 const char *transa, *transb;
227 if (try_blas & 2)
228 transa = "C";
229 else
230 transa = axstride == 1 ? "N" : "T";
231
232 if (try_blas & 4)
233 transb = "C";
234 else
235 transb = bxstride == 1 ? "N" : "T";
236
237 gemm (transa, transb , &m,
1d5cf7fc
TK
238 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
239 &ldc, 1, 1);
240 return;
241 }
242 }
243
244 if (rxstride == 1 && axstride == 1 && bxstride == 1)
245 {
246 /* This block of code implements a tuned matmul, derived from
247 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
248
249 Bo Kagstrom and Per Ling
250 Department of Computing Science
251 Umea University
252 S-901 87 Umea, Sweden
253
254 from netlib.org, translated to C, and modified for matmul.m4. */
255
256 const GFC_COMPLEX_10 *a, *b;
257 GFC_COMPLEX_10 *c;
258 const index_type m = xcount, n = ycount, k = count;
259
260 /* System generated locals */
261 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
262 i1, i2, i3, i4, i5, i6;
263
264 /* Local variables */
265 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
266 f13, f14, f23, f24, f33, f34, f43, f44;
267 index_type i, j, l, ii, jj, ll;
268 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
269 GFC_COMPLEX_10 *t1;
270
271 a = abase;
272 b = bbase;
273 c = retarray->base_addr;
274
275 /* Parameter adjustments */
276 c_dim1 = rystride;
277 c_offset = 1 + c_dim1;
278 c -= c_offset;
279 a_dim1 = aystride;
280 a_offset = 1 + a_dim1;
281 a -= a_offset;
282 b_dim1 = bystride;
283 b_offset = 1 + b_dim1;
284 b -= b_offset;
285
bbf97416
TK
286 /* Empty c first. */
287 for (j=1; j<=n; j++)
288 for (i=1; i<=m; i++)
289 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
290
1d5cf7fc
TK
291 /* Early exit if possible */
292 if (m == 0 || n == 0 || k == 0)
293 return;
294
295 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
296 index_type t1_dim, a_sz;
297 if (aystride == 1)
298 a_sz = rystride;
299 else
300 a_sz = a_dim1;
301
302 t1_dim = a_sz * 256 + b_dim1;
1d5cf7fc
TK
303 if (t1_dim > 65536)
304 t1_dim = 65536;
305
306 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
307
1d5cf7fc
TK
308 /* Start turning the crank. */
309 i1 = n;
310 for (jj = 1; jj <= i1; jj += 512)
311 {
312 /* Computing MIN */
313 i2 = 512;
314 i3 = n - jj + 1;
315 jsec = min(i2,i3);
316 ujsec = jsec - jsec % 4;
317 i2 = k;
318 for (ll = 1; ll <= i2; ll += 256)
319 {
320 /* Computing MIN */
321 i3 = 256;
322 i4 = k - ll + 1;
323 lsec = min(i3,i4);
324 ulsec = lsec - lsec % 2;
325
326 i3 = m;
327 for (ii = 1; ii <= i3; ii += 256)
328 {
329 /* Computing MIN */
330 i4 = 256;
331 i5 = m - ii + 1;
332 isec = min(i4,i5);
333 uisec = isec - isec % 2;
334 i4 = ll + ulsec - 1;
335 for (l = ll; l <= i4; l += 2)
336 {
337 i5 = ii + uisec - 1;
338 for (i = ii; i <= i5; i += 2)
339 {
340 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
341 a[i + l * a_dim1];
342 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
343 a[i + (l + 1) * a_dim1];
344 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
345 a[i + 1 + l * a_dim1];
346 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
347 a[i + 1 + (l + 1) * a_dim1];
348 }
349 if (uisec < isec)
350 {
351 t1[l - ll + 1 + (isec << 8) - 257] =
352 a[ii + isec - 1 + l * a_dim1];
353 t1[l - ll + 2 + (isec << 8) - 257] =
354 a[ii + isec - 1 + (l + 1) * a_dim1];
355 }
356 }
357 if (ulsec < lsec)
358 {
359 i4 = ii + isec - 1;
360 for (i = ii; i<= i4; ++i)
361 {
362 t1[lsec + ((i - ii + 1) << 8) - 257] =
363 a[i + (ll + lsec - 1) * a_dim1];
364 }
365 }
366
367 uisec = isec - isec % 4;
368 i4 = jj + ujsec - 1;
369 for (j = jj; j <= i4; j += 4)
370 {
371 i5 = ii + uisec - 1;
372 for (i = ii; i <= i5; i += 4)
373 {
374 f11 = c[i + j * c_dim1];
375 f21 = c[i + 1 + j * c_dim1];
376 f12 = c[i + (j + 1) * c_dim1];
377 f22 = c[i + 1 + (j + 1) * c_dim1];
378 f13 = c[i + (j + 2) * c_dim1];
379 f23 = c[i + 1 + (j + 2) * c_dim1];
380 f14 = c[i + (j + 3) * c_dim1];
381 f24 = c[i + 1 + (j + 3) * c_dim1];
382 f31 = c[i + 2 + j * c_dim1];
383 f41 = c[i + 3 + j * c_dim1];
384 f32 = c[i + 2 + (j + 1) * c_dim1];
385 f42 = c[i + 3 + (j + 1) * c_dim1];
386 f33 = c[i + 2 + (j + 2) * c_dim1];
387 f43 = c[i + 3 + (j + 2) * c_dim1];
388 f34 = c[i + 2 + (j + 3) * c_dim1];
389 f44 = c[i + 3 + (j + 3) * c_dim1];
390 i6 = ll + lsec - 1;
391 for (l = ll; l <= i6; ++l)
392 {
393 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
394 * b[l + j * b_dim1];
395 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
396 * b[l + j * b_dim1];
397 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
398 * b[l + (j + 1) * b_dim1];
399 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
400 * b[l + (j + 1) * b_dim1];
401 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
402 * b[l + (j + 2) * b_dim1];
403 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
404 * b[l + (j + 2) * b_dim1];
405 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
406 * b[l + (j + 3) * b_dim1];
407 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
408 * b[l + (j + 3) * b_dim1];
409 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
410 * b[l + j * b_dim1];
411 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
412 * b[l + j * b_dim1];
413 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
414 * b[l + (j + 1) * b_dim1];
415 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
416 * b[l + (j + 1) * b_dim1];
417 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
418 * b[l + (j + 2) * b_dim1];
419 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
420 * b[l + (j + 2) * b_dim1];
421 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
422 * b[l + (j + 3) * b_dim1];
423 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
424 * b[l + (j + 3) * b_dim1];
425 }
426 c[i + j * c_dim1] = f11;
427 c[i + 1 + j * c_dim1] = f21;
428 c[i + (j + 1) * c_dim1] = f12;
429 c[i + 1 + (j + 1) * c_dim1] = f22;
430 c[i + (j + 2) * c_dim1] = f13;
431 c[i + 1 + (j + 2) * c_dim1] = f23;
432 c[i + (j + 3) * c_dim1] = f14;
433 c[i + 1 + (j + 3) * c_dim1] = f24;
434 c[i + 2 + j * c_dim1] = f31;
435 c[i + 3 + j * c_dim1] = f41;
436 c[i + 2 + (j + 1) * c_dim1] = f32;
437 c[i + 3 + (j + 1) * c_dim1] = f42;
438 c[i + 2 + (j + 2) * c_dim1] = f33;
439 c[i + 3 + (j + 2) * c_dim1] = f43;
440 c[i + 2 + (j + 3) * c_dim1] = f34;
441 c[i + 3 + (j + 3) * c_dim1] = f44;
442 }
443 if (uisec < isec)
444 {
445 i5 = ii + isec - 1;
446 for (i = ii + uisec; i <= i5; ++i)
447 {
448 f11 = c[i + j * c_dim1];
449 f12 = c[i + (j + 1) * c_dim1];
450 f13 = c[i + (j + 2) * c_dim1];
451 f14 = c[i + (j + 3) * c_dim1];
452 i6 = ll + lsec - 1;
453 for (l = ll; l <= i6; ++l)
454 {
455 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
456 257] * b[l + j * b_dim1];
457 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
458 257] * b[l + (j + 1) * b_dim1];
459 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
460 257] * b[l + (j + 2) * b_dim1];
461 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
462 257] * b[l + (j + 3) * b_dim1];
463 }
464 c[i + j * c_dim1] = f11;
465 c[i + (j + 1) * c_dim1] = f12;
466 c[i + (j + 2) * c_dim1] = f13;
467 c[i + (j + 3) * c_dim1] = f14;
468 }
469 }
470 }
471 if (ujsec < jsec)
472 {
473 i4 = jj + jsec - 1;
474 for (j = jj + ujsec; j <= i4; ++j)
475 {
476 i5 = ii + uisec - 1;
477 for (i = ii; i <= i5; i += 4)
478 {
479 f11 = c[i + j * c_dim1];
480 f21 = c[i + 1 + j * c_dim1];
481 f31 = c[i + 2 + j * c_dim1];
482 f41 = c[i + 3 + j * c_dim1];
483 i6 = ll + lsec - 1;
484 for (l = ll; l <= i6; ++l)
485 {
486 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
487 257] * b[l + j * b_dim1];
488 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
489 257] * b[l + j * b_dim1];
490 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
491 257] * b[l + j * b_dim1];
492 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
493 257] * b[l + j * b_dim1];
494 }
495 c[i + j * c_dim1] = f11;
496 c[i + 1 + j * c_dim1] = f21;
497 c[i + 2 + j * c_dim1] = f31;
498 c[i + 3 + j * c_dim1] = f41;
499 }
500 i5 = ii + isec - 1;
501 for (i = ii + uisec; i <= i5; ++i)
502 {
503 f11 = c[i + j * c_dim1];
504 i6 = ll + lsec - 1;
505 for (l = ll; l <= i6; ++l)
506 {
507 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
508 257] * b[l + j * b_dim1];
509 }
510 c[i + j * c_dim1] = f11;
511 }
512 }
513 }
514 }
515 }
516 }
517 free(t1);
518 return;
519 }
520 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
521 {
522 if (GFC_DESCRIPTOR_RANK (a) != 1)
523 {
524 const GFC_COMPLEX_10 *restrict abase_x;
525 const GFC_COMPLEX_10 *restrict bbase_y;
526 GFC_COMPLEX_10 *restrict dest_y;
527 GFC_COMPLEX_10 s;
528
529 for (y = 0; y < ycount; y++)
530 {
531 bbase_y = &bbase[y*bystride];
532 dest_y = &dest[y*rystride];
533 for (x = 0; x < xcount; x++)
534 {
535 abase_x = &abase[x*axstride];
536 s = (GFC_COMPLEX_10) 0;
537 for (n = 0; n < count; n++)
538 s += abase_x[n] * bbase_y[n];
539 dest_y[x] = s;
540 }
541 }
542 }
543 else
544 {
545 const GFC_COMPLEX_10 *restrict bbase_y;
546 GFC_COMPLEX_10 s;
547
548 for (y = 0; y < ycount; y++)
549 {
550 bbase_y = &bbase[y*bystride];
551 s = (GFC_COMPLEX_10) 0;
552 for (n = 0; n < count; n++)
553 s += abase[n*axstride] * bbase_y[n];
554 dest[y*rystride] = s;
555 }
556 }
557 }
558 else if (axstride < aystride)
559 {
560 for (y = 0; y < ycount; y++)
561 for (x = 0; x < xcount; x++)
562 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
563
564 for (y = 0; y < ycount; y++)
565 for (n = 0; n < count; n++)
566 for (x = 0; x < xcount; x++)
567 /* dest[x,y] += a[x,n] * b[n,y] */
568 dest[x*rxstride + y*rystride] +=
569 abase[x*axstride + n*aystride] *
570 bbase[n*bxstride + y*bystride];
571 }
572 else if (GFC_DESCRIPTOR_RANK (a) == 1)
573 {
574 const GFC_COMPLEX_10 *restrict bbase_y;
575 GFC_COMPLEX_10 s;
576
577 for (y = 0; y < ycount; y++)
578 {
579 bbase_y = &bbase[y*bystride];
580 s = (GFC_COMPLEX_10) 0;
581 for (n = 0; n < count; n++)
582 s += abase[n*axstride] * bbase_y[n*bxstride];
583 dest[y*rxstride] = s;
584 }
585 }
586 else
587 {
588 const GFC_COMPLEX_10 *restrict abase_x;
589 const GFC_COMPLEX_10 *restrict bbase_y;
590 GFC_COMPLEX_10 *restrict dest_y;
591 GFC_COMPLEX_10 s;
592
593 for (y = 0; y < ycount; y++)
594 {
595 bbase_y = &bbase[y*bystride];
596 dest_y = &dest[y*rystride];
597 for (x = 0; x < xcount; x++)
598 {
599 abase_x = &abase[x*axstride];
600 s = (GFC_COMPLEX_10) 0;
601 for (n = 0; n < count; n++)
602 s += abase_x[n*aystride] * bbase_y[n*bxstride];
603 dest_y[x*rxstride] = s;
604 }
605 }
606 }
607}
608#undef POW3
609#undef min
610#undef max
611
612#endif
613
614#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
615void
616matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
617 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
618 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
619internal_proto(matmul_c10_avx128_fma4);
620void
621matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
622 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
623 int blas_limit, blas_call gemm)
624{
625 const GFC_COMPLEX_10 * restrict abase;
626 const GFC_COMPLEX_10 * restrict bbase;
627 GFC_COMPLEX_10 * restrict dest;
628
629 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
630 index_type x, y, n, count, xcount, ycount;
631
632 assert (GFC_DESCRIPTOR_RANK (a) == 2
633 || GFC_DESCRIPTOR_RANK (b) == 2);
634
635/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
636
637 Either A or B (but not both) can be rank 1:
638
639 o One-dimensional argument A is implicitly treated as a row matrix
640 dimensioned [1,count], so xcount=1.
641
642 o One-dimensional argument B is implicitly treated as a column matrix
643 dimensioned [count, 1], so ycount=1.
644*/
645
646 if (retarray->base_addr == NULL)
647 {
648 if (GFC_DESCRIPTOR_RANK (a) == 1)
649 {
650 GFC_DIMENSION_SET(retarray->dim[0], 0,
651 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
652 }
653 else if (GFC_DESCRIPTOR_RANK (b) == 1)
654 {
655 GFC_DIMENSION_SET(retarray->dim[0], 0,
656 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
657 }
658 else
659 {
660 GFC_DIMENSION_SET(retarray->dim[0], 0,
661 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
662
663 GFC_DIMENSION_SET(retarray->dim[1], 0,
664 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
665 GFC_DESCRIPTOR_EXTENT(retarray,0));
666 }
667
668 retarray->base_addr
669 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
670 retarray->offset = 0;
671 }
672 else if (unlikely (compile_options.bounds_check))
673 {
674 index_type ret_extent, arg_extent;
675
676 if (GFC_DESCRIPTOR_RANK (a) == 1)
677 {
678 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
679 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
680 if (arg_extent != ret_extent)
ed33417a
TK
681 runtime_error ("Array bound mismatch for dimension 1 of "
682 "array (%ld/%ld) ",
1d5cf7fc
TK
683 (long int) ret_extent, (long int) arg_extent);
684 }
685 else if (GFC_DESCRIPTOR_RANK (b) == 1)
686 {
687 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
ed33417a
TK
690 runtime_error ("Array bound mismatch for dimension 1 of "
691 "array (%ld/%ld) ",
1d5cf7fc
TK
692 (long int) ret_extent, (long int) arg_extent);
693 }
694 else
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
ed33417a
TK
699 runtime_error ("Array bound mismatch for dimension 1 of "
700 "array (%ld/%ld) ",
1d5cf7fc
TK
701 (long int) ret_extent, (long int) arg_extent);
702
703 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
705 if (arg_extent != ret_extent)
ed33417a
TK
706 runtime_error ("Array bound mismatch for dimension 2 of "
707 "array (%ld/%ld) ",
1d5cf7fc
TK
708 (long int) ret_extent, (long int) arg_extent);
709 }
710 }
711
712
713 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
714 {
715 /* One-dimensional result may be addressed in the code below
716 either as a row or a column matrix. We want both cases to
717 work. */
718 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
719 }
720 else
721 {
722 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
723 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
724 }
725
726
727 if (GFC_DESCRIPTOR_RANK (a) == 1)
728 {
729 /* Treat it as a a row matrix A[1,count]. */
730 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
731 aystride = 1;
732
733 xcount = 1;
734 count = GFC_DESCRIPTOR_EXTENT(a,0);
735 }
736 else
737 {
738 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
739 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
740
741 count = GFC_DESCRIPTOR_EXTENT(a,1);
742 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
743 }
744
745 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
746 {
747 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
ed33417a
TK
748 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
749 "in dimension 1: is %ld, should be %ld",
750 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
1d5cf7fc
TK
751 }
752
753 if (GFC_DESCRIPTOR_RANK (b) == 1)
754 {
755 /* Treat it as a column matrix B[count,1] */
756 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
757
758 /* bystride should never be used for 1-dimensional b.
759 The value is only used for calculation of the
760 memory by the buffer. */
761 bystride = 256;
762 ycount = 1;
763 }
764 else
765 {
766 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
767 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
768 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
769 }
770
771 abase = a->base_addr;
772 bbase = b->base_addr;
773 dest = retarray->base_addr;
774
775 /* Now that everything is set up, we perform the multiplication
776 itself. */
777
778#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
779#define min(a,b) ((a) <= (b) ? (a) : (b))
780#define max(a,b) ((a) >= (b) ? (a) : (b))
781
782 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
783 && (bxstride == 1 || bystride == 1)
784 && (((float) xcount) * ((float) ycount) * ((float) count)
785 > POW3(blas_limit)))
786 {
787 const int m = xcount, n = ycount, k = count, ldc = rystride;
788 const GFC_COMPLEX_10 one = 1, zero = 0;
789 const int lda = (axstride == 1) ? aystride : axstride,
790 ldb = (bxstride == 1) ? bystride : bxstride;
791
792 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
793 {
794 assert (gemm != NULL);
ed33417a
TK
795 const char *transa, *transb;
796 if (try_blas & 2)
797 transa = "C";
798 else
799 transa = axstride == 1 ? "N" : "T";
800
801 if (try_blas & 4)
802 transb = "C";
803 else
804 transb = bxstride == 1 ? "N" : "T";
805
806 gemm (transa, transb , &m,
1d5cf7fc
TK
807 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
808 &ldc, 1, 1);
809 return;
810 }
811 }
812
813 if (rxstride == 1 && axstride == 1 && bxstride == 1)
814 {
815 /* This block of code implements a tuned matmul, derived from
816 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
817
818 Bo Kagstrom and Per Ling
819 Department of Computing Science
820 Umea University
821 S-901 87 Umea, Sweden
822
823 from netlib.org, translated to C, and modified for matmul.m4. */
824
825 const GFC_COMPLEX_10 *a, *b;
826 GFC_COMPLEX_10 *c;
827 const index_type m = xcount, n = ycount, k = count;
828
829 /* System generated locals */
830 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
831 i1, i2, i3, i4, i5, i6;
832
833 /* Local variables */
834 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
835 f13, f14, f23, f24, f33, f34, f43, f44;
836 index_type i, j, l, ii, jj, ll;
837 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
838 GFC_COMPLEX_10 *t1;
839
840 a = abase;
841 b = bbase;
842 c = retarray->base_addr;
843
844 /* Parameter adjustments */
845 c_dim1 = rystride;
846 c_offset = 1 + c_dim1;
847 c -= c_offset;
848 a_dim1 = aystride;
849 a_offset = 1 + a_dim1;
850 a -= a_offset;
851 b_dim1 = bystride;
852 b_offset = 1 + b_dim1;
853 b -= b_offset;
854
bbf97416
TK
855 /* Empty c first. */
856 for (j=1; j<=n; j++)
857 for (i=1; i<=m; i++)
858 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
859
1d5cf7fc
TK
860 /* Early exit if possible */
861 if (m == 0 || n == 0 || k == 0)
862 return;
863
864 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
865 index_type t1_dim, a_sz;
866 if (aystride == 1)
867 a_sz = rystride;
868 else
869 a_sz = a_dim1;
870
871 t1_dim = a_sz * 256 + b_dim1;
1d5cf7fc
TK
872 if (t1_dim > 65536)
873 t1_dim = 65536;
874
875 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
876
1d5cf7fc
TK
877 /* Start turning the crank. */
878 i1 = n;
879 for (jj = 1; jj <= i1; jj += 512)
880 {
881 /* Computing MIN */
882 i2 = 512;
883 i3 = n - jj + 1;
884 jsec = min(i2,i3);
885 ujsec = jsec - jsec % 4;
886 i2 = k;
887 for (ll = 1; ll <= i2; ll += 256)
888 {
889 /* Computing MIN */
890 i3 = 256;
891 i4 = k - ll + 1;
892 lsec = min(i3,i4);
893 ulsec = lsec - lsec % 2;
894
895 i3 = m;
896 for (ii = 1; ii <= i3; ii += 256)
897 {
898 /* Computing MIN */
899 i4 = 256;
900 i5 = m - ii + 1;
901 isec = min(i4,i5);
902 uisec = isec - isec % 2;
903 i4 = ll + ulsec - 1;
904 for (l = ll; l <= i4; l += 2)
905 {
906 i5 = ii + uisec - 1;
907 for (i = ii; i <= i5; i += 2)
908 {
909 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
910 a[i + l * a_dim1];
911 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
912 a[i + (l + 1) * a_dim1];
913 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
914 a[i + 1 + l * a_dim1];
915 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
916 a[i + 1 + (l + 1) * a_dim1];
917 }
918 if (uisec < isec)
919 {
920 t1[l - ll + 1 + (isec << 8) - 257] =
921 a[ii + isec - 1 + l * a_dim1];
922 t1[l - ll + 2 + (isec << 8) - 257] =
923 a[ii + isec - 1 + (l + 1) * a_dim1];
924 }
925 }
926 if (ulsec < lsec)
927 {
928 i4 = ii + isec - 1;
929 for (i = ii; i<= i4; ++i)
930 {
931 t1[lsec + ((i - ii + 1) << 8) - 257] =
932 a[i + (ll + lsec - 1) * a_dim1];
933 }
934 }
935
936 uisec = isec - isec % 4;
937 i4 = jj + ujsec - 1;
938 for (j = jj; j <= i4; j += 4)
939 {
940 i5 = ii + uisec - 1;
941 for (i = ii; i <= i5; i += 4)
942 {
943 f11 = c[i + j * c_dim1];
944 f21 = c[i + 1 + j * c_dim1];
945 f12 = c[i + (j + 1) * c_dim1];
946 f22 = c[i + 1 + (j + 1) * c_dim1];
947 f13 = c[i + (j + 2) * c_dim1];
948 f23 = c[i + 1 + (j + 2) * c_dim1];
949 f14 = c[i + (j + 3) * c_dim1];
950 f24 = c[i + 1 + (j + 3) * c_dim1];
951 f31 = c[i + 2 + j * c_dim1];
952 f41 = c[i + 3 + j * c_dim1];
953 f32 = c[i + 2 + (j + 1) * c_dim1];
954 f42 = c[i + 3 + (j + 1) * c_dim1];
955 f33 = c[i + 2 + (j + 2) * c_dim1];
956 f43 = c[i + 3 + (j + 2) * c_dim1];
957 f34 = c[i + 2 + (j + 3) * c_dim1];
958 f44 = c[i + 3 + (j + 3) * c_dim1];
959 i6 = ll + lsec - 1;
960 for (l = ll; l <= i6; ++l)
961 {
962 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + j * b_dim1];
964 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + j * b_dim1];
966 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
967 * b[l + (j + 1) * b_dim1];
968 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
969 * b[l + (j + 1) * b_dim1];
970 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
971 * b[l + (j + 2) * b_dim1];
972 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
973 * b[l + (j + 2) * b_dim1];
974 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + (j + 3) * b_dim1];
976 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + (j + 3) * b_dim1];
978 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + j * b_dim1];
980 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + j * b_dim1];
982 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
983 * b[l + (j + 1) * b_dim1];
984 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
985 * b[l + (j + 1) * b_dim1];
986 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
987 * b[l + (j + 2) * b_dim1];
988 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
989 * b[l + (j + 2) * b_dim1];
990 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + (j + 3) * b_dim1];
992 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + (j + 3) * b_dim1];
994 }
995 c[i + j * c_dim1] = f11;
996 c[i + 1 + j * c_dim1] = f21;
997 c[i + (j + 1) * c_dim1] = f12;
998 c[i + 1 + (j + 1) * c_dim1] = f22;
999 c[i + (j + 2) * c_dim1] = f13;
1000 c[i + 1 + (j + 2) * c_dim1] = f23;
1001 c[i + (j + 3) * c_dim1] = f14;
1002 c[i + 1 + (j + 3) * c_dim1] = f24;
1003 c[i + 2 + j * c_dim1] = f31;
1004 c[i + 3 + j * c_dim1] = f41;
1005 c[i + 2 + (j + 1) * c_dim1] = f32;
1006 c[i + 3 + (j + 1) * c_dim1] = f42;
1007 c[i + 2 + (j + 2) * c_dim1] = f33;
1008 c[i + 3 + (j + 2) * c_dim1] = f43;
1009 c[i + 2 + (j + 3) * c_dim1] = f34;
1010 c[i + 3 + (j + 3) * c_dim1] = f44;
1011 }
1012 if (uisec < isec)
1013 {
1014 i5 = ii + isec - 1;
1015 for (i = ii + uisec; i <= i5; ++i)
1016 {
1017 f11 = c[i + j * c_dim1];
1018 f12 = c[i + (j + 1) * c_dim1];
1019 f13 = c[i + (j + 2) * c_dim1];
1020 f14 = c[i + (j + 3) * c_dim1];
1021 i6 = ll + lsec - 1;
1022 for (l = ll; l <= i6; ++l)
1023 {
1024 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1025 257] * b[l + j * b_dim1];
1026 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027 257] * b[l + (j + 1) * b_dim1];
1028 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029 257] * b[l + (j + 2) * b_dim1];
1030 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031 257] * b[l + (j + 3) * b_dim1];
1032 }
1033 c[i + j * c_dim1] = f11;
1034 c[i + (j + 1) * c_dim1] = f12;
1035 c[i + (j + 2) * c_dim1] = f13;
1036 c[i + (j + 3) * c_dim1] = f14;
1037 }
1038 }
1039 }
1040 if (ujsec < jsec)
1041 {
1042 i4 = jj + jsec - 1;
1043 for (j = jj + ujsec; j <= i4; ++j)
1044 {
1045 i5 = ii + uisec - 1;
1046 for (i = ii; i <= i5; i += 4)
1047 {
1048 f11 = c[i + j * c_dim1];
1049 f21 = c[i + 1 + j * c_dim1];
1050 f31 = c[i + 2 + j * c_dim1];
1051 f41 = c[i + 3 + j * c_dim1];
1052 i6 = ll + lsec - 1;
1053 for (l = ll; l <= i6; ++l)
1054 {
1055 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1056 257] * b[l + j * b_dim1];
1057 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1058 257] * b[l + j * b_dim1];
1059 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1060 257] * b[l + j * b_dim1];
1061 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1062 257] * b[l + j * b_dim1];
1063 }
1064 c[i + j * c_dim1] = f11;
1065 c[i + 1 + j * c_dim1] = f21;
1066 c[i + 2 + j * c_dim1] = f31;
1067 c[i + 3 + j * c_dim1] = f41;
1068 }
1069 i5 = ii + isec - 1;
1070 for (i = ii + uisec; i <= i5; ++i)
1071 {
1072 f11 = c[i + j * c_dim1];
1073 i6 = ll + lsec - 1;
1074 for (l = ll; l <= i6; ++l)
1075 {
1076 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1077 257] * b[l + j * b_dim1];
1078 }
1079 c[i + j * c_dim1] = f11;
1080 }
1081 }
1082 }
1083 }
1084 }
1085 }
1086 free(t1);
1087 return;
1088 }
1089 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1090 {
1091 if (GFC_DESCRIPTOR_RANK (a) != 1)
1092 {
1093 const GFC_COMPLEX_10 *restrict abase_x;
1094 const GFC_COMPLEX_10 *restrict bbase_y;
1095 GFC_COMPLEX_10 *restrict dest_y;
1096 GFC_COMPLEX_10 s;
1097
1098 for (y = 0; y < ycount; y++)
1099 {
1100 bbase_y = &bbase[y*bystride];
1101 dest_y = &dest[y*rystride];
1102 for (x = 0; x < xcount; x++)
1103 {
1104 abase_x = &abase[x*axstride];
1105 s = (GFC_COMPLEX_10) 0;
1106 for (n = 0; n < count; n++)
1107 s += abase_x[n] * bbase_y[n];
1108 dest_y[x] = s;
1109 }
1110 }
1111 }
1112 else
1113 {
1114 const GFC_COMPLEX_10 *restrict bbase_y;
1115 GFC_COMPLEX_10 s;
1116
1117 for (y = 0; y < ycount; y++)
1118 {
1119 bbase_y = &bbase[y*bystride];
1120 s = (GFC_COMPLEX_10) 0;
1121 for (n = 0; n < count; n++)
1122 s += abase[n*axstride] * bbase_y[n];
1123 dest[y*rystride] = s;
1124 }
1125 }
1126 }
1127 else if (axstride < aystride)
1128 {
1129 for (y = 0; y < ycount; y++)
1130 for (x = 0; x < xcount; x++)
1131 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1132
1133 for (y = 0; y < ycount; y++)
1134 for (n = 0; n < count; n++)
1135 for (x = 0; x < xcount; x++)
1136 /* dest[x,y] += a[x,n] * b[n,y] */
1137 dest[x*rxstride + y*rystride] +=
1138 abase[x*axstride + n*aystride] *
1139 bbase[n*bxstride + y*bystride];
1140 }
1141 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1142 {
1143 const GFC_COMPLEX_10 *restrict bbase_y;
1144 GFC_COMPLEX_10 s;
1145
1146 for (y = 0; y < ycount; y++)
1147 {
1148 bbase_y = &bbase[y*bystride];
1149 s = (GFC_COMPLEX_10) 0;
1150 for (n = 0; n < count; n++)
1151 s += abase[n*axstride] * bbase_y[n*bxstride];
1152 dest[y*rxstride] = s;
1153 }
1154 }
1155 else
1156 {
1157 const GFC_COMPLEX_10 *restrict abase_x;
1158 const GFC_COMPLEX_10 *restrict bbase_y;
1159 GFC_COMPLEX_10 *restrict dest_y;
1160 GFC_COMPLEX_10 s;
1161
1162 for (y = 0; y < ycount; y++)
1163 {
1164 bbase_y = &bbase[y*bystride];
1165 dest_y = &dest[y*rystride];
1166 for (x = 0; x < xcount; x++)
1167 {
1168 abase_x = &abase[x*axstride];
1169 s = (GFC_COMPLEX_10) 0;
1170 for (n = 0; n < count; n++)
1171 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1172 dest_y[x*rxstride] = s;
1173 }
1174 }
1175 }
1176}
1177#undef POW3
1178#undef min
1179#undef max
1180
1181#endif
1182
1183#endif
1184