]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmulavx128_i2.c
Daily bump.
[thirdparty/gcc.git] / libgfortran / generated / matmulavx128_i2.c
CommitLineData
1d5cf7fc 1/* Implementation of the MATMUL intrinsic
85ec4feb 2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
1d5cf7fc
TK
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5This file is part of the GNU Fortran runtime library (libgfortran).
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
10version 3 of the License, or (at your option) any later version.
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
25
26#include "libgfortran.h"
27#include <string.h>
28#include <assert.h>
29
30
31/* These are the specific versions of matmul with -mprefer-avx128. */
32
33#if defined (HAVE_GFC_INTEGER_2)
34
35/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36 passed to us by the front-end, in which case we call it for large
37 matrices. */
38
39typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40 const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
41 const int *, const GFC_INTEGER_2 *, const int *,
42 const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
43 int, int);
44
45#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46void
47matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
48 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50internal_proto(matmul_i2_avx128_fma3);
51void
52matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
53 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
54 int blas_limit, blas_call gemm)
55{
56 const GFC_INTEGER_2 * restrict abase;
57 const GFC_INTEGER_2 * restrict bbase;
58 GFC_INTEGER_2 * restrict dest;
59
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61 index_type x, y, n, count, xcount, ycount;
62
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
64 || GFC_DESCRIPTOR_RANK (b) == 2);
65
66/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68 Either A or B (but not both) can be rank 1:
69
70 o One-dimensional argument A is implicitly treated as a row matrix
71 dimensioned [1,count], so xcount=1.
72
73 o One-dimensional argument B is implicitly treated as a column matrix
74 dimensioned [count, 1], so ycount=1.
75*/
76
77 if (retarray->base_addr == NULL)
78 {
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
80 {
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83 }
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
85 {
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88 }
89 else
90 {
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
97 }
98
99 retarray->base_addr
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
101 retarray->offset = 0;
102 }
103 else if (unlikely (compile_options.bounds_check))
104 {
105 index_type ret_extent, arg_extent;
106
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 {
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111 if (arg_extent != ret_extent)
112 runtime_error ("Incorrect extent in return array in"
113 " MATMUL intrinsic: is %ld, should be %ld",
114 (long int) ret_extent, (long int) arg_extent);
115 }
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
117 {
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120 if (arg_extent != ret_extent)
121 runtime_error ("Incorrect extent in return array in"
122 " MATMUL intrinsic: is %ld, should be %ld",
123 (long int) ret_extent, (long int) arg_extent);
124 }
125 else
126 {
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129 if (arg_extent != ret_extent)
130 runtime_error ("Incorrect extent in return array in"
131 " MATMUL intrinsic for dimension 1:"
132 " is %ld, should be %ld",
133 (long int) ret_extent, (long int) arg_extent);
134
135 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
136 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
137 if (arg_extent != ret_extent)
138 runtime_error ("Incorrect extent in return array in"
139 " MATMUL intrinsic for dimension 2:"
140 " is %ld, should be %ld",
141 (long int) ret_extent, (long int) arg_extent);
142 }
143 }
144
145
146 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
147 {
148 /* One-dimensional result may be addressed in the code below
149 either as a row or a column matrix. We want both cases to
150 work. */
151 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
152 }
153 else
154 {
155 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
156 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
157 }
158
159
160 if (GFC_DESCRIPTOR_RANK (a) == 1)
161 {
162 /* Treat it as a a row matrix A[1,count]. */
163 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
164 aystride = 1;
165
166 xcount = 1;
167 count = GFC_DESCRIPTOR_EXTENT(a,0);
168 }
169 else
170 {
171 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
172 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
173
174 count = GFC_DESCRIPTOR_EXTENT(a,1);
175 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
176 }
177
178 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
179 {
180 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
181 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
182 }
183
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
185 {
186 /* Treat it as a column matrix B[count,1] */
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189 /* bystride should never be used for 1-dimensional b.
190 The value is only used for calculation of the
191 memory by the buffer. */
192 bystride = 256;
193 ycount = 1;
194 }
195 else
196 {
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200 }
201
202 abase = a->base_addr;
203 bbase = b->base_addr;
204 dest = retarray->base_addr;
205
206 /* Now that everything is set up, we perform the multiplication
207 itself. */
208
209#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210#define min(a,b) ((a) <= (b) ? (a) : (b))
211#define max(a,b) ((a) >= (b) ? (a) : (b))
212
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214 && (bxstride == 1 || bystride == 1)
215 && (((float) xcount) * ((float) ycount) * ((float) count)
216 > POW3(blas_limit)))
217 {
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
219 const GFC_INTEGER_2 one = 1, zero = 0;
220 const int lda = (axstride == 1) ? aystride : axstride,
221 ldb = (bxstride == 1) ? bystride : bxstride;
222
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224 {
225 assert (gemm != NULL);
226 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
227 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
228 &ldc, 1, 1);
229 return;
230 }
231 }
232
233 if (rxstride == 1 && axstride == 1 && bxstride == 1)
234 {
235 /* This block of code implements a tuned matmul, derived from
236 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
237
238 Bo Kagstrom and Per Ling
239 Department of Computing Science
240 Umea University
241 S-901 87 Umea, Sweden
242
243 from netlib.org, translated to C, and modified for matmul.m4. */
244
245 const GFC_INTEGER_2 *a, *b;
246 GFC_INTEGER_2 *c;
247 const index_type m = xcount, n = ycount, k = count;
248
249 /* System generated locals */
250 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
251 i1, i2, i3, i4, i5, i6;
252
253 /* Local variables */
254 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
255 f13, f14, f23, f24, f33, f34, f43, f44;
256 index_type i, j, l, ii, jj, ll;
257 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
258 GFC_INTEGER_2 *t1;
259
260 a = abase;
261 b = bbase;
262 c = retarray->base_addr;
263
264 /* Parameter adjustments */
265 c_dim1 = rystride;
266 c_offset = 1 + c_dim1;
267 c -= c_offset;
268 a_dim1 = aystride;
269 a_offset = 1 + a_dim1;
270 a -= a_offset;
271 b_dim1 = bystride;
272 b_offset = 1 + b_dim1;
273 b -= b_offset;
274
bbf97416
TK
275 /* Empty c first. */
276 for (j=1; j<=n; j++)
277 for (i=1; i<=m; i++)
278 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
279
1d5cf7fc
TK
280 /* Early exit if possible */
281 if (m == 0 || n == 0 || k == 0)
282 return;
283
284 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
285 index_type t1_dim, a_sz;
286 if (aystride == 1)
287 a_sz = rystride;
288 else
289 a_sz = a_dim1;
290
291 t1_dim = a_sz * 256 + b_dim1;
1d5cf7fc
TK
292 if (t1_dim > 65536)
293 t1_dim = 65536;
294
295 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
296
1d5cf7fc
TK
297 /* Start turning the crank. */
298 i1 = n;
299 for (jj = 1; jj <= i1; jj += 512)
300 {
301 /* Computing MIN */
302 i2 = 512;
303 i3 = n - jj + 1;
304 jsec = min(i2,i3);
305 ujsec = jsec - jsec % 4;
306 i2 = k;
307 for (ll = 1; ll <= i2; ll += 256)
308 {
309 /* Computing MIN */
310 i3 = 256;
311 i4 = k - ll + 1;
312 lsec = min(i3,i4);
313 ulsec = lsec - lsec % 2;
314
315 i3 = m;
316 for (ii = 1; ii <= i3; ii += 256)
317 {
318 /* Computing MIN */
319 i4 = 256;
320 i5 = m - ii + 1;
321 isec = min(i4,i5);
322 uisec = isec - isec % 2;
323 i4 = ll + ulsec - 1;
324 for (l = ll; l <= i4; l += 2)
325 {
326 i5 = ii + uisec - 1;
327 for (i = ii; i <= i5; i += 2)
328 {
329 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
330 a[i + l * a_dim1];
331 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
332 a[i + (l + 1) * a_dim1];
333 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
334 a[i + 1 + l * a_dim1];
335 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
336 a[i + 1 + (l + 1) * a_dim1];
337 }
338 if (uisec < isec)
339 {
340 t1[l - ll + 1 + (isec << 8) - 257] =
341 a[ii + isec - 1 + l * a_dim1];
342 t1[l - ll + 2 + (isec << 8) - 257] =
343 a[ii + isec - 1 + (l + 1) * a_dim1];
344 }
345 }
346 if (ulsec < lsec)
347 {
348 i4 = ii + isec - 1;
349 for (i = ii; i<= i4; ++i)
350 {
351 t1[lsec + ((i - ii + 1) << 8) - 257] =
352 a[i + (ll + lsec - 1) * a_dim1];
353 }
354 }
355
356 uisec = isec - isec % 4;
357 i4 = jj + ujsec - 1;
358 for (j = jj; j <= i4; j += 4)
359 {
360 i5 = ii + uisec - 1;
361 for (i = ii; i <= i5; i += 4)
362 {
363 f11 = c[i + j * c_dim1];
364 f21 = c[i + 1 + j * c_dim1];
365 f12 = c[i + (j + 1) * c_dim1];
366 f22 = c[i + 1 + (j + 1) * c_dim1];
367 f13 = c[i + (j + 2) * c_dim1];
368 f23 = c[i + 1 + (j + 2) * c_dim1];
369 f14 = c[i + (j + 3) * c_dim1];
370 f24 = c[i + 1 + (j + 3) * c_dim1];
371 f31 = c[i + 2 + j * c_dim1];
372 f41 = c[i + 3 + j * c_dim1];
373 f32 = c[i + 2 + (j + 1) * c_dim1];
374 f42 = c[i + 3 + (j + 1) * c_dim1];
375 f33 = c[i + 2 + (j + 2) * c_dim1];
376 f43 = c[i + 3 + (j + 2) * c_dim1];
377 f34 = c[i + 2 + (j + 3) * c_dim1];
378 f44 = c[i + 3 + (j + 3) * c_dim1];
379 i6 = ll + lsec - 1;
380 for (l = ll; l <= i6; ++l)
381 {
382 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
383 * b[l + j * b_dim1];
384 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
385 * b[l + j * b_dim1];
386 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
387 * b[l + (j + 1) * b_dim1];
388 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
389 * b[l + (j + 1) * b_dim1];
390 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
391 * b[l + (j + 2) * b_dim1];
392 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
393 * b[l + (j + 2) * b_dim1];
394 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395 * b[l + (j + 3) * b_dim1];
396 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397 * b[l + (j + 3) * b_dim1];
398 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
399 * b[l + j * b_dim1];
400 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
401 * b[l + j * b_dim1];
402 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
403 * b[l + (j + 1) * b_dim1];
404 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
405 * b[l + (j + 1) * b_dim1];
406 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
407 * b[l + (j + 2) * b_dim1];
408 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
409 * b[l + (j + 2) * b_dim1];
410 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411 * b[l + (j + 3) * b_dim1];
412 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413 * b[l + (j + 3) * b_dim1];
414 }
415 c[i + j * c_dim1] = f11;
416 c[i + 1 + j * c_dim1] = f21;
417 c[i + (j + 1) * c_dim1] = f12;
418 c[i + 1 + (j + 1) * c_dim1] = f22;
419 c[i + (j + 2) * c_dim1] = f13;
420 c[i + 1 + (j + 2) * c_dim1] = f23;
421 c[i + (j + 3) * c_dim1] = f14;
422 c[i + 1 + (j + 3) * c_dim1] = f24;
423 c[i + 2 + j * c_dim1] = f31;
424 c[i + 3 + j * c_dim1] = f41;
425 c[i + 2 + (j + 1) * c_dim1] = f32;
426 c[i + 3 + (j + 1) * c_dim1] = f42;
427 c[i + 2 + (j + 2) * c_dim1] = f33;
428 c[i + 3 + (j + 2) * c_dim1] = f43;
429 c[i + 2 + (j + 3) * c_dim1] = f34;
430 c[i + 3 + (j + 3) * c_dim1] = f44;
431 }
432 if (uisec < isec)
433 {
434 i5 = ii + isec - 1;
435 for (i = ii + uisec; i <= i5; ++i)
436 {
437 f11 = c[i + j * c_dim1];
438 f12 = c[i + (j + 1) * c_dim1];
439 f13 = c[i + (j + 2) * c_dim1];
440 f14 = c[i + (j + 3) * c_dim1];
441 i6 = ll + lsec - 1;
442 for (l = ll; l <= i6; ++l)
443 {
444 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
445 257] * b[l + j * b_dim1];
446 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
447 257] * b[l + (j + 1) * b_dim1];
448 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
449 257] * b[l + (j + 2) * b_dim1];
450 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
451 257] * b[l + (j + 3) * b_dim1];
452 }
453 c[i + j * c_dim1] = f11;
454 c[i + (j + 1) * c_dim1] = f12;
455 c[i + (j + 2) * c_dim1] = f13;
456 c[i + (j + 3) * c_dim1] = f14;
457 }
458 }
459 }
460 if (ujsec < jsec)
461 {
462 i4 = jj + jsec - 1;
463 for (j = jj + ujsec; j <= i4; ++j)
464 {
465 i5 = ii + uisec - 1;
466 for (i = ii; i <= i5; i += 4)
467 {
468 f11 = c[i + j * c_dim1];
469 f21 = c[i + 1 + j * c_dim1];
470 f31 = c[i + 2 + j * c_dim1];
471 f41 = c[i + 3 + j * c_dim1];
472 i6 = ll + lsec - 1;
473 for (l = ll; l <= i6; ++l)
474 {
475 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + j * b_dim1];
477 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
478 257] * b[l + j * b_dim1];
479 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
480 257] * b[l + j * b_dim1];
481 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
482 257] * b[l + j * b_dim1];
483 }
484 c[i + j * c_dim1] = f11;
485 c[i + 1 + j * c_dim1] = f21;
486 c[i + 2 + j * c_dim1] = f31;
487 c[i + 3 + j * c_dim1] = f41;
488 }
489 i5 = ii + isec - 1;
490 for (i = ii + uisec; i <= i5; ++i)
491 {
492 f11 = c[i + j * c_dim1];
493 i6 = ll + lsec - 1;
494 for (l = ll; l <= i6; ++l)
495 {
496 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
497 257] * b[l + j * b_dim1];
498 }
499 c[i + j * c_dim1] = f11;
500 }
501 }
502 }
503 }
504 }
505 }
506 free(t1);
507 return;
508 }
509 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
510 {
511 if (GFC_DESCRIPTOR_RANK (a) != 1)
512 {
513 const GFC_INTEGER_2 *restrict abase_x;
514 const GFC_INTEGER_2 *restrict bbase_y;
515 GFC_INTEGER_2 *restrict dest_y;
516 GFC_INTEGER_2 s;
517
518 for (y = 0; y < ycount; y++)
519 {
520 bbase_y = &bbase[y*bystride];
521 dest_y = &dest[y*rystride];
522 for (x = 0; x < xcount; x++)
523 {
524 abase_x = &abase[x*axstride];
525 s = (GFC_INTEGER_2) 0;
526 for (n = 0; n < count; n++)
527 s += abase_x[n] * bbase_y[n];
528 dest_y[x] = s;
529 }
530 }
531 }
532 else
533 {
534 const GFC_INTEGER_2 *restrict bbase_y;
535 GFC_INTEGER_2 s;
536
537 for (y = 0; y < ycount; y++)
538 {
539 bbase_y = &bbase[y*bystride];
540 s = (GFC_INTEGER_2) 0;
541 for (n = 0; n < count; n++)
542 s += abase[n*axstride] * bbase_y[n];
543 dest[y*rystride] = s;
544 }
545 }
546 }
547 else if (axstride < aystride)
548 {
549 for (y = 0; y < ycount; y++)
550 for (x = 0; x < xcount; x++)
551 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
552
553 for (y = 0; y < ycount; y++)
554 for (n = 0; n < count; n++)
555 for (x = 0; x < xcount; x++)
556 /* dest[x,y] += a[x,n] * b[n,y] */
557 dest[x*rxstride + y*rystride] +=
558 abase[x*axstride + n*aystride] *
559 bbase[n*bxstride + y*bystride];
560 }
561 else if (GFC_DESCRIPTOR_RANK (a) == 1)
562 {
563 const GFC_INTEGER_2 *restrict bbase_y;
564 GFC_INTEGER_2 s;
565
566 for (y = 0; y < ycount; y++)
567 {
568 bbase_y = &bbase[y*bystride];
569 s = (GFC_INTEGER_2) 0;
570 for (n = 0; n < count; n++)
571 s += abase[n*axstride] * bbase_y[n*bxstride];
572 dest[y*rxstride] = s;
573 }
574 }
575 else
576 {
577 const GFC_INTEGER_2 *restrict abase_x;
578 const GFC_INTEGER_2 *restrict bbase_y;
579 GFC_INTEGER_2 *restrict dest_y;
580 GFC_INTEGER_2 s;
581
582 for (y = 0; y < ycount; y++)
583 {
584 bbase_y = &bbase[y*bystride];
585 dest_y = &dest[y*rystride];
586 for (x = 0; x < xcount; x++)
587 {
588 abase_x = &abase[x*axstride];
589 s = (GFC_INTEGER_2) 0;
590 for (n = 0; n < count; n++)
591 s += abase_x[n*aystride] * bbase_y[n*bxstride];
592 dest_y[x*rxstride] = s;
593 }
594 }
595 }
596}
597#undef POW3
598#undef min
599#undef max
600
601#endif
602
603#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
604void
605matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
606 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
607 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
608internal_proto(matmul_i2_avx128_fma4);
609void
610matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
611 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
612 int blas_limit, blas_call gemm)
613{
614 const GFC_INTEGER_2 * restrict abase;
615 const GFC_INTEGER_2 * restrict bbase;
616 GFC_INTEGER_2 * restrict dest;
617
618 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
619 index_type x, y, n, count, xcount, ycount;
620
621 assert (GFC_DESCRIPTOR_RANK (a) == 2
622 || GFC_DESCRIPTOR_RANK (b) == 2);
623
624/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
625
626 Either A or B (but not both) can be rank 1:
627
628 o One-dimensional argument A is implicitly treated as a row matrix
629 dimensioned [1,count], so xcount=1.
630
631 o One-dimensional argument B is implicitly treated as a column matrix
632 dimensioned [count, 1], so ycount=1.
633*/
634
635 if (retarray->base_addr == NULL)
636 {
637 if (GFC_DESCRIPTOR_RANK (a) == 1)
638 {
639 GFC_DIMENSION_SET(retarray->dim[0], 0,
640 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
641 }
642 else if (GFC_DESCRIPTOR_RANK (b) == 1)
643 {
644 GFC_DIMENSION_SET(retarray->dim[0], 0,
645 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
646 }
647 else
648 {
649 GFC_DIMENSION_SET(retarray->dim[0], 0,
650 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
651
652 GFC_DIMENSION_SET(retarray->dim[1], 0,
653 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
654 GFC_DESCRIPTOR_EXTENT(retarray,0));
655 }
656
657 retarray->base_addr
658 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
659 retarray->offset = 0;
660 }
661 else if (unlikely (compile_options.bounds_check))
662 {
663 index_type ret_extent, arg_extent;
664
665 if (GFC_DESCRIPTOR_RANK (a) == 1)
666 {
667 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
668 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
669 if (arg_extent != ret_extent)
670 runtime_error ("Incorrect extent in return array in"
671 " MATMUL intrinsic: is %ld, should be %ld",
672 (long int) ret_extent, (long int) arg_extent);
673 }
674 else if (GFC_DESCRIPTOR_RANK (b) == 1)
675 {
676 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
677 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
678 if (arg_extent != ret_extent)
679 runtime_error ("Incorrect extent in return array in"
680 " MATMUL intrinsic: is %ld, should be %ld",
681 (long int) ret_extent, (long int) arg_extent);
682 }
683 else
684 {
685 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
686 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
687 if (arg_extent != ret_extent)
688 runtime_error ("Incorrect extent in return array in"
689 " MATMUL intrinsic for dimension 1:"
690 " is %ld, should be %ld",
691 (long int) ret_extent, (long int) arg_extent);
692
693 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
694 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
695 if (arg_extent != ret_extent)
696 runtime_error ("Incorrect extent in return array in"
697 " MATMUL intrinsic for dimension 2:"
698 " is %ld, should be %ld",
699 (long int) ret_extent, (long int) arg_extent);
700 }
701 }
702
703
704 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
705 {
706 /* One-dimensional result may be addressed in the code below
707 either as a row or a column matrix. We want both cases to
708 work. */
709 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
710 }
711 else
712 {
713 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
714 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
715 }
716
717
718 if (GFC_DESCRIPTOR_RANK (a) == 1)
719 {
720 /* Treat it as a a row matrix A[1,count]. */
721 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
722 aystride = 1;
723
724 xcount = 1;
725 count = GFC_DESCRIPTOR_EXTENT(a,0);
726 }
727 else
728 {
729 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
730 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
731
732 count = GFC_DESCRIPTOR_EXTENT(a,1);
733 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
734 }
735
736 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
737 {
738 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
739 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
740 }
741
742 if (GFC_DESCRIPTOR_RANK (b) == 1)
743 {
744 /* Treat it as a column matrix B[count,1] */
745 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
746
747 /* bystride should never be used for 1-dimensional b.
748 The value is only used for calculation of the
749 memory by the buffer. */
750 bystride = 256;
751 ycount = 1;
752 }
753 else
754 {
755 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
756 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
757 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
758 }
759
760 abase = a->base_addr;
761 bbase = b->base_addr;
762 dest = retarray->base_addr;
763
764 /* Now that everything is set up, we perform the multiplication
765 itself. */
766
767#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
768#define min(a,b) ((a) <= (b) ? (a) : (b))
769#define max(a,b) ((a) >= (b) ? (a) : (b))
770
771 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
772 && (bxstride == 1 || bystride == 1)
773 && (((float) xcount) * ((float) ycount) * ((float) count)
774 > POW3(blas_limit)))
775 {
776 const int m = xcount, n = ycount, k = count, ldc = rystride;
777 const GFC_INTEGER_2 one = 1, zero = 0;
778 const int lda = (axstride == 1) ? aystride : axstride,
779 ldb = (bxstride == 1) ? bystride : bxstride;
780
781 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
782 {
783 assert (gemm != NULL);
784 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
785 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
786 &ldc, 1, 1);
787 return;
788 }
789 }
790
791 if (rxstride == 1 && axstride == 1 && bxstride == 1)
792 {
793 /* This block of code implements a tuned matmul, derived from
794 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
795
796 Bo Kagstrom and Per Ling
797 Department of Computing Science
798 Umea University
799 S-901 87 Umea, Sweden
800
801 from netlib.org, translated to C, and modified for matmul.m4. */
802
803 const GFC_INTEGER_2 *a, *b;
804 GFC_INTEGER_2 *c;
805 const index_type m = xcount, n = ycount, k = count;
806
807 /* System generated locals */
808 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
809 i1, i2, i3, i4, i5, i6;
810
811 /* Local variables */
812 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
813 f13, f14, f23, f24, f33, f34, f43, f44;
814 index_type i, j, l, ii, jj, ll;
815 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
816 GFC_INTEGER_2 *t1;
817
818 a = abase;
819 b = bbase;
820 c = retarray->base_addr;
821
822 /* Parameter adjustments */
823 c_dim1 = rystride;
824 c_offset = 1 + c_dim1;
825 c -= c_offset;
826 a_dim1 = aystride;
827 a_offset = 1 + a_dim1;
828 a -= a_offset;
829 b_dim1 = bystride;
830 b_offset = 1 + b_dim1;
831 b -= b_offset;
832
bbf97416
TK
833 /* Empty c first. */
834 for (j=1; j<=n; j++)
835 for (i=1; i<=m; i++)
836 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
837
1d5cf7fc
TK
838 /* Early exit if possible */
839 if (m == 0 || n == 0 || k == 0)
840 return;
841
842 /* Adjust size of t1 to what is needed. */
4f4fabd7
TK
843 index_type t1_dim, a_sz;
844 if (aystride == 1)
845 a_sz = rystride;
846 else
847 a_sz = a_dim1;
848
849 t1_dim = a_sz * 256 + b_dim1;
1d5cf7fc
TK
850 if (t1_dim > 65536)
851 t1_dim = 65536;
852
853 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
854
1d5cf7fc
TK
855 /* Start turning the crank. */
856 i1 = n;
857 for (jj = 1; jj <= i1; jj += 512)
858 {
859 /* Computing MIN */
860 i2 = 512;
861 i3 = n - jj + 1;
862 jsec = min(i2,i3);
863 ujsec = jsec - jsec % 4;
864 i2 = k;
865 for (ll = 1; ll <= i2; ll += 256)
866 {
867 /* Computing MIN */
868 i3 = 256;
869 i4 = k - ll + 1;
870 lsec = min(i3,i4);
871 ulsec = lsec - lsec % 2;
872
873 i3 = m;
874 for (ii = 1; ii <= i3; ii += 256)
875 {
876 /* Computing MIN */
877 i4 = 256;
878 i5 = m - ii + 1;
879 isec = min(i4,i5);
880 uisec = isec - isec % 2;
881 i4 = ll + ulsec - 1;
882 for (l = ll; l <= i4; l += 2)
883 {
884 i5 = ii + uisec - 1;
885 for (i = ii; i <= i5; i += 2)
886 {
887 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
888 a[i + l * a_dim1];
889 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
890 a[i + (l + 1) * a_dim1];
891 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
892 a[i + 1 + l * a_dim1];
893 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
894 a[i + 1 + (l + 1) * a_dim1];
895 }
896 if (uisec < isec)
897 {
898 t1[l - ll + 1 + (isec << 8) - 257] =
899 a[ii + isec - 1 + l * a_dim1];
900 t1[l - ll + 2 + (isec << 8) - 257] =
901 a[ii + isec - 1 + (l + 1) * a_dim1];
902 }
903 }
904 if (ulsec < lsec)
905 {
906 i4 = ii + isec - 1;
907 for (i = ii; i<= i4; ++i)
908 {
909 t1[lsec + ((i - ii + 1) << 8) - 257] =
910 a[i + (ll + lsec - 1) * a_dim1];
911 }
912 }
913
914 uisec = isec - isec % 4;
915 i4 = jj + ujsec - 1;
916 for (j = jj; j <= i4; j += 4)
917 {
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 4)
920 {
921 f11 = c[i + j * c_dim1];
922 f21 = c[i + 1 + j * c_dim1];
923 f12 = c[i + (j + 1) * c_dim1];
924 f22 = c[i + 1 + (j + 1) * c_dim1];
925 f13 = c[i + (j + 2) * c_dim1];
926 f23 = c[i + 1 + (j + 2) * c_dim1];
927 f14 = c[i + (j + 3) * c_dim1];
928 f24 = c[i + 1 + (j + 3) * c_dim1];
929 f31 = c[i + 2 + j * c_dim1];
930 f41 = c[i + 3 + j * c_dim1];
931 f32 = c[i + 2 + (j + 1) * c_dim1];
932 f42 = c[i + 3 + (j + 1) * c_dim1];
933 f33 = c[i + 2 + (j + 2) * c_dim1];
934 f43 = c[i + 3 + (j + 2) * c_dim1];
935 f34 = c[i + 2 + (j + 3) * c_dim1];
936 f44 = c[i + 3 + (j + 3) * c_dim1];
937 i6 = ll + lsec - 1;
938 for (l = ll; l <= i6; ++l)
939 {
940 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
941 * b[l + j * b_dim1];
942 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
943 * b[l + j * b_dim1];
944 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
945 * b[l + (j + 1) * b_dim1];
946 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
947 * b[l + (j + 1) * b_dim1];
948 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
949 * b[l + (j + 2) * b_dim1];
950 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
951 * b[l + (j + 2) * b_dim1];
952 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
953 * b[l + (j + 3) * b_dim1];
954 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
955 * b[l + (j + 3) * b_dim1];
956 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
957 * b[l + j * b_dim1];
958 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
959 * b[l + j * b_dim1];
960 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
961 * b[l + (j + 1) * b_dim1];
962 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
963 * b[l + (j + 1) * b_dim1];
964 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
965 * b[l + (j + 2) * b_dim1];
966 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
967 * b[l + (j + 2) * b_dim1];
968 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
969 * b[l + (j + 3) * b_dim1];
970 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
971 * b[l + (j + 3) * b_dim1];
972 }
973 c[i + j * c_dim1] = f11;
974 c[i + 1 + j * c_dim1] = f21;
975 c[i + (j + 1) * c_dim1] = f12;
976 c[i + 1 + (j + 1) * c_dim1] = f22;
977 c[i + (j + 2) * c_dim1] = f13;
978 c[i + 1 + (j + 2) * c_dim1] = f23;
979 c[i + (j + 3) * c_dim1] = f14;
980 c[i + 1 + (j + 3) * c_dim1] = f24;
981 c[i + 2 + j * c_dim1] = f31;
982 c[i + 3 + j * c_dim1] = f41;
983 c[i + 2 + (j + 1) * c_dim1] = f32;
984 c[i + 3 + (j + 1) * c_dim1] = f42;
985 c[i + 2 + (j + 2) * c_dim1] = f33;
986 c[i + 3 + (j + 2) * c_dim1] = f43;
987 c[i + 2 + (j + 3) * c_dim1] = f34;
988 c[i + 3 + (j + 3) * c_dim1] = f44;
989 }
990 if (uisec < isec)
991 {
992 i5 = ii + isec - 1;
993 for (i = ii + uisec; i <= i5; ++i)
994 {
995 f11 = c[i + j * c_dim1];
996 f12 = c[i + (j + 1) * c_dim1];
997 f13 = c[i + (j + 2) * c_dim1];
998 f14 = c[i + (j + 3) * c_dim1];
999 i6 = ll + lsec - 1;
1000 for (l = ll; l <= i6; ++l)
1001 {
1002 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1003 257] * b[l + j * b_dim1];
1004 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1005 257] * b[l + (j + 1) * b_dim1];
1006 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1007 257] * b[l + (j + 2) * b_dim1];
1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1009 257] * b[l + (j + 3) * b_dim1];
1010 }
1011 c[i + j * c_dim1] = f11;
1012 c[i + (j + 1) * c_dim1] = f12;
1013 c[i + (j + 2) * c_dim1] = f13;
1014 c[i + (j + 3) * c_dim1] = f14;
1015 }
1016 }
1017 }
1018 if (ujsec < jsec)
1019 {
1020 i4 = jj + jsec - 1;
1021 for (j = jj + ujsec; j <= i4; ++j)
1022 {
1023 i5 = ii + uisec - 1;
1024 for (i = ii; i <= i5; i += 4)
1025 {
1026 f11 = c[i + j * c_dim1];
1027 f21 = c[i + 1 + j * c_dim1];
1028 f31 = c[i + 2 + j * c_dim1];
1029 f41 = c[i + 3 + j * c_dim1];
1030 i6 = ll + lsec - 1;
1031 for (l = ll; l <= i6; ++l)
1032 {
1033 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1034 257] * b[l + j * b_dim1];
1035 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1036 257] * b[l + j * b_dim1];
1037 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1038 257] * b[l + j * b_dim1];
1039 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1040 257] * b[l + j * b_dim1];
1041 }
1042 c[i + j * c_dim1] = f11;
1043 c[i + 1 + j * c_dim1] = f21;
1044 c[i + 2 + j * c_dim1] = f31;
1045 c[i + 3 + j * c_dim1] = f41;
1046 }
1047 i5 = ii + isec - 1;
1048 for (i = ii + uisec; i <= i5; ++i)
1049 {
1050 f11 = c[i + j * c_dim1];
1051 i6 = ll + lsec - 1;
1052 for (l = ll; l <= i6; ++l)
1053 {
1054 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1055 257] * b[l + j * b_dim1];
1056 }
1057 c[i + j * c_dim1] = f11;
1058 }
1059 }
1060 }
1061 }
1062 }
1063 }
1064 free(t1);
1065 return;
1066 }
1067 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1068 {
1069 if (GFC_DESCRIPTOR_RANK (a) != 1)
1070 {
1071 const GFC_INTEGER_2 *restrict abase_x;
1072 const GFC_INTEGER_2 *restrict bbase_y;
1073 GFC_INTEGER_2 *restrict dest_y;
1074 GFC_INTEGER_2 s;
1075
1076 for (y = 0; y < ycount; y++)
1077 {
1078 bbase_y = &bbase[y*bystride];
1079 dest_y = &dest[y*rystride];
1080 for (x = 0; x < xcount; x++)
1081 {
1082 abase_x = &abase[x*axstride];
1083 s = (GFC_INTEGER_2) 0;
1084 for (n = 0; n < count; n++)
1085 s += abase_x[n] * bbase_y[n];
1086 dest_y[x] = s;
1087 }
1088 }
1089 }
1090 else
1091 {
1092 const GFC_INTEGER_2 *restrict bbase_y;
1093 GFC_INTEGER_2 s;
1094
1095 for (y = 0; y < ycount; y++)
1096 {
1097 bbase_y = &bbase[y*bystride];
1098 s = (GFC_INTEGER_2) 0;
1099 for (n = 0; n < count; n++)
1100 s += abase[n*axstride] * bbase_y[n];
1101 dest[y*rystride] = s;
1102 }
1103 }
1104 }
1105 else if (axstride < aystride)
1106 {
1107 for (y = 0; y < ycount; y++)
1108 for (x = 0; x < xcount; x++)
1109 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1110
1111 for (y = 0; y < ycount; y++)
1112 for (n = 0; n < count; n++)
1113 for (x = 0; x < xcount; x++)
1114 /* dest[x,y] += a[x,n] * b[n,y] */
1115 dest[x*rxstride + y*rystride] +=
1116 abase[x*axstride + n*aystride] *
1117 bbase[n*bxstride + y*bystride];
1118 }
1119 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1120 {
1121 const GFC_INTEGER_2 *restrict bbase_y;
1122 GFC_INTEGER_2 s;
1123
1124 for (y = 0; y < ycount; y++)
1125 {
1126 bbase_y = &bbase[y*bystride];
1127 s = (GFC_INTEGER_2) 0;
1128 for (n = 0; n < count; n++)
1129 s += abase[n*axstride] * bbase_y[n*bxstride];
1130 dest[y*rxstride] = s;
1131 }
1132 }
1133 else
1134 {
1135 const GFC_INTEGER_2 *restrict abase_x;
1136 const GFC_INTEGER_2 *restrict bbase_y;
1137 GFC_INTEGER_2 *restrict dest_y;
1138 GFC_INTEGER_2 s;
1139
1140 for (y = 0; y < ycount; y++)
1141 {
1142 bbase_y = &bbase[y*bystride];
1143 dest_y = &dest[y*rystride];
1144 for (x = 0; x < xcount; x++)
1145 {
1146 abase_x = &abase[x*axstride];
1147 s = (GFC_INTEGER_2) 0;
1148 for (n = 0; n < count; n++)
1149 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1150 dest_y[x*rxstride] = s;
1151 }
1152 }
1153 }
1154}
1155#undef POW3
1156#undef min
1157#undef max
1158
1159#endif
1160
1161#endif
1162