]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmulavx128_i2.c
Daily bump.
[thirdparty/gcc.git] / libgfortran / generated / matmulavx128_i2.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 /* These are the specific versions of matmul with -mprefer-avx128. */
32
33 #if defined (HAVE_GFC_INTEGER_2)
34
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36 passed to us by the front-end, in which case we call it for large
37 matrices. */
38
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40 const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
41 const int *, const GFC_INTEGER_2 *, const int *,
42 const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
43 int, int);
44
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46 void
47 matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
48 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50 internal_proto(matmul_i2_avx128_fma3);
51 void
52 matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
53 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
54 int blas_limit, blas_call gemm)
55 {
56 const GFC_INTEGER_2 * restrict abase;
57 const GFC_INTEGER_2 * restrict bbase;
58 GFC_INTEGER_2 * restrict dest;
59
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61 index_type x, y, n, count, xcount, ycount;
62
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
64 || GFC_DESCRIPTOR_RANK (b) == 2);
65
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68 Either A or B (but not both) can be rank 1:
69
70 o One-dimensional argument A is implicitly treated as a row matrix
71 dimensioned [1,count], so xcount=1.
72
73 o One-dimensional argument B is implicitly treated as a column matrix
74 dimensioned [count, 1], so ycount=1.
75 */
76
77 if (retarray->base_addr == NULL)
78 {
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
80 {
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83 }
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
85 {
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88 }
89 else
90 {
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
97 }
98
99 retarray->base_addr
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
101 retarray->offset = 0;
102 }
103 else if (unlikely (compile_options.bounds_check))
104 {
105 index_type ret_extent, arg_extent;
106
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 {
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111 if (arg_extent != ret_extent)
112 runtime_error ("Incorrect extent in return array in"
113 " MATMUL intrinsic: is %ld, should be %ld",
114 (long int) ret_extent, (long int) arg_extent);
115 }
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
117 {
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120 if (arg_extent != ret_extent)
121 runtime_error ("Incorrect extent in return array in"
122 " MATMUL intrinsic: is %ld, should be %ld",
123 (long int) ret_extent, (long int) arg_extent);
124 }
125 else
126 {
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129 if (arg_extent != ret_extent)
130 runtime_error ("Incorrect extent in return array in"
131 " MATMUL intrinsic for dimension 1:"
132 " is %ld, should be %ld",
133 (long int) ret_extent, (long int) arg_extent);
134
135 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
136 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
137 if (arg_extent != ret_extent)
138 runtime_error ("Incorrect extent in return array in"
139 " MATMUL intrinsic for dimension 2:"
140 " is %ld, should be %ld",
141 (long int) ret_extent, (long int) arg_extent);
142 }
143 }
144
145
146 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
147 {
148 /* One-dimensional result may be addressed in the code below
149 either as a row or a column matrix. We want both cases to
150 work. */
151 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
152 }
153 else
154 {
155 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
156 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
157 }
158
159
160 if (GFC_DESCRIPTOR_RANK (a) == 1)
161 {
162 /* Treat it as a a row matrix A[1,count]. */
163 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
164 aystride = 1;
165
166 xcount = 1;
167 count = GFC_DESCRIPTOR_EXTENT(a,0);
168 }
169 else
170 {
171 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
172 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
173
174 count = GFC_DESCRIPTOR_EXTENT(a,1);
175 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
176 }
177
178 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
179 {
180 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
181 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
182 }
183
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
185 {
186 /* Treat it as a column matrix B[count,1] */
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189 /* bystride should never be used for 1-dimensional b.
190 The value is only used for calculation of the
191 memory by the buffer. */
192 bystride = 256;
193 ycount = 1;
194 }
195 else
196 {
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200 }
201
202 abase = a->base_addr;
203 bbase = b->base_addr;
204 dest = retarray->base_addr;
205
206 /* Now that everything is set up, we perform the multiplication
207 itself. */
208
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
212
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214 && (bxstride == 1 || bystride == 1)
215 && (((float) xcount) * ((float) ycount) * ((float) count)
216 > POW3(blas_limit)))
217 {
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
219 const GFC_INTEGER_2 one = 1, zero = 0;
220 const int lda = (axstride == 1) ? aystride : axstride,
221 ldb = (bxstride == 1) ? bystride : bxstride;
222
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224 {
225 assert (gemm != NULL);
226 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
227 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
228 &ldc, 1, 1);
229 return;
230 }
231 }
232
233 if (rxstride == 1 && axstride == 1 && bxstride == 1)
234 {
235 /* This block of code implements a tuned matmul, derived from
236 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
237
238 Bo Kagstrom and Per Ling
239 Department of Computing Science
240 Umea University
241 S-901 87 Umea, Sweden
242
243 from netlib.org, translated to C, and modified for matmul.m4. */
244
245 const GFC_INTEGER_2 *a, *b;
246 GFC_INTEGER_2 *c;
247 const index_type m = xcount, n = ycount, k = count;
248
249 /* System generated locals */
250 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
251 i1, i2, i3, i4, i5, i6;
252
253 /* Local variables */
254 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
255 f13, f14, f23, f24, f33, f34, f43, f44;
256 index_type i, j, l, ii, jj, ll;
257 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
258 GFC_INTEGER_2 *t1;
259
260 a = abase;
261 b = bbase;
262 c = retarray->base_addr;
263
264 /* Parameter adjustments */
265 c_dim1 = rystride;
266 c_offset = 1 + c_dim1;
267 c -= c_offset;
268 a_dim1 = aystride;
269 a_offset = 1 + a_dim1;
270 a -= a_offset;
271 b_dim1 = bystride;
272 b_offset = 1 + b_dim1;
273 b -= b_offset;
274
275 /* Empty c first. */
276 for (j=1; j<=n; j++)
277 for (i=1; i<=m; i++)
278 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
279
280 /* Early exit if possible */
281 if (m == 0 || n == 0 || k == 0)
282 return;
283
284 /* Adjust size of t1 to what is needed. */
285 index_type t1_dim, a_sz;
286 if (aystride == 1)
287 a_sz = rystride;
288 else
289 a_sz = a_dim1;
290
291 t1_dim = a_sz * 256 + b_dim1;
292 if (t1_dim > 65536)
293 t1_dim = 65536;
294
295 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
296
297 /* Start turning the crank. */
298 i1 = n;
299 for (jj = 1; jj <= i1; jj += 512)
300 {
301 /* Computing MIN */
302 i2 = 512;
303 i3 = n - jj + 1;
304 jsec = min(i2,i3);
305 ujsec = jsec - jsec % 4;
306 i2 = k;
307 for (ll = 1; ll <= i2; ll += 256)
308 {
309 /* Computing MIN */
310 i3 = 256;
311 i4 = k - ll + 1;
312 lsec = min(i3,i4);
313 ulsec = lsec - lsec % 2;
314
315 i3 = m;
316 for (ii = 1; ii <= i3; ii += 256)
317 {
318 /* Computing MIN */
319 i4 = 256;
320 i5 = m - ii + 1;
321 isec = min(i4,i5);
322 uisec = isec - isec % 2;
323 i4 = ll + ulsec - 1;
324 for (l = ll; l <= i4; l += 2)
325 {
326 i5 = ii + uisec - 1;
327 for (i = ii; i <= i5; i += 2)
328 {
329 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
330 a[i + l * a_dim1];
331 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
332 a[i + (l + 1) * a_dim1];
333 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
334 a[i + 1 + l * a_dim1];
335 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
336 a[i + 1 + (l + 1) * a_dim1];
337 }
338 if (uisec < isec)
339 {
340 t1[l - ll + 1 + (isec << 8) - 257] =
341 a[ii + isec - 1 + l * a_dim1];
342 t1[l - ll + 2 + (isec << 8) - 257] =
343 a[ii + isec - 1 + (l + 1) * a_dim1];
344 }
345 }
346 if (ulsec < lsec)
347 {
348 i4 = ii + isec - 1;
349 for (i = ii; i<= i4; ++i)
350 {
351 t1[lsec + ((i - ii + 1) << 8) - 257] =
352 a[i + (ll + lsec - 1) * a_dim1];
353 }
354 }
355
356 uisec = isec - isec % 4;
357 i4 = jj + ujsec - 1;
358 for (j = jj; j <= i4; j += 4)
359 {
360 i5 = ii + uisec - 1;
361 for (i = ii; i <= i5; i += 4)
362 {
363 f11 = c[i + j * c_dim1];
364 f21 = c[i + 1 + j * c_dim1];
365 f12 = c[i + (j + 1) * c_dim1];
366 f22 = c[i + 1 + (j + 1) * c_dim1];
367 f13 = c[i + (j + 2) * c_dim1];
368 f23 = c[i + 1 + (j + 2) * c_dim1];
369 f14 = c[i + (j + 3) * c_dim1];
370 f24 = c[i + 1 + (j + 3) * c_dim1];
371 f31 = c[i + 2 + j * c_dim1];
372 f41 = c[i + 3 + j * c_dim1];
373 f32 = c[i + 2 + (j + 1) * c_dim1];
374 f42 = c[i + 3 + (j + 1) * c_dim1];
375 f33 = c[i + 2 + (j + 2) * c_dim1];
376 f43 = c[i + 3 + (j + 2) * c_dim1];
377 f34 = c[i + 2 + (j + 3) * c_dim1];
378 f44 = c[i + 3 + (j + 3) * c_dim1];
379 i6 = ll + lsec - 1;
380 for (l = ll; l <= i6; ++l)
381 {
382 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
383 * b[l + j * b_dim1];
384 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
385 * b[l + j * b_dim1];
386 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
387 * b[l + (j + 1) * b_dim1];
388 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
389 * b[l + (j + 1) * b_dim1];
390 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
391 * b[l + (j + 2) * b_dim1];
392 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
393 * b[l + (j + 2) * b_dim1];
394 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
395 * b[l + (j + 3) * b_dim1];
396 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
397 * b[l + (j + 3) * b_dim1];
398 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
399 * b[l + j * b_dim1];
400 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
401 * b[l + j * b_dim1];
402 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
403 * b[l + (j + 1) * b_dim1];
404 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
405 * b[l + (j + 1) * b_dim1];
406 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
407 * b[l + (j + 2) * b_dim1];
408 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
409 * b[l + (j + 2) * b_dim1];
410 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
411 * b[l + (j + 3) * b_dim1];
412 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
413 * b[l + (j + 3) * b_dim1];
414 }
415 c[i + j * c_dim1] = f11;
416 c[i + 1 + j * c_dim1] = f21;
417 c[i + (j + 1) * c_dim1] = f12;
418 c[i + 1 + (j + 1) * c_dim1] = f22;
419 c[i + (j + 2) * c_dim1] = f13;
420 c[i + 1 + (j + 2) * c_dim1] = f23;
421 c[i + (j + 3) * c_dim1] = f14;
422 c[i + 1 + (j + 3) * c_dim1] = f24;
423 c[i + 2 + j * c_dim1] = f31;
424 c[i + 3 + j * c_dim1] = f41;
425 c[i + 2 + (j + 1) * c_dim1] = f32;
426 c[i + 3 + (j + 1) * c_dim1] = f42;
427 c[i + 2 + (j + 2) * c_dim1] = f33;
428 c[i + 3 + (j + 2) * c_dim1] = f43;
429 c[i + 2 + (j + 3) * c_dim1] = f34;
430 c[i + 3 + (j + 3) * c_dim1] = f44;
431 }
432 if (uisec < isec)
433 {
434 i5 = ii + isec - 1;
435 for (i = ii + uisec; i <= i5; ++i)
436 {
437 f11 = c[i + j * c_dim1];
438 f12 = c[i + (j + 1) * c_dim1];
439 f13 = c[i + (j + 2) * c_dim1];
440 f14 = c[i + (j + 3) * c_dim1];
441 i6 = ll + lsec - 1;
442 for (l = ll; l <= i6; ++l)
443 {
444 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
445 257] * b[l + j * b_dim1];
446 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
447 257] * b[l + (j + 1) * b_dim1];
448 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
449 257] * b[l + (j + 2) * b_dim1];
450 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
451 257] * b[l + (j + 3) * b_dim1];
452 }
453 c[i + j * c_dim1] = f11;
454 c[i + (j + 1) * c_dim1] = f12;
455 c[i + (j + 2) * c_dim1] = f13;
456 c[i + (j + 3) * c_dim1] = f14;
457 }
458 }
459 }
460 if (ujsec < jsec)
461 {
462 i4 = jj + jsec - 1;
463 for (j = jj + ujsec; j <= i4; ++j)
464 {
465 i5 = ii + uisec - 1;
466 for (i = ii; i <= i5; i += 4)
467 {
468 f11 = c[i + j * c_dim1];
469 f21 = c[i + 1 + j * c_dim1];
470 f31 = c[i + 2 + j * c_dim1];
471 f41 = c[i + 3 + j * c_dim1];
472 i6 = ll + lsec - 1;
473 for (l = ll; l <= i6; ++l)
474 {
475 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + j * b_dim1];
477 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
478 257] * b[l + j * b_dim1];
479 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
480 257] * b[l + j * b_dim1];
481 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
482 257] * b[l + j * b_dim1];
483 }
484 c[i + j * c_dim1] = f11;
485 c[i + 1 + j * c_dim1] = f21;
486 c[i + 2 + j * c_dim1] = f31;
487 c[i + 3 + j * c_dim1] = f41;
488 }
489 i5 = ii + isec - 1;
490 for (i = ii + uisec; i <= i5; ++i)
491 {
492 f11 = c[i + j * c_dim1];
493 i6 = ll + lsec - 1;
494 for (l = ll; l <= i6; ++l)
495 {
496 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
497 257] * b[l + j * b_dim1];
498 }
499 c[i + j * c_dim1] = f11;
500 }
501 }
502 }
503 }
504 }
505 }
506 free(t1);
507 return;
508 }
509 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
510 {
511 if (GFC_DESCRIPTOR_RANK (a) != 1)
512 {
513 const GFC_INTEGER_2 *restrict abase_x;
514 const GFC_INTEGER_2 *restrict bbase_y;
515 GFC_INTEGER_2 *restrict dest_y;
516 GFC_INTEGER_2 s;
517
518 for (y = 0; y < ycount; y++)
519 {
520 bbase_y = &bbase[y*bystride];
521 dest_y = &dest[y*rystride];
522 for (x = 0; x < xcount; x++)
523 {
524 abase_x = &abase[x*axstride];
525 s = (GFC_INTEGER_2) 0;
526 for (n = 0; n < count; n++)
527 s += abase_x[n] * bbase_y[n];
528 dest_y[x] = s;
529 }
530 }
531 }
532 else
533 {
534 const GFC_INTEGER_2 *restrict bbase_y;
535 GFC_INTEGER_2 s;
536
537 for (y = 0; y < ycount; y++)
538 {
539 bbase_y = &bbase[y*bystride];
540 s = (GFC_INTEGER_2) 0;
541 for (n = 0; n < count; n++)
542 s += abase[n*axstride] * bbase_y[n];
543 dest[y*rystride] = s;
544 }
545 }
546 }
547 else if (axstride < aystride)
548 {
549 for (y = 0; y < ycount; y++)
550 for (x = 0; x < xcount; x++)
551 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
552
553 for (y = 0; y < ycount; y++)
554 for (n = 0; n < count; n++)
555 for (x = 0; x < xcount; x++)
556 /* dest[x,y] += a[x,n] * b[n,y] */
557 dest[x*rxstride + y*rystride] +=
558 abase[x*axstride + n*aystride] *
559 bbase[n*bxstride + y*bystride];
560 }
561 else if (GFC_DESCRIPTOR_RANK (a) == 1)
562 {
563 const GFC_INTEGER_2 *restrict bbase_y;
564 GFC_INTEGER_2 s;
565
566 for (y = 0; y < ycount; y++)
567 {
568 bbase_y = &bbase[y*bystride];
569 s = (GFC_INTEGER_2) 0;
570 for (n = 0; n < count; n++)
571 s += abase[n*axstride] * bbase_y[n*bxstride];
572 dest[y*rxstride] = s;
573 }
574 }
575 else
576 {
577 const GFC_INTEGER_2 *restrict abase_x;
578 const GFC_INTEGER_2 *restrict bbase_y;
579 GFC_INTEGER_2 *restrict dest_y;
580 GFC_INTEGER_2 s;
581
582 for (y = 0; y < ycount; y++)
583 {
584 bbase_y = &bbase[y*bystride];
585 dest_y = &dest[y*rystride];
586 for (x = 0; x < xcount; x++)
587 {
588 abase_x = &abase[x*axstride];
589 s = (GFC_INTEGER_2) 0;
590 for (n = 0; n < count; n++)
591 s += abase_x[n*aystride] * bbase_y[n*bxstride];
592 dest_y[x*rxstride] = s;
593 }
594 }
595 }
596 }
597 #undef POW3
598 #undef min
599 #undef max
600
601 #endif
602
603 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
604 void
605 matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
606 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
607 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
608 internal_proto(matmul_i2_avx128_fma4);
609 void
610 matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
611 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
612 int blas_limit, blas_call gemm)
613 {
614 const GFC_INTEGER_2 * restrict abase;
615 const GFC_INTEGER_2 * restrict bbase;
616 GFC_INTEGER_2 * restrict dest;
617
618 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
619 index_type x, y, n, count, xcount, ycount;
620
621 assert (GFC_DESCRIPTOR_RANK (a) == 2
622 || GFC_DESCRIPTOR_RANK (b) == 2);
623
624 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
625
626 Either A or B (but not both) can be rank 1:
627
628 o One-dimensional argument A is implicitly treated as a row matrix
629 dimensioned [1,count], so xcount=1.
630
631 o One-dimensional argument B is implicitly treated as a column matrix
632 dimensioned [count, 1], so ycount=1.
633 */
634
635 if (retarray->base_addr == NULL)
636 {
637 if (GFC_DESCRIPTOR_RANK (a) == 1)
638 {
639 GFC_DIMENSION_SET(retarray->dim[0], 0,
640 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
641 }
642 else if (GFC_DESCRIPTOR_RANK (b) == 1)
643 {
644 GFC_DIMENSION_SET(retarray->dim[0], 0,
645 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
646 }
647 else
648 {
649 GFC_DIMENSION_SET(retarray->dim[0], 0,
650 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
651
652 GFC_DIMENSION_SET(retarray->dim[1], 0,
653 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
654 GFC_DESCRIPTOR_EXTENT(retarray,0));
655 }
656
657 retarray->base_addr
658 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
659 retarray->offset = 0;
660 }
661 else if (unlikely (compile_options.bounds_check))
662 {
663 index_type ret_extent, arg_extent;
664
665 if (GFC_DESCRIPTOR_RANK (a) == 1)
666 {
667 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
668 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
669 if (arg_extent != ret_extent)
670 runtime_error ("Incorrect extent in return array in"
671 " MATMUL intrinsic: is %ld, should be %ld",
672 (long int) ret_extent, (long int) arg_extent);
673 }
674 else if (GFC_DESCRIPTOR_RANK (b) == 1)
675 {
676 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
677 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
678 if (arg_extent != ret_extent)
679 runtime_error ("Incorrect extent in return array in"
680 " MATMUL intrinsic: is %ld, should be %ld",
681 (long int) ret_extent, (long int) arg_extent);
682 }
683 else
684 {
685 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
686 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
687 if (arg_extent != ret_extent)
688 runtime_error ("Incorrect extent in return array in"
689 " MATMUL intrinsic for dimension 1:"
690 " is %ld, should be %ld",
691 (long int) ret_extent, (long int) arg_extent);
692
693 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
694 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
695 if (arg_extent != ret_extent)
696 runtime_error ("Incorrect extent in return array in"
697 " MATMUL intrinsic for dimension 2:"
698 " is %ld, should be %ld",
699 (long int) ret_extent, (long int) arg_extent);
700 }
701 }
702
703
704 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
705 {
706 /* One-dimensional result may be addressed in the code below
707 either as a row or a column matrix. We want both cases to
708 work. */
709 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
710 }
711 else
712 {
713 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
714 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
715 }
716
717
718 if (GFC_DESCRIPTOR_RANK (a) == 1)
719 {
720 /* Treat it as a a row matrix A[1,count]. */
721 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
722 aystride = 1;
723
724 xcount = 1;
725 count = GFC_DESCRIPTOR_EXTENT(a,0);
726 }
727 else
728 {
729 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
730 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
731
732 count = GFC_DESCRIPTOR_EXTENT(a,1);
733 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
734 }
735
736 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
737 {
738 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
739 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
740 }
741
742 if (GFC_DESCRIPTOR_RANK (b) == 1)
743 {
744 /* Treat it as a column matrix B[count,1] */
745 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
746
747 /* bystride should never be used for 1-dimensional b.
748 The value is only used for calculation of the
749 memory by the buffer. */
750 bystride = 256;
751 ycount = 1;
752 }
753 else
754 {
755 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
756 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
757 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
758 }
759
760 abase = a->base_addr;
761 bbase = b->base_addr;
762 dest = retarray->base_addr;
763
764 /* Now that everything is set up, we perform the multiplication
765 itself. */
766
767 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
768 #define min(a,b) ((a) <= (b) ? (a) : (b))
769 #define max(a,b) ((a) >= (b) ? (a) : (b))
770
771 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
772 && (bxstride == 1 || bystride == 1)
773 && (((float) xcount) * ((float) ycount) * ((float) count)
774 > POW3(blas_limit)))
775 {
776 const int m = xcount, n = ycount, k = count, ldc = rystride;
777 const GFC_INTEGER_2 one = 1, zero = 0;
778 const int lda = (axstride == 1) ? aystride : axstride,
779 ldb = (bxstride == 1) ? bystride : bxstride;
780
781 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
782 {
783 assert (gemm != NULL);
784 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
785 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
786 &ldc, 1, 1);
787 return;
788 }
789 }
790
791 if (rxstride == 1 && axstride == 1 && bxstride == 1)
792 {
793 /* This block of code implements a tuned matmul, derived from
794 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
795
796 Bo Kagstrom and Per Ling
797 Department of Computing Science
798 Umea University
799 S-901 87 Umea, Sweden
800
801 from netlib.org, translated to C, and modified for matmul.m4. */
802
803 const GFC_INTEGER_2 *a, *b;
804 GFC_INTEGER_2 *c;
805 const index_type m = xcount, n = ycount, k = count;
806
807 /* System generated locals */
808 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
809 i1, i2, i3, i4, i5, i6;
810
811 /* Local variables */
812 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
813 f13, f14, f23, f24, f33, f34, f43, f44;
814 index_type i, j, l, ii, jj, ll;
815 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
816 GFC_INTEGER_2 *t1;
817
818 a = abase;
819 b = bbase;
820 c = retarray->base_addr;
821
822 /* Parameter adjustments */
823 c_dim1 = rystride;
824 c_offset = 1 + c_dim1;
825 c -= c_offset;
826 a_dim1 = aystride;
827 a_offset = 1 + a_dim1;
828 a -= a_offset;
829 b_dim1 = bystride;
830 b_offset = 1 + b_dim1;
831 b -= b_offset;
832
833 /* Empty c first. */
834 for (j=1; j<=n; j++)
835 for (i=1; i<=m; i++)
836 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
837
838 /* Early exit if possible */
839 if (m == 0 || n == 0 || k == 0)
840 return;
841
842 /* Adjust size of t1 to what is needed. */
843 index_type t1_dim, a_sz;
844 if (aystride == 1)
845 a_sz = rystride;
846 else
847 a_sz = a_dim1;
848
849 t1_dim = a_sz * 256 + b_dim1;
850 if (t1_dim > 65536)
851 t1_dim = 65536;
852
853 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
854
855 /* Start turning the crank. */
856 i1 = n;
857 for (jj = 1; jj <= i1; jj += 512)
858 {
859 /* Computing MIN */
860 i2 = 512;
861 i3 = n - jj + 1;
862 jsec = min(i2,i3);
863 ujsec = jsec - jsec % 4;
864 i2 = k;
865 for (ll = 1; ll <= i2; ll += 256)
866 {
867 /* Computing MIN */
868 i3 = 256;
869 i4 = k - ll + 1;
870 lsec = min(i3,i4);
871 ulsec = lsec - lsec % 2;
872
873 i3 = m;
874 for (ii = 1; ii <= i3; ii += 256)
875 {
876 /* Computing MIN */
877 i4 = 256;
878 i5 = m - ii + 1;
879 isec = min(i4,i5);
880 uisec = isec - isec % 2;
881 i4 = ll + ulsec - 1;
882 for (l = ll; l <= i4; l += 2)
883 {
884 i5 = ii + uisec - 1;
885 for (i = ii; i <= i5; i += 2)
886 {
887 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
888 a[i + l * a_dim1];
889 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
890 a[i + (l + 1) * a_dim1];
891 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
892 a[i + 1 + l * a_dim1];
893 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
894 a[i + 1 + (l + 1) * a_dim1];
895 }
896 if (uisec < isec)
897 {
898 t1[l - ll + 1 + (isec << 8) - 257] =
899 a[ii + isec - 1 + l * a_dim1];
900 t1[l - ll + 2 + (isec << 8) - 257] =
901 a[ii + isec - 1 + (l + 1) * a_dim1];
902 }
903 }
904 if (ulsec < lsec)
905 {
906 i4 = ii + isec - 1;
907 for (i = ii; i<= i4; ++i)
908 {
909 t1[lsec + ((i - ii + 1) << 8) - 257] =
910 a[i + (ll + lsec - 1) * a_dim1];
911 }
912 }
913
914 uisec = isec - isec % 4;
915 i4 = jj + ujsec - 1;
916 for (j = jj; j <= i4; j += 4)
917 {
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 4)
920 {
921 f11 = c[i + j * c_dim1];
922 f21 = c[i + 1 + j * c_dim1];
923 f12 = c[i + (j + 1) * c_dim1];
924 f22 = c[i + 1 + (j + 1) * c_dim1];
925 f13 = c[i + (j + 2) * c_dim1];
926 f23 = c[i + 1 + (j + 2) * c_dim1];
927 f14 = c[i + (j + 3) * c_dim1];
928 f24 = c[i + 1 + (j + 3) * c_dim1];
929 f31 = c[i + 2 + j * c_dim1];
930 f41 = c[i + 3 + j * c_dim1];
931 f32 = c[i + 2 + (j + 1) * c_dim1];
932 f42 = c[i + 3 + (j + 1) * c_dim1];
933 f33 = c[i + 2 + (j + 2) * c_dim1];
934 f43 = c[i + 3 + (j + 2) * c_dim1];
935 f34 = c[i + 2 + (j + 3) * c_dim1];
936 f44 = c[i + 3 + (j + 3) * c_dim1];
937 i6 = ll + lsec - 1;
938 for (l = ll; l <= i6; ++l)
939 {
940 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
941 * b[l + j * b_dim1];
942 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
943 * b[l + j * b_dim1];
944 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
945 * b[l + (j + 1) * b_dim1];
946 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
947 * b[l + (j + 1) * b_dim1];
948 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
949 * b[l + (j + 2) * b_dim1];
950 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
951 * b[l + (j + 2) * b_dim1];
952 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
953 * b[l + (j + 3) * b_dim1];
954 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
955 * b[l + (j + 3) * b_dim1];
956 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
957 * b[l + j * b_dim1];
958 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
959 * b[l + j * b_dim1];
960 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
961 * b[l + (j + 1) * b_dim1];
962 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
963 * b[l + (j + 1) * b_dim1];
964 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
965 * b[l + (j + 2) * b_dim1];
966 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
967 * b[l + (j + 2) * b_dim1];
968 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
969 * b[l + (j + 3) * b_dim1];
970 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
971 * b[l + (j + 3) * b_dim1];
972 }
973 c[i + j * c_dim1] = f11;
974 c[i + 1 + j * c_dim1] = f21;
975 c[i + (j + 1) * c_dim1] = f12;
976 c[i + 1 + (j + 1) * c_dim1] = f22;
977 c[i + (j + 2) * c_dim1] = f13;
978 c[i + 1 + (j + 2) * c_dim1] = f23;
979 c[i + (j + 3) * c_dim1] = f14;
980 c[i + 1 + (j + 3) * c_dim1] = f24;
981 c[i + 2 + j * c_dim1] = f31;
982 c[i + 3 + j * c_dim1] = f41;
983 c[i + 2 + (j + 1) * c_dim1] = f32;
984 c[i + 3 + (j + 1) * c_dim1] = f42;
985 c[i + 2 + (j + 2) * c_dim1] = f33;
986 c[i + 3 + (j + 2) * c_dim1] = f43;
987 c[i + 2 + (j + 3) * c_dim1] = f34;
988 c[i + 3 + (j + 3) * c_dim1] = f44;
989 }
990 if (uisec < isec)
991 {
992 i5 = ii + isec - 1;
993 for (i = ii + uisec; i <= i5; ++i)
994 {
995 f11 = c[i + j * c_dim1];
996 f12 = c[i + (j + 1) * c_dim1];
997 f13 = c[i + (j + 2) * c_dim1];
998 f14 = c[i + (j + 3) * c_dim1];
999 i6 = ll + lsec - 1;
1000 for (l = ll; l <= i6; ++l)
1001 {
1002 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1003 257] * b[l + j * b_dim1];
1004 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1005 257] * b[l + (j + 1) * b_dim1];
1006 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1007 257] * b[l + (j + 2) * b_dim1];
1008 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1009 257] * b[l + (j + 3) * b_dim1];
1010 }
1011 c[i + j * c_dim1] = f11;
1012 c[i + (j + 1) * c_dim1] = f12;
1013 c[i + (j + 2) * c_dim1] = f13;
1014 c[i + (j + 3) * c_dim1] = f14;
1015 }
1016 }
1017 }
1018 if (ujsec < jsec)
1019 {
1020 i4 = jj + jsec - 1;
1021 for (j = jj + ujsec; j <= i4; ++j)
1022 {
1023 i5 = ii + uisec - 1;
1024 for (i = ii; i <= i5; i += 4)
1025 {
1026 f11 = c[i + j * c_dim1];
1027 f21 = c[i + 1 + j * c_dim1];
1028 f31 = c[i + 2 + j * c_dim1];
1029 f41 = c[i + 3 + j * c_dim1];
1030 i6 = ll + lsec - 1;
1031 for (l = ll; l <= i6; ++l)
1032 {
1033 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1034 257] * b[l + j * b_dim1];
1035 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1036 257] * b[l + j * b_dim1];
1037 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1038 257] * b[l + j * b_dim1];
1039 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1040 257] * b[l + j * b_dim1];
1041 }
1042 c[i + j * c_dim1] = f11;
1043 c[i + 1 + j * c_dim1] = f21;
1044 c[i + 2 + j * c_dim1] = f31;
1045 c[i + 3 + j * c_dim1] = f41;
1046 }
1047 i5 = ii + isec - 1;
1048 for (i = ii + uisec; i <= i5; ++i)
1049 {
1050 f11 = c[i + j * c_dim1];
1051 i6 = ll + lsec - 1;
1052 for (l = ll; l <= i6; ++l)
1053 {
1054 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1055 257] * b[l + j * b_dim1];
1056 }
1057 c[i + j * c_dim1] = f11;
1058 }
1059 }
1060 }
1061 }
1062 }
1063 }
1064 free(t1);
1065 return;
1066 }
1067 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1068 {
1069 if (GFC_DESCRIPTOR_RANK (a) != 1)
1070 {
1071 const GFC_INTEGER_2 *restrict abase_x;
1072 const GFC_INTEGER_2 *restrict bbase_y;
1073 GFC_INTEGER_2 *restrict dest_y;
1074 GFC_INTEGER_2 s;
1075
1076 for (y = 0; y < ycount; y++)
1077 {
1078 bbase_y = &bbase[y*bystride];
1079 dest_y = &dest[y*rystride];
1080 for (x = 0; x < xcount; x++)
1081 {
1082 abase_x = &abase[x*axstride];
1083 s = (GFC_INTEGER_2) 0;
1084 for (n = 0; n < count; n++)
1085 s += abase_x[n] * bbase_y[n];
1086 dest_y[x] = s;
1087 }
1088 }
1089 }
1090 else
1091 {
1092 const GFC_INTEGER_2 *restrict bbase_y;
1093 GFC_INTEGER_2 s;
1094
1095 for (y = 0; y < ycount; y++)
1096 {
1097 bbase_y = &bbase[y*bystride];
1098 s = (GFC_INTEGER_2) 0;
1099 for (n = 0; n < count; n++)
1100 s += abase[n*axstride] * bbase_y[n];
1101 dest[y*rystride] = s;
1102 }
1103 }
1104 }
1105 else if (axstride < aystride)
1106 {
1107 for (y = 0; y < ycount; y++)
1108 for (x = 0; x < xcount; x++)
1109 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1110
1111 for (y = 0; y < ycount; y++)
1112 for (n = 0; n < count; n++)
1113 for (x = 0; x < xcount; x++)
1114 /* dest[x,y] += a[x,n] * b[n,y] */
1115 dest[x*rxstride + y*rystride] +=
1116 abase[x*axstride + n*aystride] *
1117 bbase[n*bxstride + y*bystride];
1118 }
1119 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1120 {
1121 const GFC_INTEGER_2 *restrict bbase_y;
1122 GFC_INTEGER_2 s;
1123
1124 for (y = 0; y < ycount; y++)
1125 {
1126 bbase_y = &bbase[y*bystride];
1127 s = (GFC_INTEGER_2) 0;
1128 for (n = 0; n < count; n++)
1129 s += abase[n*axstride] * bbase_y[n*bxstride];
1130 dest[y*rxstride] = s;
1131 }
1132 }
1133 else
1134 {
1135 const GFC_INTEGER_2 *restrict abase_x;
1136 const GFC_INTEGER_2 *restrict bbase_y;
1137 GFC_INTEGER_2 *restrict dest_y;
1138 GFC_INTEGER_2 s;
1139
1140 for (y = 0; y < ycount; y++)
1141 {
1142 bbase_y = &bbase[y*bystride];
1143 dest_y = &dest[y*rystride];
1144 for (x = 0; x < xcount; x++)
1145 {
1146 abase_x = &abase[x*axstride];
1147 s = (GFC_INTEGER_2) 0;
1148 for (n = 0; n < count; n++)
1149 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1150 dest_y[x*rxstride] = s;
1151 }
1152 }
1153 }
1154 }
1155 #undef POW3
1156 #undef min
1157 #undef max
1158
1159 #endif
1160
1161 #endif
1162