]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_i1.c
libgfortran.h: Include <stdlib.h> header.
[thirdparty/gcc.git] / libgfortran / generated / matmul_i1.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_INTEGER_1)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
39 const int *, const GFC_INTEGER_1 *, const int *,
40 const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_i1 (gfc_array_i1 * const restrict retarray,
73 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_i1);
76
77
78
79
80 /* Put exhaustive list of possible architectures here here, ORed together. */
81
82 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83
84 #ifdef HAVE_AVX
85 static void
86 matmul_i1_avx (gfc_array_i1 * const restrict retarray,
87 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
88 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
89 static void
90 matmul_i1_avx (gfc_array_i1 * const restrict retarray,
91 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
92 int blas_limit, blas_call gemm)
93 {
94 const GFC_INTEGER_1 * restrict abase;
95 const GFC_INTEGER_1 * restrict bbase;
96 GFC_INTEGER_1 * restrict dest;
97
98 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
99 index_type x, y, n, count, xcount, ycount;
100
101 assert (GFC_DESCRIPTOR_RANK (a) == 2
102 || GFC_DESCRIPTOR_RANK (b) == 2);
103
104 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
105
106 Either A or B (but not both) can be rank 1:
107
108 o One-dimensional argument A is implicitly treated as a row matrix
109 dimensioned [1,count], so xcount=1.
110
111 o One-dimensional argument B is implicitly treated as a column matrix
112 dimensioned [count, 1], so ycount=1.
113 */
114
115 if (retarray->base_addr == NULL)
116 {
117 if (GFC_DESCRIPTOR_RANK (a) == 1)
118 {
119 GFC_DIMENSION_SET(retarray->dim[0], 0,
120 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
121 }
122 else if (GFC_DESCRIPTOR_RANK (b) == 1)
123 {
124 GFC_DIMENSION_SET(retarray->dim[0], 0,
125 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
126 }
127 else
128 {
129 GFC_DIMENSION_SET(retarray->dim[0], 0,
130 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
131
132 GFC_DIMENSION_SET(retarray->dim[1], 0,
133 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
134 GFC_DESCRIPTOR_EXTENT(retarray,0));
135 }
136
137 retarray->base_addr
138 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
139 retarray->offset = 0;
140 }
141 else if (unlikely (compile_options.bounds_check))
142 {
143 index_type ret_extent, arg_extent;
144
145 if (GFC_DESCRIPTOR_RANK (a) == 1)
146 {
147 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
148 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
149 if (arg_extent != ret_extent)
150 runtime_error ("Incorrect extent in return array in"
151 " MATMUL intrinsic: is %ld, should be %ld",
152 (long int) ret_extent, (long int) arg_extent);
153 }
154 else if (GFC_DESCRIPTOR_RANK (b) == 1)
155 {
156 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
157 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
158 if (arg_extent != ret_extent)
159 runtime_error ("Incorrect extent in return array in"
160 " MATMUL intrinsic: is %ld, should be %ld",
161 (long int) ret_extent, (long int) arg_extent);
162 }
163 else
164 {
165 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
166 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
167 if (arg_extent != ret_extent)
168 runtime_error ("Incorrect extent in return array in"
169 " MATMUL intrinsic for dimension 1:"
170 " is %ld, should be %ld",
171 (long int) ret_extent, (long int) arg_extent);
172
173 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
174 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
175 if (arg_extent != ret_extent)
176 runtime_error ("Incorrect extent in return array in"
177 " MATMUL intrinsic for dimension 2:"
178 " is %ld, should be %ld",
179 (long int) ret_extent, (long int) arg_extent);
180 }
181 }
182
183
184 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
185 {
186 /* One-dimensional result may be addressed in the code below
187 either as a row or a column matrix. We want both cases to
188 work. */
189 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
190 }
191 else
192 {
193 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
194 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 }
196
197
198 if (GFC_DESCRIPTOR_RANK (a) == 1)
199 {
200 /* Treat it as a a row matrix A[1,count]. */
201 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
202 aystride = 1;
203
204 xcount = 1;
205 count = GFC_DESCRIPTOR_EXTENT(a,0);
206 }
207 else
208 {
209 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
210 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
211
212 count = GFC_DESCRIPTOR_EXTENT(a,1);
213 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
214 }
215
216 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
217 {
218 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
219 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
220 }
221
222 if (GFC_DESCRIPTOR_RANK (b) == 1)
223 {
224 /* Treat it as a column matrix B[count,1] */
225 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
226
227 /* bystride should never be used for 1-dimensional b.
228 in case it is we want it to cause a segfault, rather than
229 an incorrect result. */
230 bystride = 0xDEADBEEF;
231 ycount = 1;
232 }
233 else
234 {
235 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
236 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
237 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
238 }
239
240 abase = a->base_addr;
241 bbase = b->base_addr;
242 dest = retarray->base_addr;
243
244 /* Now that everything is set up, we perform the multiplication
245 itself. */
246
247 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
248 #define min(a,b) ((a) <= (b) ? (a) : (b))
249 #define max(a,b) ((a) >= (b) ? (a) : (b))
250
251 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
252 && (bxstride == 1 || bystride == 1)
253 && (((float) xcount) * ((float) ycount) * ((float) count)
254 > POW3(blas_limit)))
255 {
256 const int m = xcount, n = ycount, k = count, ldc = rystride;
257 const GFC_INTEGER_1 one = 1, zero = 0;
258 const int lda = (axstride == 1) ? aystride : axstride,
259 ldb = (bxstride == 1) ? bystride : bxstride;
260
261 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
262 {
263 assert (gemm != NULL);
264 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
265 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
266 &ldc, 1, 1);
267 return;
268 }
269 }
270
271 if (rxstride == 1 && axstride == 1 && bxstride == 1)
272 {
273 /* This block of code implements a tuned matmul, derived from
274 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
275
276 Bo Kagstrom and Per Ling
277 Department of Computing Science
278 Umea University
279 S-901 87 Umea, Sweden
280
281 from netlib.org, translated to C, and modified for matmul.m4. */
282
283 const GFC_INTEGER_1 *a, *b;
284 GFC_INTEGER_1 *c;
285 const index_type m = xcount, n = ycount, k = count;
286
287 /* System generated locals */
288 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
289 i1, i2, i3, i4, i5, i6;
290
291 /* Local variables */
292 GFC_INTEGER_1 t1[65536], /* was [256][256] */
293 f11, f12, f21, f22, f31, f32, f41, f42,
294 f13, f14, f23, f24, f33, f34, f43, f44;
295 index_type i, j, l, ii, jj, ll;
296 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
297
298 a = abase;
299 b = bbase;
300 c = retarray->base_addr;
301
302 /* Parameter adjustments */
303 c_dim1 = rystride;
304 c_offset = 1 + c_dim1;
305 c -= c_offset;
306 a_dim1 = aystride;
307 a_offset = 1 + a_dim1;
308 a -= a_offset;
309 b_dim1 = bystride;
310 b_offset = 1 + b_dim1;
311 b -= b_offset;
312
313 /* Early exit if possible */
314 if (m == 0 || n == 0 || k == 0)
315 return;
316
317 /* Empty c first. */
318 for (j=1; j<=n; j++)
319 for (i=1; i<=m; i++)
320 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
321
322 /* Start turning the crank. */
323 i1 = n;
324 for (jj = 1; jj <= i1; jj += 512)
325 {
326 /* Computing MIN */
327 i2 = 512;
328 i3 = n - jj + 1;
329 jsec = min(i2,i3);
330 ujsec = jsec - jsec % 4;
331 i2 = k;
332 for (ll = 1; ll <= i2; ll += 256)
333 {
334 /* Computing MIN */
335 i3 = 256;
336 i4 = k - ll + 1;
337 lsec = min(i3,i4);
338 ulsec = lsec - lsec % 2;
339
340 i3 = m;
341 for (ii = 1; ii <= i3; ii += 256)
342 {
343 /* Computing MIN */
344 i4 = 256;
345 i5 = m - ii + 1;
346 isec = min(i4,i5);
347 uisec = isec - isec % 2;
348 i4 = ll + ulsec - 1;
349 for (l = ll; l <= i4; l += 2)
350 {
351 i5 = ii + uisec - 1;
352 for (i = ii; i <= i5; i += 2)
353 {
354 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
355 a[i + l * a_dim1];
356 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
357 a[i + (l + 1) * a_dim1];
358 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
359 a[i + 1 + l * a_dim1];
360 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
361 a[i + 1 + (l + 1) * a_dim1];
362 }
363 if (uisec < isec)
364 {
365 t1[l - ll + 1 + (isec << 8) - 257] =
366 a[ii + isec - 1 + l * a_dim1];
367 t1[l - ll + 2 + (isec << 8) - 257] =
368 a[ii + isec - 1 + (l + 1) * a_dim1];
369 }
370 }
371 if (ulsec < lsec)
372 {
373 i4 = ii + isec - 1;
374 for (i = ii; i<= i4; ++i)
375 {
376 t1[lsec + ((i - ii + 1) << 8) - 257] =
377 a[i + (ll + lsec - 1) * a_dim1];
378 }
379 }
380
381 uisec = isec - isec % 4;
382 i4 = jj + ujsec - 1;
383 for (j = jj; j <= i4; j += 4)
384 {
385 i5 = ii + uisec - 1;
386 for (i = ii; i <= i5; i += 4)
387 {
388 f11 = c[i + j * c_dim1];
389 f21 = c[i + 1 + j * c_dim1];
390 f12 = c[i + (j + 1) * c_dim1];
391 f22 = c[i + 1 + (j + 1) * c_dim1];
392 f13 = c[i + (j + 2) * c_dim1];
393 f23 = c[i + 1 + (j + 2) * c_dim1];
394 f14 = c[i + (j + 3) * c_dim1];
395 f24 = c[i + 1 + (j + 3) * c_dim1];
396 f31 = c[i + 2 + j * c_dim1];
397 f41 = c[i + 3 + j * c_dim1];
398 f32 = c[i + 2 + (j + 1) * c_dim1];
399 f42 = c[i + 3 + (j + 1) * c_dim1];
400 f33 = c[i + 2 + (j + 2) * c_dim1];
401 f43 = c[i + 3 + (j + 2) * c_dim1];
402 f34 = c[i + 2 + (j + 3) * c_dim1];
403 f44 = c[i + 3 + (j + 3) * c_dim1];
404 i6 = ll + lsec - 1;
405 for (l = ll; l <= i6; ++l)
406 {
407 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
408 * b[l + j * b_dim1];
409 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
410 * b[l + j * b_dim1];
411 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
412 * b[l + (j + 1) * b_dim1];
413 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
414 * b[l + (j + 1) * b_dim1];
415 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
416 * b[l + (j + 2) * b_dim1];
417 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
418 * b[l + (j + 2) * b_dim1];
419 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
420 * b[l + (j + 3) * b_dim1];
421 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
422 * b[l + (j + 3) * b_dim1];
423 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
424 * b[l + j * b_dim1];
425 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
426 * b[l + j * b_dim1];
427 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
428 * b[l + (j + 1) * b_dim1];
429 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
430 * b[l + (j + 1) * b_dim1];
431 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
432 * b[l + (j + 2) * b_dim1];
433 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
434 * b[l + (j + 2) * b_dim1];
435 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
436 * b[l + (j + 3) * b_dim1];
437 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
438 * b[l + (j + 3) * b_dim1];
439 }
440 c[i + j * c_dim1] = f11;
441 c[i + 1 + j * c_dim1] = f21;
442 c[i + (j + 1) * c_dim1] = f12;
443 c[i + 1 + (j + 1) * c_dim1] = f22;
444 c[i + (j + 2) * c_dim1] = f13;
445 c[i + 1 + (j + 2) * c_dim1] = f23;
446 c[i + (j + 3) * c_dim1] = f14;
447 c[i + 1 + (j + 3) * c_dim1] = f24;
448 c[i + 2 + j * c_dim1] = f31;
449 c[i + 3 + j * c_dim1] = f41;
450 c[i + 2 + (j + 1) * c_dim1] = f32;
451 c[i + 3 + (j + 1) * c_dim1] = f42;
452 c[i + 2 + (j + 2) * c_dim1] = f33;
453 c[i + 3 + (j + 2) * c_dim1] = f43;
454 c[i + 2 + (j + 3) * c_dim1] = f34;
455 c[i + 3 + (j + 3) * c_dim1] = f44;
456 }
457 if (uisec < isec)
458 {
459 i5 = ii + isec - 1;
460 for (i = ii + uisec; i <= i5; ++i)
461 {
462 f11 = c[i + j * c_dim1];
463 f12 = c[i + (j + 1) * c_dim1];
464 f13 = c[i + (j + 2) * c_dim1];
465 f14 = c[i + (j + 3) * c_dim1];
466 i6 = ll + lsec - 1;
467 for (l = ll; l <= i6; ++l)
468 {
469 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
470 257] * b[l + j * b_dim1];
471 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
472 257] * b[l + (j + 1) * b_dim1];
473 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
474 257] * b[l + (j + 2) * b_dim1];
475 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + (j + 3) * b_dim1];
477 }
478 c[i + j * c_dim1] = f11;
479 c[i + (j + 1) * c_dim1] = f12;
480 c[i + (j + 2) * c_dim1] = f13;
481 c[i + (j + 3) * c_dim1] = f14;
482 }
483 }
484 }
485 if (ujsec < jsec)
486 {
487 i4 = jj + jsec - 1;
488 for (j = jj + ujsec; j <= i4; ++j)
489 {
490 i5 = ii + uisec - 1;
491 for (i = ii; i <= i5; i += 4)
492 {
493 f11 = c[i + j * c_dim1];
494 f21 = c[i + 1 + j * c_dim1];
495 f31 = c[i + 2 + j * c_dim1];
496 f41 = c[i + 3 + j * c_dim1];
497 i6 = ll + lsec - 1;
498 for (l = ll; l <= i6; ++l)
499 {
500 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
501 257] * b[l + j * b_dim1];
502 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
503 257] * b[l + j * b_dim1];
504 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
505 257] * b[l + j * b_dim1];
506 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
507 257] * b[l + j * b_dim1];
508 }
509 c[i + j * c_dim1] = f11;
510 c[i + 1 + j * c_dim1] = f21;
511 c[i + 2 + j * c_dim1] = f31;
512 c[i + 3 + j * c_dim1] = f41;
513 }
514 i5 = ii + isec - 1;
515 for (i = ii + uisec; i <= i5; ++i)
516 {
517 f11 = c[i + j * c_dim1];
518 i6 = ll + lsec - 1;
519 for (l = ll; l <= i6; ++l)
520 {
521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
522 257] * b[l + j * b_dim1];
523 }
524 c[i + j * c_dim1] = f11;
525 }
526 }
527 }
528 }
529 }
530 }
531 return;
532 }
533 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
534 {
535 if (GFC_DESCRIPTOR_RANK (a) != 1)
536 {
537 const GFC_INTEGER_1 *restrict abase_x;
538 const GFC_INTEGER_1 *restrict bbase_y;
539 GFC_INTEGER_1 *restrict dest_y;
540 GFC_INTEGER_1 s;
541
542 for (y = 0; y < ycount; y++)
543 {
544 bbase_y = &bbase[y*bystride];
545 dest_y = &dest[y*rystride];
546 for (x = 0; x < xcount; x++)
547 {
548 abase_x = &abase[x*axstride];
549 s = (GFC_INTEGER_1) 0;
550 for (n = 0; n < count; n++)
551 s += abase_x[n] * bbase_y[n];
552 dest_y[x] = s;
553 }
554 }
555 }
556 else
557 {
558 const GFC_INTEGER_1 *restrict bbase_y;
559 GFC_INTEGER_1 s;
560
561 for (y = 0; y < ycount; y++)
562 {
563 bbase_y = &bbase[y*bystride];
564 s = (GFC_INTEGER_1) 0;
565 for (n = 0; n < count; n++)
566 s += abase[n*axstride] * bbase_y[n];
567 dest[y*rystride] = s;
568 }
569 }
570 }
571 else if (axstride < aystride)
572 {
573 for (y = 0; y < ycount; y++)
574 for (x = 0; x < xcount; x++)
575 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
576
577 for (y = 0; y < ycount; y++)
578 for (n = 0; n < count; n++)
579 for (x = 0; x < xcount; x++)
580 /* dest[x,y] += a[x,n] * b[n,y] */
581 dest[x*rxstride + y*rystride] +=
582 abase[x*axstride + n*aystride] *
583 bbase[n*bxstride + y*bystride];
584 }
585 else if (GFC_DESCRIPTOR_RANK (a) == 1)
586 {
587 const GFC_INTEGER_1 *restrict bbase_y;
588 GFC_INTEGER_1 s;
589
590 for (y = 0; y < ycount; y++)
591 {
592 bbase_y = &bbase[y*bystride];
593 s = (GFC_INTEGER_1) 0;
594 for (n = 0; n < count; n++)
595 s += abase[n*axstride] * bbase_y[n*bxstride];
596 dest[y*rxstride] = s;
597 }
598 }
599 else
600 {
601 const GFC_INTEGER_1 *restrict abase_x;
602 const GFC_INTEGER_1 *restrict bbase_y;
603 GFC_INTEGER_1 *restrict dest_y;
604 GFC_INTEGER_1 s;
605
606 for (y = 0; y < ycount; y++)
607 {
608 bbase_y = &bbase[y*bystride];
609 dest_y = &dest[y*rystride];
610 for (x = 0; x < xcount; x++)
611 {
612 abase_x = &abase[x*axstride];
613 s = (GFC_INTEGER_1) 0;
614 for (n = 0; n < count; n++)
615 s += abase_x[n*aystride] * bbase_y[n*bxstride];
616 dest_y[x*rxstride] = s;
617 }
618 }
619 }
620 }
621 #undef POW3
622 #undef min
623 #undef max
624
625 #endif /* HAVE_AVX */
626
627 #ifdef HAVE_AVX2
628 static void
629 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
630 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
631 int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
632 static void
633 matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
634 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
635 int blas_limit, blas_call gemm)
636 {
637 const GFC_INTEGER_1 * restrict abase;
638 const GFC_INTEGER_1 * restrict bbase;
639 GFC_INTEGER_1 * restrict dest;
640
641 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
642 index_type x, y, n, count, xcount, ycount;
643
644 assert (GFC_DESCRIPTOR_RANK (a) == 2
645 || GFC_DESCRIPTOR_RANK (b) == 2);
646
647 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
648
649 Either A or B (but not both) can be rank 1:
650
651 o One-dimensional argument A is implicitly treated as a row matrix
652 dimensioned [1,count], so xcount=1.
653
654 o One-dimensional argument B is implicitly treated as a column matrix
655 dimensioned [count, 1], so ycount=1.
656 */
657
658 if (retarray->base_addr == NULL)
659 {
660 if (GFC_DESCRIPTOR_RANK (a) == 1)
661 {
662 GFC_DIMENSION_SET(retarray->dim[0], 0,
663 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
664 }
665 else if (GFC_DESCRIPTOR_RANK (b) == 1)
666 {
667 GFC_DIMENSION_SET(retarray->dim[0], 0,
668 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
669 }
670 else
671 {
672 GFC_DIMENSION_SET(retarray->dim[0], 0,
673 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
674
675 GFC_DIMENSION_SET(retarray->dim[1], 0,
676 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
677 GFC_DESCRIPTOR_EXTENT(retarray,0));
678 }
679
680 retarray->base_addr
681 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
682 retarray->offset = 0;
683 }
684 else if (unlikely (compile_options.bounds_check))
685 {
686 index_type ret_extent, arg_extent;
687
688 if (GFC_DESCRIPTOR_RANK (a) == 1)
689 {
690 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
691 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
692 if (arg_extent != ret_extent)
693 runtime_error ("Incorrect extent in return array in"
694 " MATMUL intrinsic: is %ld, should be %ld",
695 (long int) ret_extent, (long int) arg_extent);
696 }
697 else if (GFC_DESCRIPTOR_RANK (b) == 1)
698 {
699 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
700 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
701 if (arg_extent != ret_extent)
702 runtime_error ("Incorrect extent in return array in"
703 " MATMUL intrinsic: is %ld, should be %ld",
704 (long int) ret_extent, (long int) arg_extent);
705 }
706 else
707 {
708 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
709 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
710 if (arg_extent != ret_extent)
711 runtime_error ("Incorrect extent in return array in"
712 " MATMUL intrinsic for dimension 1:"
713 " is %ld, should be %ld",
714 (long int) ret_extent, (long int) arg_extent);
715
716 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
717 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
718 if (arg_extent != ret_extent)
719 runtime_error ("Incorrect extent in return array in"
720 " MATMUL intrinsic for dimension 2:"
721 " is %ld, should be %ld",
722 (long int) ret_extent, (long int) arg_extent);
723 }
724 }
725
726
727 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
728 {
729 /* One-dimensional result may be addressed in the code below
730 either as a row or a column matrix. We want both cases to
731 work. */
732 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
733 }
734 else
735 {
736 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
737 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
738 }
739
740
741 if (GFC_DESCRIPTOR_RANK (a) == 1)
742 {
743 /* Treat it as a a row matrix A[1,count]. */
744 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
745 aystride = 1;
746
747 xcount = 1;
748 count = GFC_DESCRIPTOR_EXTENT(a,0);
749 }
750 else
751 {
752 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
753 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
754
755 count = GFC_DESCRIPTOR_EXTENT(a,1);
756 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
757 }
758
759 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
760 {
761 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
762 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
763 }
764
765 if (GFC_DESCRIPTOR_RANK (b) == 1)
766 {
767 /* Treat it as a column matrix B[count,1] */
768 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
769
770 /* bystride should never be used for 1-dimensional b.
771 in case it is we want it to cause a segfault, rather than
772 an incorrect result. */
773 bystride = 0xDEADBEEF;
774 ycount = 1;
775 }
776 else
777 {
778 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
779 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
780 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
781 }
782
783 abase = a->base_addr;
784 bbase = b->base_addr;
785 dest = retarray->base_addr;
786
787 /* Now that everything is set up, we perform the multiplication
788 itself. */
789
790 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
791 #define min(a,b) ((a) <= (b) ? (a) : (b))
792 #define max(a,b) ((a) >= (b) ? (a) : (b))
793
794 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
795 && (bxstride == 1 || bystride == 1)
796 && (((float) xcount) * ((float) ycount) * ((float) count)
797 > POW3(blas_limit)))
798 {
799 const int m = xcount, n = ycount, k = count, ldc = rystride;
800 const GFC_INTEGER_1 one = 1, zero = 0;
801 const int lda = (axstride == 1) ? aystride : axstride,
802 ldb = (bxstride == 1) ? bystride : bxstride;
803
804 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
805 {
806 assert (gemm != NULL);
807 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
808 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
809 &ldc, 1, 1);
810 return;
811 }
812 }
813
814 if (rxstride == 1 && axstride == 1 && bxstride == 1)
815 {
816 /* This block of code implements a tuned matmul, derived from
817 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
818
819 Bo Kagstrom and Per Ling
820 Department of Computing Science
821 Umea University
822 S-901 87 Umea, Sweden
823
824 from netlib.org, translated to C, and modified for matmul.m4. */
825
826 const GFC_INTEGER_1 *a, *b;
827 GFC_INTEGER_1 *c;
828 const index_type m = xcount, n = ycount, k = count;
829
830 /* System generated locals */
831 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
832 i1, i2, i3, i4, i5, i6;
833
834 /* Local variables */
835 GFC_INTEGER_1 t1[65536], /* was [256][256] */
836 f11, f12, f21, f22, f31, f32, f41, f42,
837 f13, f14, f23, f24, f33, f34, f43, f44;
838 index_type i, j, l, ii, jj, ll;
839 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840
841 a = abase;
842 b = bbase;
843 c = retarray->base_addr;
844
845 /* Parameter adjustments */
846 c_dim1 = rystride;
847 c_offset = 1 + c_dim1;
848 c -= c_offset;
849 a_dim1 = aystride;
850 a_offset = 1 + a_dim1;
851 a -= a_offset;
852 b_dim1 = bystride;
853 b_offset = 1 + b_dim1;
854 b -= b_offset;
855
856 /* Early exit if possible */
857 if (m == 0 || n == 0 || k == 0)
858 return;
859
860 /* Empty c first. */
861 for (j=1; j<=n; j++)
862 for (i=1; i<=m; i++)
863 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
864
865 /* Start turning the crank. */
866 i1 = n;
867 for (jj = 1; jj <= i1; jj += 512)
868 {
869 /* Computing MIN */
870 i2 = 512;
871 i3 = n - jj + 1;
872 jsec = min(i2,i3);
873 ujsec = jsec - jsec % 4;
874 i2 = k;
875 for (ll = 1; ll <= i2; ll += 256)
876 {
877 /* Computing MIN */
878 i3 = 256;
879 i4 = k - ll + 1;
880 lsec = min(i3,i4);
881 ulsec = lsec - lsec % 2;
882
883 i3 = m;
884 for (ii = 1; ii <= i3; ii += 256)
885 {
886 /* Computing MIN */
887 i4 = 256;
888 i5 = m - ii + 1;
889 isec = min(i4,i5);
890 uisec = isec - isec % 2;
891 i4 = ll + ulsec - 1;
892 for (l = ll; l <= i4; l += 2)
893 {
894 i5 = ii + uisec - 1;
895 for (i = ii; i <= i5; i += 2)
896 {
897 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
898 a[i + l * a_dim1];
899 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
900 a[i + (l + 1) * a_dim1];
901 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
902 a[i + 1 + l * a_dim1];
903 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
904 a[i + 1 + (l + 1) * a_dim1];
905 }
906 if (uisec < isec)
907 {
908 t1[l - ll + 1 + (isec << 8) - 257] =
909 a[ii + isec - 1 + l * a_dim1];
910 t1[l - ll + 2 + (isec << 8) - 257] =
911 a[ii + isec - 1 + (l + 1) * a_dim1];
912 }
913 }
914 if (ulsec < lsec)
915 {
916 i4 = ii + isec - 1;
917 for (i = ii; i<= i4; ++i)
918 {
919 t1[lsec + ((i - ii + 1) << 8) - 257] =
920 a[i + (ll + lsec - 1) * a_dim1];
921 }
922 }
923
924 uisec = isec - isec % 4;
925 i4 = jj + ujsec - 1;
926 for (j = jj; j <= i4; j += 4)
927 {
928 i5 = ii + uisec - 1;
929 for (i = ii; i <= i5; i += 4)
930 {
931 f11 = c[i + j * c_dim1];
932 f21 = c[i + 1 + j * c_dim1];
933 f12 = c[i + (j + 1) * c_dim1];
934 f22 = c[i + 1 + (j + 1) * c_dim1];
935 f13 = c[i + (j + 2) * c_dim1];
936 f23 = c[i + 1 + (j + 2) * c_dim1];
937 f14 = c[i + (j + 3) * c_dim1];
938 f24 = c[i + 1 + (j + 3) * c_dim1];
939 f31 = c[i + 2 + j * c_dim1];
940 f41 = c[i + 3 + j * c_dim1];
941 f32 = c[i + 2 + (j + 1) * c_dim1];
942 f42 = c[i + 3 + (j + 1) * c_dim1];
943 f33 = c[i + 2 + (j + 2) * c_dim1];
944 f43 = c[i + 3 + (j + 2) * c_dim1];
945 f34 = c[i + 2 + (j + 3) * c_dim1];
946 f44 = c[i + 3 + (j + 3) * c_dim1];
947 i6 = ll + lsec - 1;
948 for (l = ll; l <= i6; ++l)
949 {
950 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
951 * b[l + j * b_dim1];
952 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
953 * b[l + j * b_dim1];
954 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
955 * b[l + (j + 1) * b_dim1];
956 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
957 * b[l + (j + 1) * b_dim1];
958 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
959 * b[l + (j + 2) * b_dim1];
960 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
961 * b[l + (j + 2) * b_dim1];
962 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + (j + 3) * b_dim1];
964 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + (j + 3) * b_dim1];
966 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
967 * b[l + j * b_dim1];
968 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
969 * b[l + j * b_dim1];
970 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
971 * b[l + (j + 1) * b_dim1];
972 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
973 * b[l + (j + 1) * b_dim1];
974 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
975 * b[l + (j + 2) * b_dim1];
976 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
977 * b[l + (j + 2) * b_dim1];
978 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + (j + 3) * b_dim1];
980 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + (j + 3) * b_dim1];
982 }
983 c[i + j * c_dim1] = f11;
984 c[i + 1 + j * c_dim1] = f21;
985 c[i + (j + 1) * c_dim1] = f12;
986 c[i + 1 + (j + 1) * c_dim1] = f22;
987 c[i + (j + 2) * c_dim1] = f13;
988 c[i + 1 + (j + 2) * c_dim1] = f23;
989 c[i + (j + 3) * c_dim1] = f14;
990 c[i + 1 + (j + 3) * c_dim1] = f24;
991 c[i + 2 + j * c_dim1] = f31;
992 c[i + 3 + j * c_dim1] = f41;
993 c[i + 2 + (j + 1) * c_dim1] = f32;
994 c[i + 3 + (j + 1) * c_dim1] = f42;
995 c[i + 2 + (j + 2) * c_dim1] = f33;
996 c[i + 3 + (j + 2) * c_dim1] = f43;
997 c[i + 2 + (j + 3) * c_dim1] = f34;
998 c[i + 3 + (j + 3) * c_dim1] = f44;
999 }
1000 if (uisec < isec)
1001 {
1002 i5 = ii + isec - 1;
1003 for (i = ii + uisec; i <= i5; ++i)
1004 {
1005 f11 = c[i + j * c_dim1];
1006 f12 = c[i + (j + 1) * c_dim1];
1007 f13 = c[i + (j + 2) * c_dim1];
1008 f14 = c[i + (j + 3) * c_dim1];
1009 i6 = ll + lsec - 1;
1010 for (l = ll; l <= i6; ++l)
1011 {
1012 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1013 257] * b[l + j * b_dim1];
1014 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1015 257] * b[l + (j + 1) * b_dim1];
1016 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1017 257] * b[l + (j + 2) * b_dim1];
1018 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1019 257] * b[l + (j + 3) * b_dim1];
1020 }
1021 c[i + j * c_dim1] = f11;
1022 c[i + (j + 1) * c_dim1] = f12;
1023 c[i + (j + 2) * c_dim1] = f13;
1024 c[i + (j + 3) * c_dim1] = f14;
1025 }
1026 }
1027 }
1028 if (ujsec < jsec)
1029 {
1030 i4 = jj + jsec - 1;
1031 for (j = jj + ujsec; j <= i4; ++j)
1032 {
1033 i5 = ii + uisec - 1;
1034 for (i = ii; i <= i5; i += 4)
1035 {
1036 f11 = c[i + j * c_dim1];
1037 f21 = c[i + 1 + j * c_dim1];
1038 f31 = c[i + 2 + j * c_dim1];
1039 f41 = c[i + 3 + j * c_dim1];
1040 i6 = ll + lsec - 1;
1041 for (l = ll; l <= i6; ++l)
1042 {
1043 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1044 257] * b[l + j * b_dim1];
1045 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1046 257] * b[l + j * b_dim1];
1047 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1048 257] * b[l + j * b_dim1];
1049 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1050 257] * b[l + j * b_dim1];
1051 }
1052 c[i + j * c_dim1] = f11;
1053 c[i + 1 + j * c_dim1] = f21;
1054 c[i + 2 + j * c_dim1] = f31;
1055 c[i + 3 + j * c_dim1] = f41;
1056 }
1057 i5 = ii + isec - 1;
1058 for (i = ii + uisec; i <= i5; ++i)
1059 {
1060 f11 = c[i + j * c_dim1];
1061 i6 = ll + lsec - 1;
1062 for (l = ll; l <= i6; ++l)
1063 {
1064 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 257] * b[l + j * b_dim1];
1066 }
1067 c[i + j * c_dim1] = f11;
1068 }
1069 }
1070 }
1071 }
1072 }
1073 }
1074 return;
1075 }
1076 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1077 {
1078 if (GFC_DESCRIPTOR_RANK (a) != 1)
1079 {
1080 const GFC_INTEGER_1 *restrict abase_x;
1081 const GFC_INTEGER_1 *restrict bbase_y;
1082 GFC_INTEGER_1 *restrict dest_y;
1083 GFC_INTEGER_1 s;
1084
1085 for (y = 0; y < ycount; y++)
1086 {
1087 bbase_y = &bbase[y*bystride];
1088 dest_y = &dest[y*rystride];
1089 for (x = 0; x < xcount; x++)
1090 {
1091 abase_x = &abase[x*axstride];
1092 s = (GFC_INTEGER_1) 0;
1093 for (n = 0; n < count; n++)
1094 s += abase_x[n] * bbase_y[n];
1095 dest_y[x] = s;
1096 }
1097 }
1098 }
1099 else
1100 {
1101 const GFC_INTEGER_1 *restrict bbase_y;
1102 GFC_INTEGER_1 s;
1103
1104 for (y = 0; y < ycount; y++)
1105 {
1106 bbase_y = &bbase[y*bystride];
1107 s = (GFC_INTEGER_1) 0;
1108 for (n = 0; n < count; n++)
1109 s += abase[n*axstride] * bbase_y[n];
1110 dest[y*rystride] = s;
1111 }
1112 }
1113 }
1114 else if (axstride < aystride)
1115 {
1116 for (y = 0; y < ycount; y++)
1117 for (x = 0; x < xcount; x++)
1118 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
1119
1120 for (y = 0; y < ycount; y++)
1121 for (n = 0; n < count; n++)
1122 for (x = 0; x < xcount; x++)
1123 /* dest[x,y] += a[x,n] * b[n,y] */
1124 dest[x*rxstride + y*rystride] +=
1125 abase[x*axstride + n*aystride] *
1126 bbase[n*bxstride + y*bystride];
1127 }
1128 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1129 {
1130 const GFC_INTEGER_1 *restrict bbase_y;
1131 GFC_INTEGER_1 s;
1132
1133 for (y = 0; y < ycount; y++)
1134 {
1135 bbase_y = &bbase[y*bystride];
1136 s = (GFC_INTEGER_1) 0;
1137 for (n = 0; n < count; n++)
1138 s += abase[n*axstride] * bbase_y[n*bxstride];
1139 dest[y*rxstride] = s;
1140 }
1141 }
1142 else
1143 {
1144 const GFC_INTEGER_1 *restrict abase_x;
1145 const GFC_INTEGER_1 *restrict bbase_y;
1146 GFC_INTEGER_1 *restrict dest_y;
1147 GFC_INTEGER_1 s;
1148
1149 for (y = 0; y < ycount; y++)
1150 {
1151 bbase_y = &bbase[y*bystride];
1152 dest_y = &dest[y*rystride];
1153 for (x = 0; x < xcount; x++)
1154 {
1155 abase_x = &abase[x*axstride];
1156 s = (GFC_INTEGER_1) 0;
1157 for (n = 0; n < count; n++)
1158 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1159 dest_y[x*rxstride] = s;
1160 }
1161 }
1162 }
1163 }
1164 #undef POW3
1165 #undef min
1166 #undef max
1167
1168 #endif /* HAVE_AVX2 */
1169
1170 #ifdef HAVE_AVX512F
1171 static void
1172 matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
1173 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
1174 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1175 static void
1176 matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
1177 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
1178 int blas_limit, blas_call gemm)
1179 {
1180 const GFC_INTEGER_1 * restrict abase;
1181 const GFC_INTEGER_1 * restrict bbase;
1182 GFC_INTEGER_1 * restrict dest;
1183
1184 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1185 index_type x, y, n, count, xcount, ycount;
1186
1187 assert (GFC_DESCRIPTOR_RANK (a) == 2
1188 || GFC_DESCRIPTOR_RANK (b) == 2);
1189
1190 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1191
1192 Either A or B (but not both) can be rank 1:
1193
1194 o One-dimensional argument A is implicitly treated as a row matrix
1195 dimensioned [1,count], so xcount=1.
1196
1197 o One-dimensional argument B is implicitly treated as a column matrix
1198 dimensioned [count, 1], so ycount=1.
1199 */
1200
1201 if (retarray->base_addr == NULL)
1202 {
1203 if (GFC_DESCRIPTOR_RANK (a) == 1)
1204 {
1205 GFC_DIMENSION_SET(retarray->dim[0], 0,
1206 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1207 }
1208 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1209 {
1210 GFC_DIMENSION_SET(retarray->dim[0], 0,
1211 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1212 }
1213 else
1214 {
1215 GFC_DIMENSION_SET(retarray->dim[0], 0,
1216 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1217
1218 GFC_DIMENSION_SET(retarray->dim[1], 0,
1219 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1220 GFC_DESCRIPTOR_EXTENT(retarray,0));
1221 }
1222
1223 retarray->base_addr
1224 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
1225 retarray->offset = 0;
1226 }
1227 else if (unlikely (compile_options.bounds_check))
1228 {
1229 index_type ret_extent, arg_extent;
1230
1231 if (GFC_DESCRIPTOR_RANK (a) == 1)
1232 {
1233 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1234 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1235 if (arg_extent != ret_extent)
1236 runtime_error ("Incorrect extent in return array in"
1237 " MATMUL intrinsic: is %ld, should be %ld",
1238 (long int) ret_extent, (long int) arg_extent);
1239 }
1240 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1241 {
1242 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1243 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1244 if (arg_extent != ret_extent)
1245 runtime_error ("Incorrect extent in return array in"
1246 " MATMUL intrinsic: is %ld, should be %ld",
1247 (long int) ret_extent, (long int) arg_extent);
1248 }
1249 else
1250 {
1251 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1252 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1253 if (arg_extent != ret_extent)
1254 runtime_error ("Incorrect extent in return array in"
1255 " MATMUL intrinsic for dimension 1:"
1256 " is %ld, should be %ld",
1257 (long int) ret_extent, (long int) arg_extent);
1258
1259 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1260 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1261 if (arg_extent != ret_extent)
1262 runtime_error ("Incorrect extent in return array in"
1263 " MATMUL intrinsic for dimension 2:"
1264 " is %ld, should be %ld",
1265 (long int) ret_extent, (long int) arg_extent);
1266 }
1267 }
1268
1269
1270 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1271 {
1272 /* One-dimensional result may be addressed in the code below
1273 either as a row or a column matrix. We want both cases to
1274 work. */
1275 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1276 }
1277 else
1278 {
1279 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1280 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1281 }
1282
1283
1284 if (GFC_DESCRIPTOR_RANK (a) == 1)
1285 {
1286 /* Treat it as a a row matrix A[1,count]. */
1287 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1288 aystride = 1;
1289
1290 xcount = 1;
1291 count = GFC_DESCRIPTOR_EXTENT(a,0);
1292 }
1293 else
1294 {
1295 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1296 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1297
1298 count = GFC_DESCRIPTOR_EXTENT(a,1);
1299 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1300 }
1301
1302 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1303 {
1304 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1305 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1306 }
1307
1308 if (GFC_DESCRIPTOR_RANK (b) == 1)
1309 {
1310 /* Treat it as a column matrix B[count,1] */
1311 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1312
1313 /* bystride should never be used for 1-dimensional b.
1314 in case it is we want it to cause a segfault, rather than
1315 an incorrect result. */
1316 bystride = 0xDEADBEEF;
1317 ycount = 1;
1318 }
1319 else
1320 {
1321 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1322 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1323 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1324 }
1325
1326 abase = a->base_addr;
1327 bbase = b->base_addr;
1328 dest = retarray->base_addr;
1329
1330 /* Now that everything is set up, we perform the multiplication
1331 itself. */
1332
1333 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1334 #define min(a,b) ((a) <= (b) ? (a) : (b))
1335 #define max(a,b) ((a) >= (b) ? (a) : (b))
1336
1337 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1338 && (bxstride == 1 || bystride == 1)
1339 && (((float) xcount) * ((float) ycount) * ((float) count)
1340 > POW3(blas_limit)))
1341 {
1342 const int m = xcount, n = ycount, k = count, ldc = rystride;
1343 const GFC_INTEGER_1 one = 1, zero = 0;
1344 const int lda = (axstride == 1) ? aystride : axstride,
1345 ldb = (bxstride == 1) ? bystride : bxstride;
1346
1347 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1348 {
1349 assert (gemm != NULL);
1350 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1351 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1352 &ldc, 1, 1);
1353 return;
1354 }
1355 }
1356
1357 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1358 {
1359 /* This block of code implements a tuned matmul, derived from
1360 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1361
1362 Bo Kagstrom and Per Ling
1363 Department of Computing Science
1364 Umea University
1365 S-901 87 Umea, Sweden
1366
1367 from netlib.org, translated to C, and modified for matmul.m4. */
1368
1369 const GFC_INTEGER_1 *a, *b;
1370 GFC_INTEGER_1 *c;
1371 const index_type m = xcount, n = ycount, k = count;
1372
1373 /* System generated locals */
1374 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1375 i1, i2, i3, i4, i5, i6;
1376
1377 /* Local variables */
1378 GFC_INTEGER_1 t1[65536], /* was [256][256] */
1379 f11, f12, f21, f22, f31, f32, f41, f42,
1380 f13, f14, f23, f24, f33, f34, f43, f44;
1381 index_type i, j, l, ii, jj, ll;
1382 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1383
1384 a = abase;
1385 b = bbase;
1386 c = retarray->base_addr;
1387
1388 /* Parameter adjustments */
1389 c_dim1 = rystride;
1390 c_offset = 1 + c_dim1;
1391 c -= c_offset;
1392 a_dim1 = aystride;
1393 a_offset = 1 + a_dim1;
1394 a -= a_offset;
1395 b_dim1 = bystride;
1396 b_offset = 1 + b_dim1;
1397 b -= b_offset;
1398
1399 /* Early exit if possible */
1400 if (m == 0 || n == 0 || k == 0)
1401 return;
1402
1403 /* Empty c first. */
1404 for (j=1; j<=n; j++)
1405 for (i=1; i<=m; i++)
1406 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
1407
1408 /* Start turning the crank. */
1409 i1 = n;
1410 for (jj = 1; jj <= i1; jj += 512)
1411 {
1412 /* Computing MIN */
1413 i2 = 512;
1414 i3 = n - jj + 1;
1415 jsec = min(i2,i3);
1416 ujsec = jsec - jsec % 4;
1417 i2 = k;
1418 for (ll = 1; ll <= i2; ll += 256)
1419 {
1420 /* Computing MIN */
1421 i3 = 256;
1422 i4 = k - ll + 1;
1423 lsec = min(i3,i4);
1424 ulsec = lsec - lsec % 2;
1425
1426 i3 = m;
1427 for (ii = 1; ii <= i3; ii += 256)
1428 {
1429 /* Computing MIN */
1430 i4 = 256;
1431 i5 = m - ii + 1;
1432 isec = min(i4,i5);
1433 uisec = isec - isec % 2;
1434 i4 = ll + ulsec - 1;
1435 for (l = ll; l <= i4; l += 2)
1436 {
1437 i5 = ii + uisec - 1;
1438 for (i = ii; i <= i5; i += 2)
1439 {
1440 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1441 a[i + l * a_dim1];
1442 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1443 a[i + (l + 1) * a_dim1];
1444 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1445 a[i + 1 + l * a_dim1];
1446 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1447 a[i + 1 + (l + 1) * a_dim1];
1448 }
1449 if (uisec < isec)
1450 {
1451 t1[l - ll + 1 + (isec << 8) - 257] =
1452 a[ii + isec - 1 + l * a_dim1];
1453 t1[l - ll + 2 + (isec << 8) - 257] =
1454 a[ii + isec - 1 + (l + 1) * a_dim1];
1455 }
1456 }
1457 if (ulsec < lsec)
1458 {
1459 i4 = ii + isec - 1;
1460 for (i = ii; i<= i4; ++i)
1461 {
1462 t1[lsec + ((i - ii + 1) << 8) - 257] =
1463 a[i + (ll + lsec - 1) * a_dim1];
1464 }
1465 }
1466
1467 uisec = isec - isec % 4;
1468 i4 = jj + ujsec - 1;
1469 for (j = jj; j <= i4; j += 4)
1470 {
1471 i5 = ii + uisec - 1;
1472 for (i = ii; i <= i5; i += 4)
1473 {
1474 f11 = c[i + j * c_dim1];
1475 f21 = c[i + 1 + j * c_dim1];
1476 f12 = c[i + (j + 1) * c_dim1];
1477 f22 = c[i + 1 + (j + 1) * c_dim1];
1478 f13 = c[i + (j + 2) * c_dim1];
1479 f23 = c[i + 1 + (j + 2) * c_dim1];
1480 f14 = c[i + (j + 3) * c_dim1];
1481 f24 = c[i + 1 + (j + 3) * c_dim1];
1482 f31 = c[i + 2 + j * c_dim1];
1483 f41 = c[i + 3 + j * c_dim1];
1484 f32 = c[i + 2 + (j + 1) * c_dim1];
1485 f42 = c[i + 3 + (j + 1) * c_dim1];
1486 f33 = c[i + 2 + (j + 2) * c_dim1];
1487 f43 = c[i + 3 + (j + 2) * c_dim1];
1488 f34 = c[i + 2 + (j + 3) * c_dim1];
1489 f44 = c[i + 3 + (j + 3) * c_dim1];
1490 i6 = ll + lsec - 1;
1491 for (l = ll; l <= i6; ++l)
1492 {
1493 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1494 * b[l + j * b_dim1];
1495 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1496 * b[l + j * b_dim1];
1497 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1498 * b[l + (j + 1) * b_dim1];
1499 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1500 * b[l + (j + 1) * b_dim1];
1501 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1502 * b[l + (j + 2) * b_dim1];
1503 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1504 * b[l + (j + 2) * b_dim1];
1505 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1506 * b[l + (j + 3) * b_dim1];
1507 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1508 * b[l + (j + 3) * b_dim1];
1509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1510 * b[l + j * b_dim1];
1511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1512 * b[l + j * b_dim1];
1513 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1514 * b[l + (j + 1) * b_dim1];
1515 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1516 * b[l + (j + 1) * b_dim1];
1517 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1518 * b[l + (j + 2) * b_dim1];
1519 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1520 * b[l + (j + 2) * b_dim1];
1521 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1522 * b[l + (j + 3) * b_dim1];
1523 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1524 * b[l + (j + 3) * b_dim1];
1525 }
1526 c[i + j * c_dim1] = f11;
1527 c[i + 1 + j * c_dim1] = f21;
1528 c[i + (j + 1) * c_dim1] = f12;
1529 c[i + 1 + (j + 1) * c_dim1] = f22;
1530 c[i + (j + 2) * c_dim1] = f13;
1531 c[i + 1 + (j + 2) * c_dim1] = f23;
1532 c[i + (j + 3) * c_dim1] = f14;
1533 c[i + 1 + (j + 3) * c_dim1] = f24;
1534 c[i + 2 + j * c_dim1] = f31;
1535 c[i + 3 + j * c_dim1] = f41;
1536 c[i + 2 + (j + 1) * c_dim1] = f32;
1537 c[i + 3 + (j + 1) * c_dim1] = f42;
1538 c[i + 2 + (j + 2) * c_dim1] = f33;
1539 c[i + 3 + (j + 2) * c_dim1] = f43;
1540 c[i + 2 + (j + 3) * c_dim1] = f34;
1541 c[i + 3 + (j + 3) * c_dim1] = f44;
1542 }
1543 if (uisec < isec)
1544 {
1545 i5 = ii + isec - 1;
1546 for (i = ii + uisec; i <= i5; ++i)
1547 {
1548 f11 = c[i + j * c_dim1];
1549 f12 = c[i + (j + 1) * c_dim1];
1550 f13 = c[i + (j + 2) * c_dim1];
1551 f14 = c[i + (j + 3) * c_dim1];
1552 i6 = ll + lsec - 1;
1553 for (l = ll; l <= i6; ++l)
1554 {
1555 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1556 257] * b[l + j * b_dim1];
1557 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1558 257] * b[l + (j + 1) * b_dim1];
1559 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1560 257] * b[l + (j + 2) * b_dim1];
1561 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1562 257] * b[l + (j + 3) * b_dim1];
1563 }
1564 c[i + j * c_dim1] = f11;
1565 c[i + (j + 1) * c_dim1] = f12;
1566 c[i + (j + 2) * c_dim1] = f13;
1567 c[i + (j + 3) * c_dim1] = f14;
1568 }
1569 }
1570 }
1571 if (ujsec < jsec)
1572 {
1573 i4 = jj + jsec - 1;
1574 for (j = jj + ujsec; j <= i4; ++j)
1575 {
1576 i5 = ii + uisec - 1;
1577 for (i = ii; i <= i5; i += 4)
1578 {
1579 f11 = c[i + j * c_dim1];
1580 f21 = c[i + 1 + j * c_dim1];
1581 f31 = c[i + 2 + j * c_dim1];
1582 f41 = c[i + 3 + j * c_dim1];
1583 i6 = ll + lsec - 1;
1584 for (l = ll; l <= i6; ++l)
1585 {
1586 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1587 257] * b[l + j * b_dim1];
1588 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1589 257] * b[l + j * b_dim1];
1590 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1591 257] * b[l + j * b_dim1];
1592 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1593 257] * b[l + j * b_dim1];
1594 }
1595 c[i + j * c_dim1] = f11;
1596 c[i + 1 + j * c_dim1] = f21;
1597 c[i + 2 + j * c_dim1] = f31;
1598 c[i + 3 + j * c_dim1] = f41;
1599 }
1600 i5 = ii + isec - 1;
1601 for (i = ii + uisec; i <= i5; ++i)
1602 {
1603 f11 = c[i + j * c_dim1];
1604 i6 = ll + lsec - 1;
1605 for (l = ll; l <= i6; ++l)
1606 {
1607 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1608 257] * b[l + j * b_dim1];
1609 }
1610 c[i + j * c_dim1] = f11;
1611 }
1612 }
1613 }
1614 }
1615 }
1616 }
1617 return;
1618 }
1619 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1620 {
1621 if (GFC_DESCRIPTOR_RANK (a) != 1)
1622 {
1623 const GFC_INTEGER_1 *restrict abase_x;
1624 const GFC_INTEGER_1 *restrict bbase_y;
1625 GFC_INTEGER_1 *restrict dest_y;
1626 GFC_INTEGER_1 s;
1627
1628 for (y = 0; y < ycount; y++)
1629 {
1630 bbase_y = &bbase[y*bystride];
1631 dest_y = &dest[y*rystride];
1632 for (x = 0; x < xcount; x++)
1633 {
1634 abase_x = &abase[x*axstride];
1635 s = (GFC_INTEGER_1) 0;
1636 for (n = 0; n < count; n++)
1637 s += abase_x[n] * bbase_y[n];
1638 dest_y[x] = s;
1639 }
1640 }
1641 }
1642 else
1643 {
1644 const GFC_INTEGER_1 *restrict bbase_y;
1645 GFC_INTEGER_1 s;
1646
1647 for (y = 0; y < ycount; y++)
1648 {
1649 bbase_y = &bbase[y*bystride];
1650 s = (GFC_INTEGER_1) 0;
1651 for (n = 0; n < count; n++)
1652 s += abase[n*axstride] * bbase_y[n];
1653 dest[y*rystride] = s;
1654 }
1655 }
1656 }
1657 else if (axstride < aystride)
1658 {
1659 for (y = 0; y < ycount; y++)
1660 for (x = 0; x < xcount; x++)
1661 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
1662
1663 for (y = 0; y < ycount; y++)
1664 for (n = 0; n < count; n++)
1665 for (x = 0; x < xcount; x++)
1666 /* dest[x,y] += a[x,n] * b[n,y] */
1667 dest[x*rxstride + y*rystride] +=
1668 abase[x*axstride + n*aystride] *
1669 bbase[n*bxstride + y*bystride];
1670 }
1671 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1672 {
1673 const GFC_INTEGER_1 *restrict bbase_y;
1674 GFC_INTEGER_1 s;
1675
1676 for (y = 0; y < ycount; y++)
1677 {
1678 bbase_y = &bbase[y*bystride];
1679 s = (GFC_INTEGER_1) 0;
1680 for (n = 0; n < count; n++)
1681 s += abase[n*axstride] * bbase_y[n*bxstride];
1682 dest[y*rxstride] = s;
1683 }
1684 }
1685 else
1686 {
1687 const GFC_INTEGER_1 *restrict abase_x;
1688 const GFC_INTEGER_1 *restrict bbase_y;
1689 GFC_INTEGER_1 *restrict dest_y;
1690 GFC_INTEGER_1 s;
1691
1692 for (y = 0; y < ycount; y++)
1693 {
1694 bbase_y = &bbase[y*bystride];
1695 dest_y = &dest[y*rystride];
1696 for (x = 0; x < xcount; x++)
1697 {
1698 abase_x = &abase[x*axstride];
1699 s = (GFC_INTEGER_1) 0;
1700 for (n = 0; n < count; n++)
1701 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1702 dest_y[x*rxstride] = s;
1703 }
1704 }
1705 }
1706 }
1707 #undef POW3
1708 #undef min
1709 #undef max
1710
1711 #endif /* HAVE_AVX512F */
1712
1713 /* Function to fall back to if there is no special processor-specific version. */
1714 static void
1715 matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
1716 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
1717 int blas_limit, blas_call gemm)
1718 {
1719 const GFC_INTEGER_1 * restrict abase;
1720 const GFC_INTEGER_1 * restrict bbase;
1721 GFC_INTEGER_1 * restrict dest;
1722
1723 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1724 index_type x, y, n, count, xcount, ycount;
1725
1726 assert (GFC_DESCRIPTOR_RANK (a) == 2
1727 || GFC_DESCRIPTOR_RANK (b) == 2);
1728
1729 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1730
1731 Either A or B (but not both) can be rank 1:
1732
1733 o One-dimensional argument A is implicitly treated as a row matrix
1734 dimensioned [1,count], so xcount=1.
1735
1736 o One-dimensional argument B is implicitly treated as a column matrix
1737 dimensioned [count, 1], so ycount=1.
1738 */
1739
1740 if (retarray->base_addr == NULL)
1741 {
1742 if (GFC_DESCRIPTOR_RANK (a) == 1)
1743 {
1744 GFC_DIMENSION_SET(retarray->dim[0], 0,
1745 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1746 }
1747 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1748 {
1749 GFC_DIMENSION_SET(retarray->dim[0], 0,
1750 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1751 }
1752 else
1753 {
1754 GFC_DIMENSION_SET(retarray->dim[0], 0,
1755 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1756
1757 GFC_DIMENSION_SET(retarray->dim[1], 0,
1758 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1759 GFC_DESCRIPTOR_EXTENT(retarray,0));
1760 }
1761
1762 retarray->base_addr
1763 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
1764 retarray->offset = 0;
1765 }
1766 else if (unlikely (compile_options.bounds_check))
1767 {
1768 index_type ret_extent, arg_extent;
1769
1770 if (GFC_DESCRIPTOR_RANK (a) == 1)
1771 {
1772 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1773 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1774 if (arg_extent != ret_extent)
1775 runtime_error ("Incorrect extent in return array in"
1776 " MATMUL intrinsic: is %ld, should be %ld",
1777 (long int) ret_extent, (long int) arg_extent);
1778 }
1779 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1780 {
1781 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1782 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1783 if (arg_extent != ret_extent)
1784 runtime_error ("Incorrect extent in return array in"
1785 " MATMUL intrinsic: is %ld, should be %ld",
1786 (long int) ret_extent, (long int) arg_extent);
1787 }
1788 else
1789 {
1790 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1791 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1792 if (arg_extent != ret_extent)
1793 runtime_error ("Incorrect extent in return array in"
1794 " MATMUL intrinsic for dimension 1:"
1795 " is %ld, should be %ld",
1796 (long int) ret_extent, (long int) arg_extent);
1797
1798 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1799 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1800 if (arg_extent != ret_extent)
1801 runtime_error ("Incorrect extent in return array in"
1802 " MATMUL intrinsic for dimension 2:"
1803 " is %ld, should be %ld",
1804 (long int) ret_extent, (long int) arg_extent);
1805 }
1806 }
1807
1808
1809 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1810 {
1811 /* One-dimensional result may be addressed in the code below
1812 either as a row or a column matrix. We want both cases to
1813 work. */
1814 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1815 }
1816 else
1817 {
1818 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1819 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1820 }
1821
1822
1823 if (GFC_DESCRIPTOR_RANK (a) == 1)
1824 {
1825 /* Treat it as a a row matrix A[1,count]. */
1826 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1827 aystride = 1;
1828
1829 xcount = 1;
1830 count = GFC_DESCRIPTOR_EXTENT(a,0);
1831 }
1832 else
1833 {
1834 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1835 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1836
1837 count = GFC_DESCRIPTOR_EXTENT(a,1);
1838 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1839 }
1840
1841 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1842 {
1843 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1844 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1845 }
1846
1847 if (GFC_DESCRIPTOR_RANK (b) == 1)
1848 {
1849 /* Treat it as a column matrix B[count,1] */
1850 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1851
1852 /* bystride should never be used for 1-dimensional b.
1853 in case it is we want it to cause a segfault, rather than
1854 an incorrect result. */
1855 bystride = 0xDEADBEEF;
1856 ycount = 1;
1857 }
1858 else
1859 {
1860 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1861 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1862 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1863 }
1864
1865 abase = a->base_addr;
1866 bbase = b->base_addr;
1867 dest = retarray->base_addr;
1868
1869 /* Now that everything is set up, we perform the multiplication
1870 itself. */
1871
1872 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1873 #define min(a,b) ((a) <= (b) ? (a) : (b))
1874 #define max(a,b) ((a) >= (b) ? (a) : (b))
1875
1876 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1877 && (bxstride == 1 || bystride == 1)
1878 && (((float) xcount) * ((float) ycount) * ((float) count)
1879 > POW3(blas_limit)))
1880 {
1881 const int m = xcount, n = ycount, k = count, ldc = rystride;
1882 const GFC_INTEGER_1 one = 1, zero = 0;
1883 const int lda = (axstride == 1) ? aystride : axstride,
1884 ldb = (bxstride == 1) ? bystride : bxstride;
1885
1886 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1887 {
1888 assert (gemm != NULL);
1889 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1890 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1891 &ldc, 1, 1);
1892 return;
1893 }
1894 }
1895
1896 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1897 {
1898 /* This block of code implements a tuned matmul, derived from
1899 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1900
1901 Bo Kagstrom and Per Ling
1902 Department of Computing Science
1903 Umea University
1904 S-901 87 Umea, Sweden
1905
1906 from netlib.org, translated to C, and modified for matmul.m4. */
1907
1908 const GFC_INTEGER_1 *a, *b;
1909 GFC_INTEGER_1 *c;
1910 const index_type m = xcount, n = ycount, k = count;
1911
1912 /* System generated locals */
1913 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1914 i1, i2, i3, i4, i5, i6;
1915
1916 /* Local variables */
1917 GFC_INTEGER_1 t1[65536], /* was [256][256] */
1918 f11, f12, f21, f22, f31, f32, f41, f42,
1919 f13, f14, f23, f24, f33, f34, f43, f44;
1920 index_type i, j, l, ii, jj, ll;
1921 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1922
1923 a = abase;
1924 b = bbase;
1925 c = retarray->base_addr;
1926
1927 /* Parameter adjustments */
1928 c_dim1 = rystride;
1929 c_offset = 1 + c_dim1;
1930 c -= c_offset;
1931 a_dim1 = aystride;
1932 a_offset = 1 + a_dim1;
1933 a -= a_offset;
1934 b_dim1 = bystride;
1935 b_offset = 1 + b_dim1;
1936 b -= b_offset;
1937
1938 /* Early exit if possible */
1939 if (m == 0 || n == 0 || k == 0)
1940 return;
1941
1942 /* Empty c first. */
1943 for (j=1; j<=n; j++)
1944 for (i=1; i<=m; i++)
1945 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
1946
1947 /* Start turning the crank. */
1948 i1 = n;
1949 for (jj = 1; jj <= i1; jj += 512)
1950 {
1951 /* Computing MIN */
1952 i2 = 512;
1953 i3 = n - jj + 1;
1954 jsec = min(i2,i3);
1955 ujsec = jsec - jsec % 4;
1956 i2 = k;
1957 for (ll = 1; ll <= i2; ll += 256)
1958 {
1959 /* Computing MIN */
1960 i3 = 256;
1961 i4 = k - ll + 1;
1962 lsec = min(i3,i4);
1963 ulsec = lsec - lsec % 2;
1964
1965 i3 = m;
1966 for (ii = 1; ii <= i3; ii += 256)
1967 {
1968 /* Computing MIN */
1969 i4 = 256;
1970 i5 = m - ii + 1;
1971 isec = min(i4,i5);
1972 uisec = isec - isec % 2;
1973 i4 = ll + ulsec - 1;
1974 for (l = ll; l <= i4; l += 2)
1975 {
1976 i5 = ii + uisec - 1;
1977 for (i = ii; i <= i5; i += 2)
1978 {
1979 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1980 a[i + l * a_dim1];
1981 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1982 a[i + (l + 1) * a_dim1];
1983 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1984 a[i + 1 + l * a_dim1];
1985 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1986 a[i + 1 + (l + 1) * a_dim1];
1987 }
1988 if (uisec < isec)
1989 {
1990 t1[l - ll + 1 + (isec << 8) - 257] =
1991 a[ii + isec - 1 + l * a_dim1];
1992 t1[l - ll + 2 + (isec << 8) - 257] =
1993 a[ii + isec - 1 + (l + 1) * a_dim1];
1994 }
1995 }
1996 if (ulsec < lsec)
1997 {
1998 i4 = ii + isec - 1;
1999 for (i = ii; i<= i4; ++i)
2000 {
2001 t1[lsec + ((i - ii + 1) << 8) - 257] =
2002 a[i + (ll + lsec - 1) * a_dim1];
2003 }
2004 }
2005
2006 uisec = isec - isec % 4;
2007 i4 = jj + ujsec - 1;
2008 for (j = jj; j <= i4; j += 4)
2009 {
2010 i5 = ii + uisec - 1;
2011 for (i = ii; i <= i5; i += 4)
2012 {
2013 f11 = c[i + j * c_dim1];
2014 f21 = c[i + 1 + j * c_dim1];
2015 f12 = c[i + (j + 1) * c_dim1];
2016 f22 = c[i + 1 + (j + 1) * c_dim1];
2017 f13 = c[i + (j + 2) * c_dim1];
2018 f23 = c[i + 1 + (j + 2) * c_dim1];
2019 f14 = c[i + (j + 3) * c_dim1];
2020 f24 = c[i + 1 + (j + 3) * c_dim1];
2021 f31 = c[i + 2 + j * c_dim1];
2022 f41 = c[i + 3 + j * c_dim1];
2023 f32 = c[i + 2 + (j + 1) * c_dim1];
2024 f42 = c[i + 3 + (j + 1) * c_dim1];
2025 f33 = c[i + 2 + (j + 2) * c_dim1];
2026 f43 = c[i + 3 + (j + 2) * c_dim1];
2027 f34 = c[i + 2 + (j + 3) * c_dim1];
2028 f44 = c[i + 3 + (j + 3) * c_dim1];
2029 i6 = ll + lsec - 1;
2030 for (l = ll; l <= i6; ++l)
2031 {
2032 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2033 * b[l + j * b_dim1];
2034 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2035 * b[l + j * b_dim1];
2036 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2037 * b[l + (j + 1) * b_dim1];
2038 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2039 * b[l + (j + 1) * b_dim1];
2040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2041 * b[l + (j + 2) * b_dim1];
2042 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2043 * b[l + (j + 2) * b_dim1];
2044 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2045 * b[l + (j + 3) * b_dim1];
2046 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2047 * b[l + (j + 3) * b_dim1];
2048 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2049 * b[l + j * b_dim1];
2050 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2051 * b[l + j * b_dim1];
2052 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2053 * b[l + (j + 1) * b_dim1];
2054 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2055 * b[l + (j + 1) * b_dim1];
2056 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2057 * b[l + (j + 2) * b_dim1];
2058 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2059 * b[l + (j + 2) * b_dim1];
2060 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2061 * b[l + (j + 3) * b_dim1];
2062 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2063 * b[l + (j + 3) * b_dim1];
2064 }
2065 c[i + j * c_dim1] = f11;
2066 c[i + 1 + j * c_dim1] = f21;
2067 c[i + (j + 1) * c_dim1] = f12;
2068 c[i + 1 + (j + 1) * c_dim1] = f22;
2069 c[i + (j + 2) * c_dim1] = f13;
2070 c[i + 1 + (j + 2) * c_dim1] = f23;
2071 c[i + (j + 3) * c_dim1] = f14;
2072 c[i + 1 + (j + 3) * c_dim1] = f24;
2073 c[i + 2 + j * c_dim1] = f31;
2074 c[i + 3 + j * c_dim1] = f41;
2075 c[i + 2 + (j + 1) * c_dim1] = f32;
2076 c[i + 3 + (j + 1) * c_dim1] = f42;
2077 c[i + 2 + (j + 2) * c_dim1] = f33;
2078 c[i + 3 + (j + 2) * c_dim1] = f43;
2079 c[i + 2 + (j + 3) * c_dim1] = f34;
2080 c[i + 3 + (j + 3) * c_dim1] = f44;
2081 }
2082 if (uisec < isec)
2083 {
2084 i5 = ii + isec - 1;
2085 for (i = ii + uisec; i <= i5; ++i)
2086 {
2087 f11 = c[i + j * c_dim1];
2088 f12 = c[i + (j + 1) * c_dim1];
2089 f13 = c[i + (j + 2) * c_dim1];
2090 f14 = c[i + (j + 3) * c_dim1];
2091 i6 = ll + lsec - 1;
2092 for (l = ll; l <= i6; ++l)
2093 {
2094 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2095 257] * b[l + j * b_dim1];
2096 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2097 257] * b[l + (j + 1) * b_dim1];
2098 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2099 257] * b[l + (j + 2) * b_dim1];
2100 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2101 257] * b[l + (j + 3) * b_dim1];
2102 }
2103 c[i + j * c_dim1] = f11;
2104 c[i + (j + 1) * c_dim1] = f12;
2105 c[i + (j + 2) * c_dim1] = f13;
2106 c[i + (j + 3) * c_dim1] = f14;
2107 }
2108 }
2109 }
2110 if (ujsec < jsec)
2111 {
2112 i4 = jj + jsec - 1;
2113 for (j = jj + ujsec; j <= i4; ++j)
2114 {
2115 i5 = ii + uisec - 1;
2116 for (i = ii; i <= i5; i += 4)
2117 {
2118 f11 = c[i + j * c_dim1];
2119 f21 = c[i + 1 + j * c_dim1];
2120 f31 = c[i + 2 + j * c_dim1];
2121 f41 = c[i + 3 + j * c_dim1];
2122 i6 = ll + lsec - 1;
2123 for (l = ll; l <= i6; ++l)
2124 {
2125 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2126 257] * b[l + j * b_dim1];
2127 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2128 257] * b[l + j * b_dim1];
2129 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2130 257] * b[l + j * b_dim1];
2131 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2132 257] * b[l + j * b_dim1];
2133 }
2134 c[i + j * c_dim1] = f11;
2135 c[i + 1 + j * c_dim1] = f21;
2136 c[i + 2 + j * c_dim1] = f31;
2137 c[i + 3 + j * c_dim1] = f41;
2138 }
2139 i5 = ii + isec - 1;
2140 for (i = ii + uisec; i <= i5; ++i)
2141 {
2142 f11 = c[i + j * c_dim1];
2143 i6 = ll + lsec - 1;
2144 for (l = ll; l <= i6; ++l)
2145 {
2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2147 257] * b[l + j * b_dim1];
2148 }
2149 c[i + j * c_dim1] = f11;
2150 }
2151 }
2152 }
2153 }
2154 }
2155 }
2156 return;
2157 }
2158 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2159 {
2160 if (GFC_DESCRIPTOR_RANK (a) != 1)
2161 {
2162 const GFC_INTEGER_1 *restrict abase_x;
2163 const GFC_INTEGER_1 *restrict bbase_y;
2164 GFC_INTEGER_1 *restrict dest_y;
2165 GFC_INTEGER_1 s;
2166
2167 for (y = 0; y < ycount; y++)
2168 {
2169 bbase_y = &bbase[y*bystride];
2170 dest_y = &dest[y*rystride];
2171 for (x = 0; x < xcount; x++)
2172 {
2173 abase_x = &abase[x*axstride];
2174 s = (GFC_INTEGER_1) 0;
2175 for (n = 0; n < count; n++)
2176 s += abase_x[n] * bbase_y[n];
2177 dest_y[x] = s;
2178 }
2179 }
2180 }
2181 else
2182 {
2183 const GFC_INTEGER_1 *restrict bbase_y;
2184 GFC_INTEGER_1 s;
2185
2186 for (y = 0; y < ycount; y++)
2187 {
2188 bbase_y = &bbase[y*bystride];
2189 s = (GFC_INTEGER_1) 0;
2190 for (n = 0; n < count; n++)
2191 s += abase[n*axstride] * bbase_y[n];
2192 dest[y*rystride] = s;
2193 }
2194 }
2195 }
2196 else if (axstride < aystride)
2197 {
2198 for (y = 0; y < ycount; y++)
2199 for (x = 0; x < xcount; x++)
2200 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
2201
2202 for (y = 0; y < ycount; y++)
2203 for (n = 0; n < count; n++)
2204 for (x = 0; x < xcount; x++)
2205 /* dest[x,y] += a[x,n] * b[n,y] */
2206 dest[x*rxstride + y*rystride] +=
2207 abase[x*axstride + n*aystride] *
2208 bbase[n*bxstride + y*bystride];
2209 }
2210 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2211 {
2212 const GFC_INTEGER_1 *restrict bbase_y;
2213 GFC_INTEGER_1 s;
2214
2215 for (y = 0; y < ycount; y++)
2216 {
2217 bbase_y = &bbase[y*bystride];
2218 s = (GFC_INTEGER_1) 0;
2219 for (n = 0; n < count; n++)
2220 s += abase[n*axstride] * bbase_y[n*bxstride];
2221 dest[y*rxstride] = s;
2222 }
2223 }
2224 else
2225 {
2226 const GFC_INTEGER_1 *restrict abase_x;
2227 const GFC_INTEGER_1 *restrict bbase_y;
2228 GFC_INTEGER_1 *restrict dest_y;
2229 GFC_INTEGER_1 s;
2230
2231 for (y = 0; y < ycount; y++)
2232 {
2233 bbase_y = &bbase[y*bystride];
2234 dest_y = &dest[y*rystride];
2235 for (x = 0; x < xcount; x++)
2236 {
2237 abase_x = &abase[x*axstride];
2238 s = (GFC_INTEGER_1) 0;
2239 for (n = 0; n < count; n++)
2240 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2241 dest_y[x*rxstride] = s;
2242 }
2243 }
2244 }
2245 }
2246 #undef POW3
2247 #undef min
2248 #undef max
2249
2250
2251 /* Compiling main function, with selection code for the processor. */
2252
2253 /* Currently, this is i386 only. Adjust for other architectures. */
2254
2255 #include <config/i386/cpuinfo.h>
2256 void matmul_i1 (gfc_array_i1 * const restrict retarray,
2257 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
2258 int blas_limit, blas_call gemm)
2259 {
2260 static void (*matmul_p) (gfc_array_i1 * const restrict retarray,
2261 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
2262 int blas_limit, blas_call gemm) = NULL;
2263
2264 if (matmul_p == NULL)
2265 {
2266 matmul_p = matmul_i1_vanilla;
2267 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2268 {
2269 /* Run down the available processors in order of preference. */
2270 #ifdef HAVE_AVX512F
2271 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2272 {
2273 matmul_p = matmul_i1_avx512f;
2274 goto tailcall;
2275 }
2276
2277 #endif /* HAVE_AVX512F */
2278
2279 #ifdef HAVE_AVX2
2280 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2281 {
2282 matmul_p = matmul_i1_avx2;
2283 goto tailcall;
2284 }
2285
2286 #endif
2287
2288 #ifdef HAVE_AVX
2289 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2290 {
2291 matmul_p = matmul_i1_avx;
2292 goto tailcall;
2293 }
2294 #endif /* HAVE_AVX */
2295 }
2296 }
2297
2298 tailcall:
2299 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2300 }
2301
2302 #else /* Just the vanilla function. */
2303
2304 void
2305 matmul_i1 (gfc_array_i1 * const restrict retarray,
2306 gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
2307 int blas_limit, blas_call gemm)
2308 {
2309 const GFC_INTEGER_1 * restrict abase;
2310 const GFC_INTEGER_1 * restrict bbase;
2311 GFC_INTEGER_1 * restrict dest;
2312
2313 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2314 index_type x, y, n, count, xcount, ycount;
2315
2316 assert (GFC_DESCRIPTOR_RANK (a) == 2
2317 || GFC_DESCRIPTOR_RANK (b) == 2);
2318
2319 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2320
2321 Either A or B (but not both) can be rank 1:
2322
2323 o One-dimensional argument A is implicitly treated as a row matrix
2324 dimensioned [1,count], so xcount=1.
2325
2326 o One-dimensional argument B is implicitly treated as a column matrix
2327 dimensioned [count, 1], so ycount=1.
2328 */
2329
2330 if (retarray->base_addr == NULL)
2331 {
2332 if (GFC_DESCRIPTOR_RANK (a) == 1)
2333 {
2334 GFC_DIMENSION_SET(retarray->dim[0], 0,
2335 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2336 }
2337 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2338 {
2339 GFC_DIMENSION_SET(retarray->dim[0], 0,
2340 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2341 }
2342 else
2343 {
2344 GFC_DIMENSION_SET(retarray->dim[0], 0,
2345 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2346
2347 GFC_DIMENSION_SET(retarray->dim[1], 0,
2348 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2349 GFC_DESCRIPTOR_EXTENT(retarray,0));
2350 }
2351
2352 retarray->base_addr
2353 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
2354 retarray->offset = 0;
2355 }
2356 else if (unlikely (compile_options.bounds_check))
2357 {
2358 index_type ret_extent, arg_extent;
2359
2360 if (GFC_DESCRIPTOR_RANK (a) == 1)
2361 {
2362 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2363 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2364 if (arg_extent != ret_extent)
2365 runtime_error ("Incorrect extent in return array in"
2366 " MATMUL intrinsic: is %ld, should be %ld",
2367 (long int) ret_extent, (long int) arg_extent);
2368 }
2369 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2370 {
2371 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2372 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2373 if (arg_extent != ret_extent)
2374 runtime_error ("Incorrect extent in return array in"
2375 " MATMUL intrinsic: is %ld, should be %ld",
2376 (long int) ret_extent, (long int) arg_extent);
2377 }
2378 else
2379 {
2380 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2381 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2382 if (arg_extent != ret_extent)
2383 runtime_error ("Incorrect extent in return array in"
2384 " MATMUL intrinsic for dimension 1:"
2385 " is %ld, should be %ld",
2386 (long int) ret_extent, (long int) arg_extent);
2387
2388 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2389 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2390 if (arg_extent != ret_extent)
2391 runtime_error ("Incorrect extent in return array in"
2392 " MATMUL intrinsic for dimension 2:"
2393 " is %ld, should be %ld",
2394 (long int) ret_extent, (long int) arg_extent);
2395 }
2396 }
2397
2398
2399 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2400 {
2401 /* One-dimensional result may be addressed in the code below
2402 either as a row or a column matrix. We want both cases to
2403 work. */
2404 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2405 }
2406 else
2407 {
2408 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2409 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2410 }
2411
2412
2413 if (GFC_DESCRIPTOR_RANK (a) == 1)
2414 {
2415 /* Treat it as a a row matrix A[1,count]. */
2416 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2417 aystride = 1;
2418
2419 xcount = 1;
2420 count = GFC_DESCRIPTOR_EXTENT(a,0);
2421 }
2422 else
2423 {
2424 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2425 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2426
2427 count = GFC_DESCRIPTOR_EXTENT(a,1);
2428 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2429 }
2430
2431 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2432 {
2433 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2434 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2435 }
2436
2437 if (GFC_DESCRIPTOR_RANK (b) == 1)
2438 {
2439 /* Treat it as a column matrix B[count,1] */
2440 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2441
2442 /* bystride should never be used for 1-dimensional b.
2443 in case it is we want it to cause a segfault, rather than
2444 an incorrect result. */
2445 bystride = 0xDEADBEEF;
2446 ycount = 1;
2447 }
2448 else
2449 {
2450 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2451 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2452 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2453 }
2454
2455 abase = a->base_addr;
2456 bbase = b->base_addr;
2457 dest = retarray->base_addr;
2458
2459 /* Now that everything is set up, we perform the multiplication
2460 itself. */
2461
2462 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2463 #define min(a,b) ((a) <= (b) ? (a) : (b))
2464 #define max(a,b) ((a) >= (b) ? (a) : (b))
2465
2466 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2467 && (bxstride == 1 || bystride == 1)
2468 && (((float) xcount) * ((float) ycount) * ((float) count)
2469 > POW3(blas_limit)))
2470 {
2471 const int m = xcount, n = ycount, k = count, ldc = rystride;
2472 const GFC_INTEGER_1 one = 1, zero = 0;
2473 const int lda = (axstride == 1) ? aystride : axstride,
2474 ldb = (bxstride == 1) ? bystride : bxstride;
2475
2476 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2477 {
2478 assert (gemm != NULL);
2479 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2480 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2481 &ldc, 1, 1);
2482 return;
2483 }
2484 }
2485
2486 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2487 {
2488 /* This block of code implements a tuned matmul, derived from
2489 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2490
2491 Bo Kagstrom and Per Ling
2492 Department of Computing Science
2493 Umea University
2494 S-901 87 Umea, Sweden
2495
2496 from netlib.org, translated to C, and modified for matmul.m4. */
2497
2498 const GFC_INTEGER_1 *a, *b;
2499 GFC_INTEGER_1 *c;
2500 const index_type m = xcount, n = ycount, k = count;
2501
2502 /* System generated locals */
2503 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2504 i1, i2, i3, i4, i5, i6;
2505
2506 /* Local variables */
2507 GFC_INTEGER_1 t1[65536], /* was [256][256] */
2508 f11, f12, f21, f22, f31, f32, f41, f42,
2509 f13, f14, f23, f24, f33, f34, f43, f44;
2510 index_type i, j, l, ii, jj, ll;
2511 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2512
2513 a = abase;
2514 b = bbase;
2515 c = retarray->base_addr;
2516
2517 /* Parameter adjustments */
2518 c_dim1 = rystride;
2519 c_offset = 1 + c_dim1;
2520 c -= c_offset;
2521 a_dim1 = aystride;
2522 a_offset = 1 + a_dim1;
2523 a -= a_offset;
2524 b_dim1 = bystride;
2525 b_offset = 1 + b_dim1;
2526 b -= b_offset;
2527
2528 /* Early exit if possible */
2529 if (m == 0 || n == 0 || k == 0)
2530 return;
2531
2532 /* Empty c first. */
2533 for (j=1; j<=n; j++)
2534 for (i=1; i<=m; i++)
2535 c[i + j * c_dim1] = (GFC_INTEGER_1)0;
2536
2537 /* Start turning the crank. */
2538 i1 = n;
2539 for (jj = 1; jj <= i1; jj += 512)
2540 {
2541 /* Computing MIN */
2542 i2 = 512;
2543 i3 = n - jj + 1;
2544 jsec = min(i2,i3);
2545 ujsec = jsec - jsec % 4;
2546 i2 = k;
2547 for (ll = 1; ll <= i2; ll += 256)
2548 {
2549 /* Computing MIN */
2550 i3 = 256;
2551 i4 = k - ll + 1;
2552 lsec = min(i3,i4);
2553 ulsec = lsec - lsec % 2;
2554
2555 i3 = m;
2556 for (ii = 1; ii <= i3; ii += 256)
2557 {
2558 /* Computing MIN */
2559 i4 = 256;
2560 i5 = m - ii + 1;
2561 isec = min(i4,i5);
2562 uisec = isec - isec % 2;
2563 i4 = ll + ulsec - 1;
2564 for (l = ll; l <= i4; l += 2)
2565 {
2566 i5 = ii + uisec - 1;
2567 for (i = ii; i <= i5; i += 2)
2568 {
2569 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2570 a[i + l * a_dim1];
2571 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2572 a[i + (l + 1) * a_dim1];
2573 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2574 a[i + 1 + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2576 a[i + 1 + (l + 1) * a_dim1];
2577 }
2578 if (uisec < isec)
2579 {
2580 t1[l - ll + 1 + (isec << 8) - 257] =
2581 a[ii + isec - 1 + l * a_dim1];
2582 t1[l - ll + 2 + (isec << 8) - 257] =
2583 a[ii + isec - 1 + (l + 1) * a_dim1];
2584 }
2585 }
2586 if (ulsec < lsec)
2587 {
2588 i4 = ii + isec - 1;
2589 for (i = ii; i<= i4; ++i)
2590 {
2591 t1[lsec + ((i - ii + 1) << 8) - 257] =
2592 a[i + (ll + lsec - 1) * a_dim1];
2593 }
2594 }
2595
2596 uisec = isec - isec % 4;
2597 i4 = jj + ujsec - 1;
2598 for (j = jj; j <= i4; j += 4)
2599 {
2600 i5 = ii + uisec - 1;
2601 for (i = ii; i <= i5; i += 4)
2602 {
2603 f11 = c[i + j * c_dim1];
2604 f21 = c[i + 1 + j * c_dim1];
2605 f12 = c[i + (j + 1) * c_dim1];
2606 f22 = c[i + 1 + (j + 1) * c_dim1];
2607 f13 = c[i + (j + 2) * c_dim1];
2608 f23 = c[i + 1 + (j + 2) * c_dim1];
2609 f14 = c[i + (j + 3) * c_dim1];
2610 f24 = c[i + 1 + (j + 3) * c_dim1];
2611 f31 = c[i + 2 + j * c_dim1];
2612 f41 = c[i + 3 + j * c_dim1];
2613 f32 = c[i + 2 + (j + 1) * c_dim1];
2614 f42 = c[i + 3 + (j + 1) * c_dim1];
2615 f33 = c[i + 2 + (j + 2) * c_dim1];
2616 f43 = c[i + 3 + (j + 2) * c_dim1];
2617 f34 = c[i + 2 + (j + 3) * c_dim1];
2618 f44 = c[i + 3 + (j + 3) * c_dim1];
2619 i6 = ll + lsec - 1;
2620 for (l = ll; l <= i6; ++l)
2621 {
2622 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2623 * b[l + j * b_dim1];
2624 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2625 * b[l + j * b_dim1];
2626 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + (j + 1) * b_dim1];
2628 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + (j + 1) * b_dim1];
2630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 2) * b_dim1];
2632 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 2) * b_dim1];
2634 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 3) * b_dim1];
2636 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 3) * b_dim1];
2638 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2639 * b[l + j * b_dim1];
2640 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2641 * b[l + j * b_dim1];
2642 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + (j + 1) * b_dim1];
2644 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + (j + 1) * b_dim1];
2646 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 2) * b_dim1];
2648 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 2) * b_dim1];
2650 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 3) * b_dim1];
2652 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 3) * b_dim1];
2654 }
2655 c[i + j * c_dim1] = f11;
2656 c[i + 1 + j * c_dim1] = f21;
2657 c[i + (j + 1) * c_dim1] = f12;
2658 c[i + 1 + (j + 1) * c_dim1] = f22;
2659 c[i + (j + 2) * c_dim1] = f13;
2660 c[i + 1 + (j + 2) * c_dim1] = f23;
2661 c[i + (j + 3) * c_dim1] = f14;
2662 c[i + 1 + (j + 3) * c_dim1] = f24;
2663 c[i + 2 + j * c_dim1] = f31;
2664 c[i + 3 + j * c_dim1] = f41;
2665 c[i + 2 + (j + 1) * c_dim1] = f32;
2666 c[i + 3 + (j + 1) * c_dim1] = f42;
2667 c[i + 2 + (j + 2) * c_dim1] = f33;
2668 c[i + 3 + (j + 2) * c_dim1] = f43;
2669 c[i + 2 + (j + 3) * c_dim1] = f34;
2670 c[i + 3 + (j + 3) * c_dim1] = f44;
2671 }
2672 if (uisec < isec)
2673 {
2674 i5 = ii + isec - 1;
2675 for (i = ii + uisec; i <= i5; ++i)
2676 {
2677 f11 = c[i + j * c_dim1];
2678 f12 = c[i + (j + 1) * c_dim1];
2679 f13 = c[i + (j + 2) * c_dim1];
2680 f14 = c[i + (j + 3) * c_dim1];
2681 i6 = ll + lsec - 1;
2682 for (l = ll; l <= i6; ++l)
2683 {
2684 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685 257] * b[l + j * b_dim1];
2686 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687 257] * b[l + (j + 1) * b_dim1];
2688 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + (j + 2) * b_dim1];
2690 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 3) * b_dim1];
2692 }
2693 c[i + j * c_dim1] = f11;
2694 c[i + (j + 1) * c_dim1] = f12;
2695 c[i + (j + 2) * c_dim1] = f13;
2696 c[i + (j + 3) * c_dim1] = f14;
2697 }
2698 }
2699 }
2700 if (ujsec < jsec)
2701 {
2702 i4 = jj + jsec - 1;
2703 for (j = jj + ujsec; j <= i4; ++j)
2704 {
2705 i5 = ii + uisec - 1;
2706 for (i = ii; i <= i5; i += 4)
2707 {
2708 f11 = c[i + j * c_dim1];
2709 f21 = c[i + 1 + j * c_dim1];
2710 f31 = c[i + 2 + j * c_dim1];
2711 f41 = c[i + 3 + j * c_dim1];
2712 i6 = ll + lsec - 1;
2713 for (l = ll; l <= i6; ++l)
2714 {
2715 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2716 257] * b[l + j * b_dim1];
2717 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2718 257] * b[l + j * b_dim1];
2719 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2722 257] * b[l + j * b_dim1];
2723 }
2724 c[i + j * c_dim1] = f11;
2725 c[i + 1 + j * c_dim1] = f21;
2726 c[i + 2 + j * c_dim1] = f31;
2727 c[i + 3 + j * c_dim1] = f41;
2728 }
2729 i5 = ii + isec - 1;
2730 for (i = ii + uisec; i <= i5; ++i)
2731 {
2732 f11 = c[i + j * c_dim1];
2733 i6 = ll + lsec - 1;
2734 for (l = ll; l <= i6; ++l)
2735 {
2736 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2737 257] * b[l + j * b_dim1];
2738 }
2739 c[i + j * c_dim1] = f11;
2740 }
2741 }
2742 }
2743 }
2744 }
2745 }
2746 return;
2747 }
2748 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2749 {
2750 if (GFC_DESCRIPTOR_RANK (a) != 1)
2751 {
2752 const GFC_INTEGER_1 *restrict abase_x;
2753 const GFC_INTEGER_1 *restrict bbase_y;
2754 GFC_INTEGER_1 *restrict dest_y;
2755 GFC_INTEGER_1 s;
2756
2757 for (y = 0; y < ycount; y++)
2758 {
2759 bbase_y = &bbase[y*bystride];
2760 dest_y = &dest[y*rystride];
2761 for (x = 0; x < xcount; x++)
2762 {
2763 abase_x = &abase[x*axstride];
2764 s = (GFC_INTEGER_1) 0;
2765 for (n = 0; n < count; n++)
2766 s += abase_x[n] * bbase_y[n];
2767 dest_y[x] = s;
2768 }
2769 }
2770 }
2771 else
2772 {
2773 const GFC_INTEGER_1 *restrict bbase_y;
2774 GFC_INTEGER_1 s;
2775
2776 for (y = 0; y < ycount; y++)
2777 {
2778 bbase_y = &bbase[y*bystride];
2779 s = (GFC_INTEGER_1) 0;
2780 for (n = 0; n < count; n++)
2781 s += abase[n*axstride] * bbase_y[n];
2782 dest[y*rystride] = s;
2783 }
2784 }
2785 }
2786 else if (axstride < aystride)
2787 {
2788 for (y = 0; y < ycount; y++)
2789 for (x = 0; x < xcount; x++)
2790 dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
2791
2792 for (y = 0; y < ycount; y++)
2793 for (n = 0; n < count; n++)
2794 for (x = 0; x < xcount; x++)
2795 /* dest[x,y] += a[x,n] * b[n,y] */
2796 dest[x*rxstride + y*rystride] +=
2797 abase[x*axstride + n*aystride] *
2798 bbase[n*bxstride + y*bystride];
2799 }
2800 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2801 {
2802 const GFC_INTEGER_1 *restrict bbase_y;
2803 GFC_INTEGER_1 s;
2804
2805 for (y = 0; y < ycount; y++)
2806 {
2807 bbase_y = &bbase[y*bystride];
2808 s = (GFC_INTEGER_1) 0;
2809 for (n = 0; n < count; n++)
2810 s += abase[n*axstride] * bbase_y[n*bxstride];
2811 dest[y*rxstride] = s;
2812 }
2813 }
2814 else
2815 {
2816 const GFC_INTEGER_1 *restrict abase_x;
2817 const GFC_INTEGER_1 *restrict bbase_y;
2818 GFC_INTEGER_1 *restrict dest_y;
2819 GFC_INTEGER_1 s;
2820
2821 for (y = 0; y < ycount; y++)
2822 {
2823 bbase_y = &bbase[y*bystride];
2824 dest_y = &dest[y*rystride];
2825 for (x = 0; x < xcount; x++)
2826 {
2827 abase_x = &abase[x*axstride];
2828 s = (GFC_INTEGER_1) 0;
2829 for (n = 0; n < count; n++)
2830 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2831 dest_y[x*rxstride] = s;
2832 }
2833 }
2834 }
2835 }
2836 #undef POW3
2837 #undef min
2838 #undef max
2839
2840 #endif
2841 #endif
2842