]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmul_i4.c
re PR target/80687 (VLA usage in libgfortran; nvptx target: "sorry, unimplemented...
[thirdparty/gcc.git] / libgfortran / generated / matmul_i4.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_INTEGER_4)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
39 const int *, const GFC_INTEGER_4 *, const int *,
40 const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
73 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_i4);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_i4_avx (gfc_array_i4 * const restrict retarray,
84 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_i4_avx (gfc_array_i4 * const restrict retarray,
88 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_INTEGER_4 * restrict abase;
92 const GFC_INTEGER_4 * restrict bbase;
93 GFC_INTEGER_4 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_INTEGER_4 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_INTEGER_4 *a, *b;
281 GFC_INTEGER_4 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293 GFC_INTEGER_4 *t1;
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
310 /* Early exit if possible */
311 if (m == 0 || n == 0 || k == 0)
312 return;
313
314 /* Adjust size of t1 to what is needed. */
315 index_type t1_dim;
316 t1_dim = (a_dim1-1) * 256 + b_dim1;
317 if (t1_dim > 65536)
318 t1_dim = 65536;
319
320 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
321
322 /* Empty c first. */
323 for (j=1; j<=n; j++)
324 for (i=1; i<=m; i++)
325 c[i + j * c_dim1] = (GFC_INTEGER_4)0;
326
327 /* Start turning the crank. */
328 i1 = n;
329 for (jj = 1; jj <= i1; jj += 512)
330 {
331 /* Computing MIN */
332 i2 = 512;
333 i3 = n - jj + 1;
334 jsec = min(i2,i3);
335 ujsec = jsec - jsec % 4;
336 i2 = k;
337 for (ll = 1; ll <= i2; ll += 256)
338 {
339 /* Computing MIN */
340 i3 = 256;
341 i4 = k - ll + 1;
342 lsec = min(i3,i4);
343 ulsec = lsec - lsec % 2;
344
345 i3 = m;
346 for (ii = 1; ii <= i3; ii += 256)
347 {
348 /* Computing MIN */
349 i4 = 256;
350 i5 = m - ii + 1;
351 isec = min(i4,i5);
352 uisec = isec - isec % 2;
353 i4 = ll + ulsec - 1;
354 for (l = ll; l <= i4; l += 2)
355 {
356 i5 = ii + uisec - 1;
357 for (i = ii; i <= i5; i += 2)
358 {
359 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
360 a[i + l * a_dim1];
361 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
362 a[i + (l + 1) * a_dim1];
363 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
364 a[i + 1 + l * a_dim1];
365 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
366 a[i + 1 + (l + 1) * a_dim1];
367 }
368 if (uisec < isec)
369 {
370 t1[l - ll + 1 + (isec << 8) - 257] =
371 a[ii + isec - 1 + l * a_dim1];
372 t1[l - ll + 2 + (isec << 8) - 257] =
373 a[ii + isec - 1 + (l + 1) * a_dim1];
374 }
375 }
376 if (ulsec < lsec)
377 {
378 i4 = ii + isec - 1;
379 for (i = ii; i<= i4; ++i)
380 {
381 t1[lsec + ((i - ii + 1) << 8) - 257] =
382 a[i + (ll + lsec - 1) * a_dim1];
383 }
384 }
385
386 uisec = isec - isec % 4;
387 i4 = jj + ujsec - 1;
388 for (j = jj; j <= i4; j += 4)
389 {
390 i5 = ii + uisec - 1;
391 for (i = ii; i <= i5; i += 4)
392 {
393 f11 = c[i + j * c_dim1];
394 f21 = c[i + 1 + j * c_dim1];
395 f12 = c[i + (j + 1) * c_dim1];
396 f22 = c[i + 1 + (j + 1) * c_dim1];
397 f13 = c[i + (j + 2) * c_dim1];
398 f23 = c[i + 1 + (j + 2) * c_dim1];
399 f14 = c[i + (j + 3) * c_dim1];
400 f24 = c[i + 1 + (j + 3) * c_dim1];
401 f31 = c[i + 2 + j * c_dim1];
402 f41 = c[i + 3 + j * c_dim1];
403 f32 = c[i + 2 + (j + 1) * c_dim1];
404 f42 = c[i + 3 + (j + 1) * c_dim1];
405 f33 = c[i + 2 + (j + 2) * c_dim1];
406 f43 = c[i + 3 + (j + 2) * c_dim1];
407 f34 = c[i + 2 + (j + 3) * c_dim1];
408 f44 = c[i + 3 + (j + 3) * c_dim1];
409 i6 = ll + lsec - 1;
410 for (l = ll; l <= i6; ++l)
411 {
412 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + j * b_dim1];
414 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + j * b_dim1];
416 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 1) * b_dim1];
418 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
421 * b[l + (j + 2) * b_dim1];
422 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
425 * b[l + (j + 3) * b_dim1];
426 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
428 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + j * b_dim1];
430 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + j * b_dim1];
432 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 1) * b_dim1];
434 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 1) * b_dim1];
436 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
437 * b[l + (j + 2) * b_dim1];
438 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
439 * b[l + (j + 2) * b_dim1];
440 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
441 * b[l + (j + 3) * b_dim1];
442 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
443 * b[l + (j + 3) * b_dim1];
444 }
445 c[i + j * c_dim1] = f11;
446 c[i + 1 + j * c_dim1] = f21;
447 c[i + (j + 1) * c_dim1] = f12;
448 c[i + 1 + (j + 1) * c_dim1] = f22;
449 c[i + (j + 2) * c_dim1] = f13;
450 c[i + 1 + (j + 2) * c_dim1] = f23;
451 c[i + (j + 3) * c_dim1] = f14;
452 c[i + 1 + (j + 3) * c_dim1] = f24;
453 c[i + 2 + j * c_dim1] = f31;
454 c[i + 3 + j * c_dim1] = f41;
455 c[i + 2 + (j + 1) * c_dim1] = f32;
456 c[i + 3 + (j + 1) * c_dim1] = f42;
457 c[i + 2 + (j + 2) * c_dim1] = f33;
458 c[i + 3 + (j + 2) * c_dim1] = f43;
459 c[i + 2 + (j + 3) * c_dim1] = f34;
460 c[i + 3 + (j + 3) * c_dim1] = f44;
461 }
462 if (uisec < isec)
463 {
464 i5 = ii + isec - 1;
465 for (i = ii + uisec; i <= i5; ++i)
466 {
467 f11 = c[i + j * c_dim1];
468 f12 = c[i + (j + 1) * c_dim1];
469 f13 = c[i + (j + 2) * c_dim1];
470 f14 = c[i + (j + 3) * c_dim1];
471 i6 = ll + lsec - 1;
472 for (l = ll; l <= i6; ++l)
473 {
474 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
475 257] * b[l + j * b_dim1];
476 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
477 257] * b[l + (j + 1) * b_dim1];
478 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
479 257] * b[l + (j + 2) * b_dim1];
480 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
481 257] * b[l + (j + 3) * b_dim1];
482 }
483 c[i + j * c_dim1] = f11;
484 c[i + (j + 1) * c_dim1] = f12;
485 c[i + (j + 2) * c_dim1] = f13;
486 c[i + (j + 3) * c_dim1] = f14;
487 }
488 }
489 }
490 if (ujsec < jsec)
491 {
492 i4 = jj + jsec - 1;
493 for (j = jj + ujsec; j <= i4; ++j)
494 {
495 i5 = ii + uisec - 1;
496 for (i = ii; i <= i5; i += 4)
497 {
498 f11 = c[i + j * c_dim1];
499 f21 = c[i + 1 + j * c_dim1];
500 f31 = c[i + 2 + j * c_dim1];
501 f41 = c[i + 3 + j * c_dim1];
502 i6 = ll + lsec - 1;
503 for (l = ll; l <= i6; ++l)
504 {
505 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
506 257] * b[l + j * b_dim1];
507 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
508 257] * b[l + j * b_dim1];
509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
510 257] * b[l + j * b_dim1];
511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
512 257] * b[l + j * b_dim1];
513 }
514 c[i + j * c_dim1] = f11;
515 c[i + 1 + j * c_dim1] = f21;
516 c[i + 2 + j * c_dim1] = f31;
517 c[i + 3 + j * c_dim1] = f41;
518 }
519 i5 = ii + isec - 1;
520 for (i = ii + uisec; i <= i5; ++i)
521 {
522 f11 = c[i + j * c_dim1];
523 i6 = ll + lsec - 1;
524 for (l = ll; l <= i6; ++l)
525 {
526 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
527 257] * b[l + j * b_dim1];
528 }
529 c[i + j * c_dim1] = f11;
530 }
531 }
532 }
533 }
534 }
535 }
536 free(t1);
537 return;
538 }
539 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
540 {
541 if (GFC_DESCRIPTOR_RANK (a) != 1)
542 {
543 const GFC_INTEGER_4 *restrict abase_x;
544 const GFC_INTEGER_4 *restrict bbase_y;
545 GFC_INTEGER_4 *restrict dest_y;
546 GFC_INTEGER_4 s;
547
548 for (y = 0; y < ycount; y++)
549 {
550 bbase_y = &bbase[y*bystride];
551 dest_y = &dest[y*rystride];
552 for (x = 0; x < xcount; x++)
553 {
554 abase_x = &abase[x*axstride];
555 s = (GFC_INTEGER_4) 0;
556 for (n = 0; n < count; n++)
557 s += abase_x[n] * bbase_y[n];
558 dest_y[x] = s;
559 }
560 }
561 }
562 else
563 {
564 const GFC_INTEGER_4 *restrict bbase_y;
565 GFC_INTEGER_4 s;
566
567 for (y = 0; y < ycount; y++)
568 {
569 bbase_y = &bbase[y*bystride];
570 s = (GFC_INTEGER_4) 0;
571 for (n = 0; n < count; n++)
572 s += abase[n*axstride] * bbase_y[n];
573 dest[y*rystride] = s;
574 }
575 }
576 }
577 else if (axstride < aystride)
578 {
579 for (y = 0; y < ycount; y++)
580 for (x = 0; x < xcount; x++)
581 dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
582
583 for (y = 0; y < ycount; y++)
584 for (n = 0; n < count; n++)
585 for (x = 0; x < xcount; x++)
586 /* dest[x,y] += a[x,n] * b[n,y] */
587 dest[x*rxstride + y*rystride] +=
588 abase[x*axstride + n*aystride] *
589 bbase[n*bxstride + y*bystride];
590 }
591 else if (GFC_DESCRIPTOR_RANK (a) == 1)
592 {
593 const GFC_INTEGER_4 *restrict bbase_y;
594 GFC_INTEGER_4 s;
595
596 for (y = 0; y < ycount; y++)
597 {
598 bbase_y = &bbase[y*bystride];
599 s = (GFC_INTEGER_4) 0;
600 for (n = 0; n < count; n++)
601 s += abase[n*axstride] * bbase_y[n*bxstride];
602 dest[y*rxstride] = s;
603 }
604 }
605 else
606 {
607 const GFC_INTEGER_4 *restrict abase_x;
608 const GFC_INTEGER_4 *restrict bbase_y;
609 GFC_INTEGER_4 *restrict dest_y;
610 GFC_INTEGER_4 s;
611
612 for (y = 0; y < ycount; y++)
613 {
614 bbase_y = &bbase[y*bystride];
615 dest_y = &dest[y*rystride];
616 for (x = 0; x < xcount; x++)
617 {
618 abase_x = &abase[x*axstride];
619 s = (GFC_INTEGER_4) 0;
620 for (n = 0; n < count; n++)
621 s += abase_x[n*aystride] * bbase_y[n*bxstride];
622 dest_y[x*rxstride] = s;
623 }
624 }
625 }
626 }
627 #undef POW3
628 #undef min
629 #undef max
630
631 #endif /* HAVE_AVX */
632
633 #ifdef HAVE_AVX2
634 static void
635 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
636 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
637 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
638 static void
639 matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
640 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
641 int blas_limit, blas_call gemm)
642 {
643 const GFC_INTEGER_4 * restrict abase;
644 const GFC_INTEGER_4 * restrict bbase;
645 GFC_INTEGER_4 * restrict dest;
646
647 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
648 index_type x, y, n, count, xcount, ycount;
649
650 assert (GFC_DESCRIPTOR_RANK (a) == 2
651 || GFC_DESCRIPTOR_RANK (b) == 2);
652
653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
654
655 Either A or B (but not both) can be rank 1:
656
657 o One-dimensional argument A is implicitly treated as a row matrix
658 dimensioned [1,count], so xcount=1.
659
660 o One-dimensional argument B is implicitly treated as a column matrix
661 dimensioned [count, 1], so ycount=1.
662 */
663
664 if (retarray->base_addr == NULL)
665 {
666 if (GFC_DESCRIPTOR_RANK (a) == 1)
667 {
668 GFC_DIMENSION_SET(retarray->dim[0], 0,
669 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
670 }
671 else if (GFC_DESCRIPTOR_RANK (b) == 1)
672 {
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
675 }
676 else
677 {
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
680
681 GFC_DIMENSION_SET(retarray->dim[1], 0,
682 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
683 GFC_DESCRIPTOR_EXTENT(retarray,0));
684 }
685
686 retarray->base_addr
687 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
688 retarray->offset = 0;
689 }
690 else if (unlikely (compile_options.bounds_check))
691 {
692 index_type ret_extent, arg_extent;
693
694 if (GFC_DESCRIPTOR_RANK (a) == 1)
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
702 }
703 else if (GFC_DESCRIPTOR_RANK (b) == 1)
704 {
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic: is %ld, should be %ld",
710 (long int) ret_extent, (long int) arg_extent);
711 }
712 else
713 {
714 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
715 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
716 if (arg_extent != ret_extent)
717 runtime_error ("Incorrect extent in return array in"
718 " MATMUL intrinsic for dimension 1:"
719 " is %ld, should be %ld",
720 (long int) ret_extent, (long int) arg_extent);
721
722 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
723 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
724 if (arg_extent != ret_extent)
725 runtime_error ("Incorrect extent in return array in"
726 " MATMUL intrinsic for dimension 2:"
727 " is %ld, should be %ld",
728 (long int) ret_extent, (long int) arg_extent);
729 }
730 }
731
732
733 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
734 {
735 /* One-dimensional result may be addressed in the code below
736 either as a row or a column matrix. We want both cases to
737 work. */
738 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
739 }
740 else
741 {
742 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
743 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
744 }
745
746
747 if (GFC_DESCRIPTOR_RANK (a) == 1)
748 {
749 /* Treat it as a a row matrix A[1,count]. */
750 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
751 aystride = 1;
752
753 xcount = 1;
754 count = GFC_DESCRIPTOR_EXTENT(a,0);
755 }
756 else
757 {
758 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
759 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
760
761 count = GFC_DESCRIPTOR_EXTENT(a,1);
762 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
763 }
764
765 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
766 {
767 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
768 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
769 }
770
771 if (GFC_DESCRIPTOR_RANK (b) == 1)
772 {
773 /* Treat it as a column matrix B[count,1] */
774 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
775
776 /* bystride should never be used for 1-dimensional b.
777 in case it is we want it to cause a segfault, rather than
778 an incorrect result. */
779 bystride = 0xDEADBEEF;
780 ycount = 1;
781 }
782 else
783 {
784 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
785 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
786 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
787 }
788
789 abase = a->base_addr;
790 bbase = b->base_addr;
791 dest = retarray->base_addr;
792
793 /* Now that everything is set up, we perform the multiplication
794 itself. */
795
796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
797 #define min(a,b) ((a) <= (b) ? (a) : (b))
798 #define max(a,b) ((a) >= (b) ? (a) : (b))
799
800 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
801 && (bxstride == 1 || bystride == 1)
802 && (((float) xcount) * ((float) ycount) * ((float) count)
803 > POW3(blas_limit)))
804 {
805 const int m = xcount, n = ycount, k = count, ldc = rystride;
806 const GFC_INTEGER_4 one = 1, zero = 0;
807 const int lda = (axstride == 1) ? aystride : axstride,
808 ldb = (bxstride == 1) ? bystride : bxstride;
809
810 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
811 {
812 assert (gemm != NULL);
813 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
814 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
815 &ldc, 1, 1);
816 return;
817 }
818 }
819
820 if (rxstride == 1 && axstride == 1 && bxstride == 1)
821 {
822 /* This block of code implements a tuned matmul, derived from
823 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
824
825 Bo Kagstrom and Per Ling
826 Department of Computing Science
827 Umea University
828 S-901 87 Umea, Sweden
829
830 from netlib.org, translated to C, and modified for matmul.m4. */
831
832 const GFC_INTEGER_4 *a, *b;
833 GFC_INTEGER_4 *c;
834 const index_type m = xcount, n = ycount, k = count;
835
836 /* System generated locals */
837 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
838 i1, i2, i3, i4, i5, i6;
839
840 /* Local variables */
841 GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
842 f13, f14, f23, f24, f33, f34, f43, f44;
843 index_type i, j, l, ii, jj, ll;
844 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
845 GFC_INTEGER_4 *t1;
846
847 a = abase;
848 b = bbase;
849 c = retarray->base_addr;
850
851 /* Parameter adjustments */
852 c_dim1 = rystride;
853 c_offset = 1 + c_dim1;
854 c -= c_offset;
855 a_dim1 = aystride;
856 a_offset = 1 + a_dim1;
857 a -= a_offset;
858 b_dim1 = bystride;
859 b_offset = 1 + b_dim1;
860 b -= b_offset;
861
862 /* Early exit if possible */
863 if (m == 0 || n == 0 || k == 0)
864 return;
865
866 /* Adjust size of t1 to what is needed. */
867 index_type t1_dim;
868 t1_dim = (a_dim1-1) * 256 + b_dim1;
869 if (t1_dim > 65536)
870 t1_dim = 65536;
871
872 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
873
874 /* Empty c first. */
875 for (j=1; j<=n; j++)
876 for (i=1; i<=m; i++)
877 c[i + j * c_dim1] = (GFC_INTEGER_4)0;
878
879 /* Start turning the crank. */
880 i1 = n;
881 for (jj = 1; jj <= i1; jj += 512)
882 {
883 /* Computing MIN */
884 i2 = 512;
885 i3 = n - jj + 1;
886 jsec = min(i2,i3);
887 ujsec = jsec - jsec % 4;
888 i2 = k;
889 for (ll = 1; ll <= i2; ll += 256)
890 {
891 /* Computing MIN */
892 i3 = 256;
893 i4 = k - ll + 1;
894 lsec = min(i3,i4);
895 ulsec = lsec - lsec % 2;
896
897 i3 = m;
898 for (ii = 1; ii <= i3; ii += 256)
899 {
900 /* Computing MIN */
901 i4 = 256;
902 i5 = m - ii + 1;
903 isec = min(i4,i5);
904 uisec = isec - isec % 2;
905 i4 = ll + ulsec - 1;
906 for (l = ll; l <= i4; l += 2)
907 {
908 i5 = ii + uisec - 1;
909 for (i = ii; i <= i5; i += 2)
910 {
911 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
912 a[i + l * a_dim1];
913 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
914 a[i + (l + 1) * a_dim1];
915 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
916 a[i + 1 + l * a_dim1];
917 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
918 a[i + 1 + (l + 1) * a_dim1];
919 }
920 if (uisec < isec)
921 {
922 t1[l - ll + 1 + (isec << 8) - 257] =
923 a[ii + isec - 1 + l * a_dim1];
924 t1[l - ll + 2 + (isec << 8) - 257] =
925 a[ii + isec - 1 + (l + 1) * a_dim1];
926 }
927 }
928 if (ulsec < lsec)
929 {
930 i4 = ii + isec - 1;
931 for (i = ii; i<= i4; ++i)
932 {
933 t1[lsec + ((i - ii + 1) << 8) - 257] =
934 a[i + (ll + lsec - 1) * a_dim1];
935 }
936 }
937
938 uisec = isec - isec % 4;
939 i4 = jj + ujsec - 1;
940 for (j = jj; j <= i4; j += 4)
941 {
942 i5 = ii + uisec - 1;
943 for (i = ii; i <= i5; i += 4)
944 {
945 f11 = c[i + j * c_dim1];
946 f21 = c[i + 1 + j * c_dim1];
947 f12 = c[i + (j + 1) * c_dim1];
948 f22 = c[i + 1 + (j + 1) * c_dim1];
949 f13 = c[i + (j + 2) * c_dim1];
950 f23 = c[i + 1 + (j + 2) * c_dim1];
951 f14 = c[i + (j + 3) * c_dim1];
952 f24 = c[i + 1 + (j + 3) * c_dim1];
953 f31 = c[i + 2 + j * c_dim1];
954 f41 = c[i + 3 + j * c_dim1];
955 f32 = c[i + 2 + (j + 1) * c_dim1];
956 f42 = c[i + 3 + (j + 1) * c_dim1];
957 f33 = c[i + 2 + (j + 2) * c_dim1];
958 f43 = c[i + 3 + (j + 2) * c_dim1];
959 f34 = c[i + 2 + (j + 3) * c_dim1];
960 f44 = c[i + 3 + (j + 3) * c_dim1];
961 i6 = ll + lsec - 1;
962 for (l = ll; l <= i6; ++l)
963 {
964 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
965 * b[l + j * b_dim1];
966 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
967 * b[l + j * b_dim1];
968 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
969 * b[l + (j + 1) * b_dim1];
970 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
971 * b[l + (j + 1) * b_dim1];
972 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
973 * b[l + (j + 2) * b_dim1];
974 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
975 * b[l + (j + 2) * b_dim1];
976 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
977 * b[l + (j + 3) * b_dim1];
978 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
979 * b[l + (j + 3) * b_dim1];
980 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
981 * b[l + j * b_dim1];
982 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
983 * b[l + j * b_dim1];
984 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
985 * b[l + (j + 1) * b_dim1];
986 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
987 * b[l + (j + 1) * b_dim1];
988 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
989 * b[l + (j + 2) * b_dim1];
990 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
991 * b[l + (j + 2) * b_dim1];
992 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
993 * b[l + (j + 3) * b_dim1];
994 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
995 * b[l + (j + 3) * b_dim1];
996 }
997 c[i + j * c_dim1] = f11;
998 c[i + 1 + j * c_dim1] = f21;
999 c[i + (j + 1) * c_dim1] = f12;
1000 c[i + 1 + (j + 1) * c_dim1] = f22;
1001 c[i + (j + 2) * c_dim1] = f13;
1002 c[i + 1 + (j + 2) * c_dim1] = f23;
1003 c[i + (j + 3) * c_dim1] = f14;
1004 c[i + 1 + (j + 3) * c_dim1] = f24;
1005 c[i + 2 + j * c_dim1] = f31;
1006 c[i + 3 + j * c_dim1] = f41;
1007 c[i + 2 + (j + 1) * c_dim1] = f32;
1008 c[i + 3 + (j + 1) * c_dim1] = f42;
1009 c[i + 2 + (j + 2) * c_dim1] = f33;
1010 c[i + 3 + (j + 2) * c_dim1] = f43;
1011 c[i + 2 + (j + 3) * c_dim1] = f34;
1012 c[i + 3 + (j + 3) * c_dim1] = f44;
1013 }
1014 if (uisec < isec)
1015 {
1016 i5 = ii + isec - 1;
1017 for (i = ii + uisec; i <= i5; ++i)
1018 {
1019 f11 = c[i + j * c_dim1];
1020 f12 = c[i + (j + 1) * c_dim1];
1021 f13 = c[i + (j + 2) * c_dim1];
1022 f14 = c[i + (j + 3) * c_dim1];
1023 i6 = ll + lsec - 1;
1024 for (l = ll; l <= i6; ++l)
1025 {
1026 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027 257] * b[l + j * b_dim1];
1028 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029 257] * b[l + (j + 1) * b_dim1];
1030 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031 257] * b[l + (j + 2) * b_dim1];
1032 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033 257] * b[l + (j + 3) * b_dim1];
1034 }
1035 c[i + j * c_dim1] = f11;
1036 c[i + (j + 1) * c_dim1] = f12;
1037 c[i + (j + 2) * c_dim1] = f13;
1038 c[i + (j + 3) * c_dim1] = f14;
1039 }
1040 }
1041 }
1042 if (ujsec < jsec)
1043 {
1044 i4 = jj + jsec - 1;
1045 for (j = jj + ujsec; j <= i4; ++j)
1046 {
1047 i5 = ii + uisec - 1;
1048 for (i = ii; i <= i5; i += 4)
1049 {
1050 f11 = c[i + j * c_dim1];
1051 f21 = c[i + 1 + j * c_dim1];
1052 f31 = c[i + 2 + j * c_dim1];
1053 f41 = c[i + 3 + j * c_dim1];
1054 i6 = ll + lsec - 1;
1055 for (l = ll; l <= i6; ++l)
1056 {
1057 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058 257] * b[l + j * b_dim1];
1059 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060 257] * b[l + j * b_dim1];
1061 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062 257] * b[l + j * b_dim1];
1063 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064 257] * b[l + j * b_dim1];
1065 }
1066 c[i + j * c_dim1] = f11;
1067 c[i + 1 + j * c_dim1] = f21;
1068 c[i + 2 + j * c_dim1] = f31;
1069 c[i + 3 + j * c_dim1] = f41;
1070 }
1071 i5 = ii + isec - 1;
1072 for (i = ii + uisec; i <= i5; ++i)
1073 {
1074 f11 = c[i + j * c_dim1];
1075 i6 = ll + lsec - 1;
1076 for (l = ll; l <= i6; ++l)
1077 {
1078 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079 257] * b[l + j * b_dim1];
1080 }
1081 c[i + j * c_dim1] = f11;
1082 }
1083 }
1084 }
1085 }
1086 }
1087 }
1088 free(t1);
1089 return;
1090 }
1091 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092 {
1093 if (GFC_DESCRIPTOR_RANK (a) != 1)
1094 {
1095 const GFC_INTEGER_4 *restrict abase_x;
1096 const GFC_INTEGER_4 *restrict bbase_y;
1097 GFC_INTEGER_4 *restrict dest_y;
1098 GFC_INTEGER_4 s;
1099
1100 for (y = 0; y < ycount; y++)
1101 {
1102 bbase_y = &bbase[y*bystride];
1103 dest_y = &dest[y*rystride];
1104 for (x = 0; x < xcount; x++)
1105 {
1106 abase_x = &abase[x*axstride];
1107 s = (GFC_INTEGER_4) 0;
1108 for (n = 0; n < count; n++)
1109 s += abase_x[n] * bbase_y[n];
1110 dest_y[x] = s;
1111 }
1112 }
1113 }
1114 else
1115 {
1116 const GFC_INTEGER_4 *restrict bbase_y;
1117 GFC_INTEGER_4 s;
1118
1119 for (y = 0; y < ycount; y++)
1120 {
1121 bbase_y = &bbase[y*bystride];
1122 s = (GFC_INTEGER_4) 0;
1123 for (n = 0; n < count; n++)
1124 s += abase[n*axstride] * bbase_y[n];
1125 dest[y*rystride] = s;
1126 }
1127 }
1128 }
1129 else if (axstride < aystride)
1130 {
1131 for (y = 0; y < ycount; y++)
1132 for (x = 0; x < xcount; x++)
1133 dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
1134
1135 for (y = 0; y < ycount; y++)
1136 for (n = 0; n < count; n++)
1137 for (x = 0; x < xcount; x++)
1138 /* dest[x,y] += a[x,n] * b[n,y] */
1139 dest[x*rxstride + y*rystride] +=
1140 abase[x*axstride + n*aystride] *
1141 bbase[n*bxstride + y*bystride];
1142 }
1143 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1144 {
1145 const GFC_INTEGER_4 *restrict bbase_y;
1146 GFC_INTEGER_4 s;
1147
1148 for (y = 0; y < ycount; y++)
1149 {
1150 bbase_y = &bbase[y*bystride];
1151 s = (GFC_INTEGER_4) 0;
1152 for (n = 0; n < count; n++)
1153 s += abase[n*axstride] * bbase_y[n*bxstride];
1154 dest[y*rxstride] = s;
1155 }
1156 }
1157 else
1158 {
1159 const GFC_INTEGER_4 *restrict abase_x;
1160 const GFC_INTEGER_4 *restrict bbase_y;
1161 GFC_INTEGER_4 *restrict dest_y;
1162 GFC_INTEGER_4 s;
1163
1164 for (y = 0; y < ycount; y++)
1165 {
1166 bbase_y = &bbase[y*bystride];
1167 dest_y = &dest[y*rystride];
1168 for (x = 0; x < xcount; x++)
1169 {
1170 abase_x = &abase[x*axstride];
1171 s = (GFC_INTEGER_4) 0;
1172 for (n = 0; n < count; n++)
1173 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174 dest_y[x*rxstride] = s;
1175 }
1176 }
1177 }
1178 }
1179 #undef POW3
1180 #undef min
1181 #undef max
1182
1183 #endif /* HAVE_AVX2 */
1184
1185 #ifdef HAVE_AVX512F
1186 static void
1187 matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
1188 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1189 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1190 static void
1191 matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
1192 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1193 int blas_limit, blas_call gemm)
1194 {
1195 const GFC_INTEGER_4 * restrict abase;
1196 const GFC_INTEGER_4 * restrict bbase;
1197 GFC_INTEGER_4 * restrict dest;
1198
1199 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1200 index_type x, y, n, count, xcount, ycount;
1201
1202 assert (GFC_DESCRIPTOR_RANK (a) == 2
1203 || GFC_DESCRIPTOR_RANK (b) == 2);
1204
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1206
1207 Either A or B (but not both) can be rank 1:
1208
1209 o One-dimensional argument A is implicitly treated as a row matrix
1210 dimensioned [1,count], so xcount=1.
1211
1212 o One-dimensional argument B is implicitly treated as a column matrix
1213 dimensioned [count, 1], so ycount=1.
1214 */
1215
1216 if (retarray->base_addr == NULL)
1217 {
1218 if (GFC_DESCRIPTOR_RANK (a) == 1)
1219 {
1220 GFC_DIMENSION_SET(retarray->dim[0], 0,
1221 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1222 }
1223 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1224 {
1225 GFC_DIMENSION_SET(retarray->dim[0], 0,
1226 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1227 }
1228 else
1229 {
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1232
1233 GFC_DIMENSION_SET(retarray->dim[1], 0,
1234 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1235 GFC_DESCRIPTOR_EXTENT(retarray,0));
1236 }
1237
1238 retarray->base_addr
1239 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
1240 retarray->offset = 0;
1241 }
1242 else if (unlikely (compile_options.bounds_check))
1243 {
1244 index_type ret_extent, arg_extent;
1245
1246 if (GFC_DESCRIPTOR_RANK (a) == 1)
1247 {
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic: is %ld, should be %ld",
1253 (long int) ret_extent, (long int) arg_extent);
1254 }
1255 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256 {
1257 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1258 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1259 if (arg_extent != ret_extent)
1260 runtime_error ("Incorrect extent in return array in"
1261 " MATMUL intrinsic: is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1263 }
1264 else
1265 {
1266 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1267 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1268 if (arg_extent != ret_extent)
1269 runtime_error ("Incorrect extent in return array in"
1270 " MATMUL intrinsic for dimension 1:"
1271 " is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1273
1274 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1275 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1276 if (arg_extent != ret_extent)
1277 runtime_error ("Incorrect extent in return array in"
1278 " MATMUL intrinsic for dimension 2:"
1279 " is %ld, should be %ld",
1280 (long int) ret_extent, (long int) arg_extent);
1281 }
1282 }
1283
1284
1285 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1286 {
1287 /* One-dimensional result may be addressed in the code below
1288 either as a row or a column matrix. We want both cases to
1289 work. */
1290 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1291 }
1292 else
1293 {
1294 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1295 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1296 }
1297
1298
1299 if (GFC_DESCRIPTOR_RANK (a) == 1)
1300 {
1301 /* Treat it as a a row matrix A[1,count]. */
1302 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1303 aystride = 1;
1304
1305 xcount = 1;
1306 count = GFC_DESCRIPTOR_EXTENT(a,0);
1307 }
1308 else
1309 {
1310 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1311 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1312
1313 count = GFC_DESCRIPTOR_EXTENT(a,1);
1314 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1315 }
1316
1317 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1318 {
1319 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1320 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1321 }
1322
1323 if (GFC_DESCRIPTOR_RANK (b) == 1)
1324 {
1325 /* Treat it as a column matrix B[count,1] */
1326 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1327
1328 /* bystride should never be used for 1-dimensional b.
1329 in case it is we want it to cause a segfault, rather than
1330 an incorrect result. */
1331 bystride = 0xDEADBEEF;
1332 ycount = 1;
1333 }
1334 else
1335 {
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1338 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1339 }
1340
1341 abase = a->base_addr;
1342 bbase = b->base_addr;
1343 dest = retarray->base_addr;
1344
1345 /* Now that everything is set up, we perform the multiplication
1346 itself. */
1347
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1351
1352 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1353 && (bxstride == 1 || bystride == 1)
1354 && (((float) xcount) * ((float) ycount) * ((float) count)
1355 > POW3(blas_limit)))
1356 {
1357 const int m = xcount, n = ycount, k = count, ldc = rystride;
1358 const GFC_INTEGER_4 one = 1, zero = 0;
1359 const int lda = (axstride == 1) ? aystride : axstride,
1360 ldb = (bxstride == 1) ? bystride : bxstride;
1361
1362 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1363 {
1364 assert (gemm != NULL);
1365 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1366 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1367 &ldc, 1, 1);
1368 return;
1369 }
1370 }
1371
1372 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1373 {
1374 /* This block of code implements a tuned matmul, derived from
1375 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1376
1377 Bo Kagstrom and Per Ling
1378 Department of Computing Science
1379 Umea University
1380 S-901 87 Umea, Sweden
1381
1382 from netlib.org, translated to C, and modified for matmul.m4. */
1383
1384 const GFC_INTEGER_4 *a, *b;
1385 GFC_INTEGER_4 *c;
1386 const index_type m = xcount, n = ycount, k = count;
1387
1388 /* System generated locals */
1389 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1390 i1, i2, i3, i4, i5, i6;
1391
1392 /* Local variables */
1393 GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
1394 f13, f14, f23, f24, f33, f34, f43, f44;
1395 index_type i, j, l, ii, jj, ll;
1396 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1397 GFC_INTEGER_4 *t1;
1398
1399 a = abase;
1400 b = bbase;
1401 c = retarray->base_addr;
1402
1403 /* Parameter adjustments */
1404 c_dim1 = rystride;
1405 c_offset = 1 + c_dim1;
1406 c -= c_offset;
1407 a_dim1 = aystride;
1408 a_offset = 1 + a_dim1;
1409 a -= a_offset;
1410 b_dim1 = bystride;
1411 b_offset = 1 + b_dim1;
1412 b -= b_offset;
1413
1414 /* Early exit if possible */
1415 if (m == 0 || n == 0 || k == 0)
1416 return;
1417
1418 /* Adjust size of t1 to what is needed. */
1419 index_type t1_dim;
1420 t1_dim = (a_dim1-1) * 256 + b_dim1;
1421 if (t1_dim > 65536)
1422 t1_dim = 65536;
1423
1424 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
1425
1426 /* Empty c first. */
1427 for (j=1; j<=n; j++)
1428 for (i=1; i<=m; i++)
1429 c[i + j * c_dim1] = (GFC_INTEGER_4)0;
1430
1431 /* Start turning the crank. */
1432 i1 = n;
1433 for (jj = 1; jj <= i1; jj += 512)
1434 {
1435 /* Computing MIN */
1436 i2 = 512;
1437 i3 = n - jj + 1;
1438 jsec = min(i2,i3);
1439 ujsec = jsec - jsec % 4;
1440 i2 = k;
1441 for (ll = 1; ll <= i2; ll += 256)
1442 {
1443 /* Computing MIN */
1444 i3 = 256;
1445 i4 = k - ll + 1;
1446 lsec = min(i3,i4);
1447 ulsec = lsec - lsec % 2;
1448
1449 i3 = m;
1450 for (ii = 1; ii <= i3; ii += 256)
1451 {
1452 /* Computing MIN */
1453 i4 = 256;
1454 i5 = m - ii + 1;
1455 isec = min(i4,i5);
1456 uisec = isec - isec % 2;
1457 i4 = ll + ulsec - 1;
1458 for (l = ll; l <= i4; l += 2)
1459 {
1460 i5 = ii + uisec - 1;
1461 for (i = ii; i <= i5; i += 2)
1462 {
1463 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1464 a[i + l * a_dim1];
1465 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1466 a[i + (l + 1) * a_dim1];
1467 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1468 a[i + 1 + l * a_dim1];
1469 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1470 a[i + 1 + (l + 1) * a_dim1];
1471 }
1472 if (uisec < isec)
1473 {
1474 t1[l - ll + 1 + (isec << 8) - 257] =
1475 a[ii + isec - 1 + l * a_dim1];
1476 t1[l - ll + 2 + (isec << 8) - 257] =
1477 a[ii + isec - 1 + (l + 1) * a_dim1];
1478 }
1479 }
1480 if (ulsec < lsec)
1481 {
1482 i4 = ii + isec - 1;
1483 for (i = ii; i<= i4; ++i)
1484 {
1485 t1[lsec + ((i - ii + 1) << 8) - 257] =
1486 a[i + (ll + lsec - 1) * a_dim1];
1487 }
1488 }
1489
1490 uisec = isec - isec % 4;
1491 i4 = jj + ujsec - 1;
1492 for (j = jj; j <= i4; j += 4)
1493 {
1494 i5 = ii + uisec - 1;
1495 for (i = ii; i <= i5; i += 4)
1496 {
1497 f11 = c[i + j * c_dim1];
1498 f21 = c[i + 1 + j * c_dim1];
1499 f12 = c[i + (j + 1) * c_dim1];
1500 f22 = c[i + 1 + (j + 1) * c_dim1];
1501 f13 = c[i + (j + 2) * c_dim1];
1502 f23 = c[i + 1 + (j + 2) * c_dim1];
1503 f14 = c[i + (j + 3) * c_dim1];
1504 f24 = c[i + 1 + (j + 3) * c_dim1];
1505 f31 = c[i + 2 + j * c_dim1];
1506 f41 = c[i + 3 + j * c_dim1];
1507 f32 = c[i + 2 + (j + 1) * c_dim1];
1508 f42 = c[i + 3 + (j + 1) * c_dim1];
1509 f33 = c[i + 2 + (j + 2) * c_dim1];
1510 f43 = c[i + 3 + (j + 2) * c_dim1];
1511 f34 = c[i + 2 + (j + 3) * c_dim1];
1512 f44 = c[i + 3 + (j + 3) * c_dim1];
1513 i6 = ll + lsec - 1;
1514 for (l = ll; l <= i6; ++l)
1515 {
1516 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1517 * b[l + j * b_dim1];
1518 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1519 * b[l + j * b_dim1];
1520 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521 * b[l + (j + 1) * b_dim1];
1522 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523 * b[l + (j + 1) * b_dim1];
1524 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525 * b[l + (j + 2) * b_dim1];
1526 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527 * b[l + (j + 2) * b_dim1];
1528 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529 * b[l + (j + 3) * b_dim1];
1530 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531 * b[l + (j + 3) * b_dim1];
1532 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1533 * b[l + j * b_dim1];
1534 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1535 * b[l + j * b_dim1];
1536 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537 * b[l + (j + 1) * b_dim1];
1538 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539 * b[l + (j + 1) * b_dim1];
1540 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541 * b[l + (j + 2) * b_dim1];
1542 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543 * b[l + (j + 2) * b_dim1];
1544 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545 * b[l + (j + 3) * b_dim1];
1546 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547 * b[l + (j + 3) * b_dim1];
1548 }
1549 c[i + j * c_dim1] = f11;
1550 c[i + 1 + j * c_dim1] = f21;
1551 c[i + (j + 1) * c_dim1] = f12;
1552 c[i + 1 + (j + 1) * c_dim1] = f22;
1553 c[i + (j + 2) * c_dim1] = f13;
1554 c[i + 1 + (j + 2) * c_dim1] = f23;
1555 c[i + (j + 3) * c_dim1] = f14;
1556 c[i + 1 + (j + 3) * c_dim1] = f24;
1557 c[i + 2 + j * c_dim1] = f31;
1558 c[i + 3 + j * c_dim1] = f41;
1559 c[i + 2 + (j + 1) * c_dim1] = f32;
1560 c[i + 3 + (j + 1) * c_dim1] = f42;
1561 c[i + 2 + (j + 2) * c_dim1] = f33;
1562 c[i + 3 + (j + 2) * c_dim1] = f43;
1563 c[i + 2 + (j + 3) * c_dim1] = f34;
1564 c[i + 3 + (j + 3) * c_dim1] = f44;
1565 }
1566 if (uisec < isec)
1567 {
1568 i5 = ii + isec - 1;
1569 for (i = ii + uisec; i <= i5; ++i)
1570 {
1571 f11 = c[i + j * c_dim1];
1572 f12 = c[i + (j + 1) * c_dim1];
1573 f13 = c[i + (j + 2) * c_dim1];
1574 f14 = c[i + (j + 3) * c_dim1];
1575 i6 = ll + lsec - 1;
1576 for (l = ll; l <= i6; ++l)
1577 {
1578 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1579 257] * b[l + j * b_dim1];
1580 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1581 257] * b[l + (j + 1) * b_dim1];
1582 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583 257] * b[l + (j + 2) * b_dim1];
1584 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585 257] * b[l + (j + 3) * b_dim1];
1586 }
1587 c[i + j * c_dim1] = f11;
1588 c[i + (j + 1) * c_dim1] = f12;
1589 c[i + (j + 2) * c_dim1] = f13;
1590 c[i + (j + 3) * c_dim1] = f14;
1591 }
1592 }
1593 }
1594 if (ujsec < jsec)
1595 {
1596 i4 = jj + jsec - 1;
1597 for (j = jj + ujsec; j <= i4; ++j)
1598 {
1599 i5 = ii + uisec - 1;
1600 for (i = ii; i <= i5; i += 4)
1601 {
1602 f11 = c[i + j * c_dim1];
1603 f21 = c[i + 1 + j * c_dim1];
1604 f31 = c[i + 2 + j * c_dim1];
1605 f41 = c[i + 3 + j * c_dim1];
1606 i6 = ll + lsec - 1;
1607 for (l = ll; l <= i6; ++l)
1608 {
1609 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1610 257] * b[l + j * b_dim1];
1611 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1612 257] * b[l + j * b_dim1];
1613 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1614 257] * b[l + j * b_dim1];
1615 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1616 257] * b[l + j * b_dim1];
1617 }
1618 c[i + j * c_dim1] = f11;
1619 c[i + 1 + j * c_dim1] = f21;
1620 c[i + 2 + j * c_dim1] = f31;
1621 c[i + 3 + j * c_dim1] = f41;
1622 }
1623 i5 = ii + isec - 1;
1624 for (i = ii + uisec; i <= i5; ++i)
1625 {
1626 f11 = c[i + j * c_dim1];
1627 i6 = ll + lsec - 1;
1628 for (l = ll; l <= i6; ++l)
1629 {
1630 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631 257] * b[l + j * b_dim1];
1632 }
1633 c[i + j * c_dim1] = f11;
1634 }
1635 }
1636 }
1637 }
1638 }
1639 }
1640 free(t1);
1641 return;
1642 }
1643 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1644 {
1645 if (GFC_DESCRIPTOR_RANK (a) != 1)
1646 {
1647 const GFC_INTEGER_4 *restrict abase_x;
1648 const GFC_INTEGER_4 *restrict bbase_y;
1649 GFC_INTEGER_4 *restrict dest_y;
1650 GFC_INTEGER_4 s;
1651
1652 for (y = 0; y < ycount; y++)
1653 {
1654 bbase_y = &bbase[y*bystride];
1655 dest_y = &dest[y*rystride];
1656 for (x = 0; x < xcount; x++)
1657 {
1658 abase_x = &abase[x*axstride];
1659 s = (GFC_INTEGER_4) 0;
1660 for (n = 0; n < count; n++)
1661 s += abase_x[n] * bbase_y[n];
1662 dest_y[x] = s;
1663 }
1664 }
1665 }
1666 else
1667 {
1668 const GFC_INTEGER_4 *restrict bbase_y;
1669 GFC_INTEGER_4 s;
1670
1671 for (y = 0; y < ycount; y++)
1672 {
1673 bbase_y = &bbase[y*bystride];
1674 s = (GFC_INTEGER_4) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase[n*axstride] * bbase_y[n];
1677 dest[y*rystride] = s;
1678 }
1679 }
1680 }
1681 else if (axstride < aystride)
1682 {
1683 for (y = 0; y < ycount; y++)
1684 for (x = 0; x < xcount; x++)
1685 dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
1686
1687 for (y = 0; y < ycount; y++)
1688 for (n = 0; n < count; n++)
1689 for (x = 0; x < xcount; x++)
1690 /* dest[x,y] += a[x,n] * b[n,y] */
1691 dest[x*rxstride + y*rystride] +=
1692 abase[x*axstride + n*aystride] *
1693 bbase[n*bxstride + y*bystride];
1694 }
1695 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1696 {
1697 const GFC_INTEGER_4 *restrict bbase_y;
1698 GFC_INTEGER_4 s;
1699
1700 for (y = 0; y < ycount; y++)
1701 {
1702 bbase_y = &bbase[y*bystride];
1703 s = (GFC_INTEGER_4) 0;
1704 for (n = 0; n < count; n++)
1705 s += abase[n*axstride] * bbase_y[n*bxstride];
1706 dest[y*rxstride] = s;
1707 }
1708 }
1709 else
1710 {
1711 const GFC_INTEGER_4 *restrict abase_x;
1712 const GFC_INTEGER_4 *restrict bbase_y;
1713 GFC_INTEGER_4 *restrict dest_y;
1714 GFC_INTEGER_4 s;
1715
1716 for (y = 0; y < ycount; y++)
1717 {
1718 bbase_y = &bbase[y*bystride];
1719 dest_y = &dest[y*rystride];
1720 for (x = 0; x < xcount; x++)
1721 {
1722 abase_x = &abase[x*axstride];
1723 s = (GFC_INTEGER_4) 0;
1724 for (n = 0; n < count; n++)
1725 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1726 dest_y[x*rxstride] = s;
1727 }
1728 }
1729 }
1730 }
1731 #undef POW3
1732 #undef min
1733 #undef max
1734
1735 #endif /* HAVE_AVX512F */
1736
1737 /* Function to fall back to if there is no special processor-specific version. */
1738 static void
1739 matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
1740 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
1741 int blas_limit, blas_call gemm)
1742 {
1743 const GFC_INTEGER_4 * restrict abase;
1744 const GFC_INTEGER_4 * restrict bbase;
1745 GFC_INTEGER_4 * restrict dest;
1746
1747 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1748 index_type x, y, n, count, xcount, ycount;
1749
1750 assert (GFC_DESCRIPTOR_RANK (a) == 2
1751 || GFC_DESCRIPTOR_RANK (b) == 2);
1752
1753 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1754
1755 Either A or B (but not both) can be rank 1:
1756
1757 o One-dimensional argument A is implicitly treated as a row matrix
1758 dimensioned [1,count], so xcount=1.
1759
1760 o One-dimensional argument B is implicitly treated as a column matrix
1761 dimensioned [count, 1], so ycount=1.
1762 */
1763
1764 if (retarray->base_addr == NULL)
1765 {
1766 if (GFC_DESCRIPTOR_RANK (a) == 1)
1767 {
1768 GFC_DIMENSION_SET(retarray->dim[0], 0,
1769 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1770 }
1771 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1772 {
1773 GFC_DIMENSION_SET(retarray->dim[0], 0,
1774 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1775 }
1776 else
1777 {
1778 GFC_DIMENSION_SET(retarray->dim[0], 0,
1779 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1780
1781 GFC_DIMENSION_SET(retarray->dim[1], 0,
1782 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1783 GFC_DESCRIPTOR_EXTENT(retarray,0));
1784 }
1785
1786 retarray->base_addr
1787 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
1788 retarray->offset = 0;
1789 }
1790 else if (unlikely (compile_options.bounds_check))
1791 {
1792 index_type ret_extent, arg_extent;
1793
1794 if (GFC_DESCRIPTOR_RANK (a) == 1)
1795 {
1796 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1797 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1798 if (arg_extent != ret_extent)
1799 runtime_error ("Incorrect extent in return array in"
1800 " MATMUL intrinsic: is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1802 }
1803 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1804 {
1805 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1806 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1807 if (arg_extent != ret_extent)
1808 runtime_error ("Incorrect extent in return array in"
1809 " MATMUL intrinsic: is %ld, should be %ld",
1810 (long int) ret_extent, (long int) arg_extent);
1811 }
1812 else
1813 {
1814 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1815 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1816 if (arg_extent != ret_extent)
1817 runtime_error ("Incorrect extent in return array in"
1818 " MATMUL intrinsic for dimension 1:"
1819 " is %ld, should be %ld",
1820 (long int) ret_extent, (long int) arg_extent);
1821
1822 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1823 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1824 if (arg_extent != ret_extent)
1825 runtime_error ("Incorrect extent in return array in"
1826 " MATMUL intrinsic for dimension 2:"
1827 " is %ld, should be %ld",
1828 (long int) ret_extent, (long int) arg_extent);
1829 }
1830 }
1831
1832
1833 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1834 {
1835 /* One-dimensional result may be addressed in the code below
1836 either as a row or a column matrix. We want both cases to
1837 work. */
1838 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1839 }
1840 else
1841 {
1842 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1843 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1844 }
1845
1846
1847 if (GFC_DESCRIPTOR_RANK (a) == 1)
1848 {
1849 /* Treat it as a a row matrix A[1,count]. */
1850 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1851 aystride = 1;
1852
1853 xcount = 1;
1854 count = GFC_DESCRIPTOR_EXTENT(a,0);
1855 }
1856 else
1857 {
1858 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1859 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1860
1861 count = GFC_DESCRIPTOR_EXTENT(a,1);
1862 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1863 }
1864
1865 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1866 {
1867 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1868 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1869 }
1870
1871 if (GFC_DESCRIPTOR_RANK (b) == 1)
1872 {
1873 /* Treat it as a column matrix B[count,1] */
1874 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1875
1876 /* bystride should never be used for 1-dimensional b.
1877 in case it is we want it to cause a segfault, rather than
1878 an incorrect result. */
1879 bystride = 0xDEADBEEF;
1880 ycount = 1;
1881 }
1882 else
1883 {
1884 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1885 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1886 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1887 }
1888
1889 abase = a->base_addr;
1890 bbase = b->base_addr;
1891 dest = retarray->base_addr;
1892
1893 /* Now that everything is set up, we perform the multiplication
1894 itself. */
1895
1896 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1897 #define min(a,b) ((a) <= (b) ? (a) : (b))
1898 #define max(a,b) ((a) >= (b) ? (a) : (b))
1899
1900 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1901 && (bxstride == 1 || bystride == 1)
1902 && (((float) xcount) * ((float) ycount) * ((float) count)
1903 > POW3(blas_limit)))
1904 {
1905 const int m = xcount, n = ycount, k = count, ldc = rystride;
1906 const GFC_INTEGER_4 one = 1, zero = 0;
1907 const int lda = (axstride == 1) ? aystride : axstride,
1908 ldb = (bxstride == 1) ? bystride : bxstride;
1909
1910 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1911 {
1912 assert (gemm != NULL);
1913 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1914 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1915 &ldc, 1, 1);
1916 return;
1917 }
1918 }
1919
1920 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1921 {
1922 /* This block of code implements a tuned matmul, derived from
1923 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1924
1925 Bo Kagstrom and Per Ling
1926 Department of Computing Science
1927 Umea University
1928 S-901 87 Umea, Sweden
1929
1930 from netlib.org, translated to C, and modified for matmul.m4. */
1931
1932 const GFC_INTEGER_4 *a, *b;
1933 GFC_INTEGER_4 *c;
1934 const index_type m = xcount, n = ycount, k = count;
1935
1936 /* System generated locals */
1937 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1938 i1, i2, i3, i4, i5, i6;
1939
1940 /* Local variables */
1941 GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
1942 f13, f14, f23, f24, f33, f34, f43, f44;
1943 index_type i, j, l, ii, jj, ll;
1944 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1945 GFC_INTEGER_4 *t1;
1946
1947 a = abase;
1948 b = bbase;
1949 c = retarray->base_addr;
1950
1951 /* Parameter adjustments */
1952 c_dim1 = rystride;
1953 c_offset = 1 + c_dim1;
1954 c -= c_offset;
1955 a_dim1 = aystride;
1956 a_offset = 1 + a_dim1;
1957 a -= a_offset;
1958 b_dim1 = bystride;
1959 b_offset = 1 + b_dim1;
1960 b -= b_offset;
1961
1962 /* Early exit if possible */
1963 if (m == 0 || n == 0 || k == 0)
1964 return;
1965
1966 /* Adjust size of t1 to what is needed. */
1967 index_type t1_dim;
1968 t1_dim = (a_dim1-1) * 256 + b_dim1;
1969 if (t1_dim > 65536)
1970 t1_dim = 65536;
1971
1972 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
1973
1974 /* Empty c first. */
1975 for (j=1; j<=n; j++)
1976 for (i=1; i<=m; i++)
1977 c[i + j * c_dim1] = (GFC_INTEGER_4)0;
1978
1979 /* Start turning the crank. */
1980 i1 = n;
1981 for (jj = 1; jj <= i1; jj += 512)
1982 {
1983 /* Computing MIN */
1984 i2 = 512;
1985 i3 = n - jj + 1;
1986 jsec = min(i2,i3);
1987 ujsec = jsec - jsec % 4;
1988 i2 = k;
1989 for (ll = 1; ll <= i2; ll += 256)
1990 {
1991 /* Computing MIN */
1992 i3 = 256;
1993 i4 = k - ll + 1;
1994 lsec = min(i3,i4);
1995 ulsec = lsec - lsec % 2;
1996
1997 i3 = m;
1998 for (ii = 1; ii <= i3; ii += 256)
1999 {
2000 /* Computing MIN */
2001 i4 = 256;
2002 i5 = m - ii + 1;
2003 isec = min(i4,i5);
2004 uisec = isec - isec % 2;
2005 i4 = ll + ulsec - 1;
2006 for (l = ll; l <= i4; l += 2)
2007 {
2008 i5 = ii + uisec - 1;
2009 for (i = ii; i <= i5; i += 2)
2010 {
2011 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2012 a[i + l * a_dim1];
2013 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2014 a[i + (l + 1) * a_dim1];
2015 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2016 a[i + 1 + l * a_dim1];
2017 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2018 a[i + 1 + (l + 1) * a_dim1];
2019 }
2020 if (uisec < isec)
2021 {
2022 t1[l - ll + 1 + (isec << 8) - 257] =
2023 a[ii + isec - 1 + l * a_dim1];
2024 t1[l - ll + 2 + (isec << 8) - 257] =
2025 a[ii + isec - 1 + (l + 1) * a_dim1];
2026 }
2027 }
2028 if (ulsec < lsec)
2029 {
2030 i4 = ii + isec - 1;
2031 for (i = ii; i<= i4; ++i)
2032 {
2033 t1[lsec + ((i - ii + 1) << 8) - 257] =
2034 a[i + (ll + lsec - 1) * a_dim1];
2035 }
2036 }
2037
2038 uisec = isec - isec % 4;
2039 i4 = jj + ujsec - 1;
2040 for (j = jj; j <= i4; j += 4)
2041 {
2042 i5 = ii + uisec - 1;
2043 for (i = ii; i <= i5; i += 4)
2044 {
2045 f11 = c[i + j * c_dim1];
2046 f21 = c[i + 1 + j * c_dim1];
2047 f12 = c[i + (j + 1) * c_dim1];
2048 f22 = c[i + 1 + (j + 1) * c_dim1];
2049 f13 = c[i + (j + 2) * c_dim1];
2050 f23 = c[i + 1 + (j + 2) * c_dim1];
2051 f14 = c[i + (j + 3) * c_dim1];
2052 f24 = c[i + 1 + (j + 3) * c_dim1];
2053 f31 = c[i + 2 + j * c_dim1];
2054 f41 = c[i + 3 + j * c_dim1];
2055 f32 = c[i + 2 + (j + 1) * c_dim1];
2056 f42 = c[i + 3 + (j + 1) * c_dim1];
2057 f33 = c[i + 2 + (j + 2) * c_dim1];
2058 f43 = c[i + 3 + (j + 2) * c_dim1];
2059 f34 = c[i + 2 + (j + 3) * c_dim1];
2060 f44 = c[i + 3 + (j + 3) * c_dim1];
2061 i6 = ll + lsec - 1;
2062 for (l = ll; l <= i6; ++l)
2063 {
2064 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2065 * b[l + j * b_dim1];
2066 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2067 * b[l + j * b_dim1];
2068 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2069 * b[l + (j + 1) * b_dim1];
2070 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2071 * b[l + (j + 1) * b_dim1];
2072 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2073 * b[l + (j + 2) * b_dim1];
2074 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2075 * b[l + (j + 2) * b_dim1];
2076 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2077 * b[l + (j + 3) * b_dim1];
2078 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2079 * b[l + (j + 3) * b_dim1];
2080 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2081 * b[l + j * b_dim1];
2082 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2083 * b[l + j * b_dim1];
2084 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2085 * b[l + (j + 1) * b_dim1];
2086 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2087 * b[l + (j + 1) * b_dim1];
2088 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2089 * b[l + (j + 2) * b_dim1];
2090 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2091 * b[l + (j + 2) * b_dim1];
2092 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2093 * b[l + (j + 3) * b_dim1];
2094 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2095 * b[l + (j + 3) * b_dim1];
2096 }
2097 c[i + j * c_dim1] = f11;
2098 c[i + 1 + j * c_dim1] = f21;
2099 c[i + (j + 1) * c_dim1] = f12;
2100 c[i + 1 + (j + 1) * c_dim1] = f22;
2101 c[i + (j + 2) * c_dim1] = f13;
2102 c[i + 1 + (j + 2) * c_dim1] = f23;
2103 c[i + (j + 3) * c_dim1] = f14;
2104 c[i + 1 + (j + 3) * c_dim1] = f24;
2105 c[i + 2 + j * c_dim1] = f31;
2106 c[i + 3 + j * c_dim1] = f41;
2107 c[i + 2 + (j + 1) * c_dim1] = f32;
2108 c[i + 3 + (j + 1) * c_dim1] = f42;
2109 c[i + 2 + (j + 2) * c_dim1] = f33;
2110 c[i + 3 + (j + 2) * c_dim1] = f43;
2111 c[i + 2 + (j + 3) * c_dim1] = f34;
2112 c[i + 3 + (j + 3) * c_dim1] = f44;
2113 }
2114 if (uisec < isec)
2115 {
2116 i5 = ii + isec - 1;
2117 for (i = ii + uisec; i <= i5; ++i)
2118 {
2119 f11 = c[i + j * c_dim1];
2120 f12 = c[i + (j + 1) * c_dim1];
2121 f13 = c[i + (j + 2) * c_dim1];
2122 f14 = c[i + (j + 3) * c_dim1];
2123 i6 = ll + lsec - 1;
2124 for (l = ll; l <= i6; ++l)
2125 {
2126 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2127 257] * b[l + j * b_dim1];
2128 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2129 257] * b[l + (j + 1) * b_dim1];
2130 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2131 257] * b[l + (j + 2) * b_dim1];
2132 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2133 257] * b[l + (j + 3) * b_dim1];
2134 }
2135 c[i + j * c_dim1] = f11;
2136 c[i + (j + 1) * c_dim1] = f12;
2137 c[i + (j + 2) * c_dim1] = f13;
2138 c[i + (j + 3) * c_dim1] = f14;
2139 }
2140 }
2141 }
2142 if (ujsec < jsec)
2143 {
2144 i4 = jj + jsec - 1;
2145 for (j = jj + ujsec; j <= i4; ++j)
2146 {
2147 i5 = ii + uisec - 1;
2148 for (i = ii; i <= i5; i += 4)
2149 {
2150 f11 = c[i + j * c_dim1];
2151 f21 = c[i + 1 + j * c_dim1];
2152 f31 = c[i + 2 + j * c_dim1];
2153 f41 = c[i + 3 + j * c_dim1];
2154 i6 = ll + lsec - 1;
2155 for (l = ll; l <= i6; ++l)
2156 {
2157 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2158 257] * b[l + j * b_dim1];
2159 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2160 257] * b[l + j * b_dim1];
2161 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2162 257] * b[l + j * b_dim1];
2163 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2164 257] * b[l + j * b_dim1];
2165 }
2166 c[i + j * c_dim1] = f11;
2167 c[i + 1 + j * c_dim1] = f21;
2168 c[i + 2 + j * c_dim1] = f31;
2169 c[i + 3 + j * c_dim1] = f41;
2170 }
2171 i5 = ii + isec - 1;
2172 for (i = ii + uisec; i <= i5; ++i)
2173 {
2174 f11 = c[i + j * c_dim1];
2175 i6 = ll + lsec - 1;
2176 for (l = ll; l <= i6; ++l)
2177 {
2178 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2179 257] * b[l + j * b_dim1];
2180 }
2181 c[i + j * c_dim1] = f11;
2182 }
2183 }
2184 }
2185 }
2186 }
2187 }
2188 free(t1);
2189 return;
2190 }
2191 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2192 {
2193 if (GFC_DESCRIPTOR_RANK (a) != 1)
2194 {
2195 const GFC_INTEGER_4 *restrict abase_x;
2196 const GFC_INTEGER_4 *restrict bbase_y;
2197 GFC_INTEGER_4 *restrict dest_y;
2198 GFC_INTEGER_4 s;
2199
2200 for (y = 0; y < ycount; y++)
2201 {
2202 bbase_y = &bbase[y*bystride];
2203 dest_y = &dest[y*rystride];
2204 for (x = 0; x < xcount; x++)
2205 {
2206 abase_x = &abase[x*axstride];
2207 s = (GFC_INTEGER_4) 0;
2208 for (n = 0; n < count; n++)
2209 s += abase_x[n] * bbase_y[n];
2210 dest_y[x] = s;
2211 }
2212 }
2213 }
2214 else
2215 {
2216 const GFC_INTEGER_4 *restrict bbase_y;
2217 GFC_INTEGER_4 s;
2218
2219 for (y = 0; y < ycount; y++)
2220 {
2221 bbase_y = &bbase[y*bystride];
2222 s = (GFC_INTEGER_4) 0;
2223 for (n = 0; n < count; n++)
2224 s += abase[n*axstride] * bbase_y[n];
2225 dest[y*rystride] = s;
2226 }
2227 }
2228 }
2229 else if (axstride < aystride)
2230 {
2231 for (y = 0; y < ycount; y++)
2232 for (x = 0; x < xcount; x++)
2233 dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
2234
2235 for (y = 0; y < ycount; y++)
2236 for (n = 0; n < count; n++)
2237 for (x = 0; x < xcount; x++)
2238 /* dest[x,y] += a[x,n] * b[n,y] */
2239 dest[x*rxstride + y*rystride] +=
2240 abase[x*axstride + n*aystride] *
2241 bbase[n*bxstride + y*bystride];
2242 }
2243 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2244 {
2245 const GFC_INTEGER_4 *restrict bbase_y;
2246 GFC_INTEGER_4 s;
2247
2248 for (y = 0; y < ycount; y++)
2249 {
2250 bbase_y = &bbase[y*bystride];
2251 s = (GFC_INTEGER_4) 0;
2252 for (n = 0; n < count; n++)
2253 s += abase[n*axstride] * bbase_y[n*bxstride];
2254 dest[y*rxstride] = s;
2255 }
2256 }
2257 else
2258 {
2259 const GFC_INTEGER_4 *restrict abase_x;
2260 const GFC_INTEGER_4 *restrict bbase_y;
2261 GFC_INTEGER_4 *restrict dest_y;
2262 GFC_INTEGER_4 s;
2263
2264 for (y = 0; y < ycount; y++)
2265 {
2266 bbase_y = &bbase[y*bystride];
2267 dest_y = &dest[y*rystride];
2268 for (x = 0; x < xcount; x++)
2269 {
2270 abase_x = &abase[x*axstride];
2271 s = (GFC_INTEGER_4) 0;
2272 for (n = 0; n < count; n++)
2273 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2274 dest_y[x*rxstride] = s;
2275 }
2276 }
2277 }
2278 }
2279 #undef POW3
2280 #undef min
2281 #undef max
2282
2283
2284 /* Compiling main function, with selection code for the processor. */
2285
2286 /* Currently, this is i386 only. Adjust for other architectures. */
2287
2288 #include <config/i386/cpuinfo.h>
2289 void matmul_i4 (gfc_array_i4 * const restrict retarray,
2290 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2291 int blas_limit, blas_call gemm)
2292 {
2293 static void (*matmul_p) (gfc_array_i4 * const restrict retarray,
2294 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2295 int blas_limit, blas_call gemm);
2296
2297 void (*matmul_fn) (gfc_array_i4 * const restrict retarray,
2298 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2299 int blas_limit, blas_call gemm);
2300
2301 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2302 if (matmul_fn == NULL)
2303 {
2304 matmul_fn = matmul_i4_vanilla;
2305 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2306 {
2307 /* Run down the available processors in order of preference. */
2308 #ifdef HAVE_AVX512F
2309 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2310 {
2311 matmul_fn = matmul_i4_avx512f;
2312 goto store;
2313 }
2314
2315 #endif /* HAVE_AVX512F */
2316
2317 #ifdef HAVE_AVX2
2318 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2319 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2320 {
2321 matmul_fn = matmul_i4_avx2;
2322 goto store;
2323 }
2324
2325 #endif
2326
2327 #ifdef HAVE_AVX
2328 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2329 {
2330 matmul_fn = matmul_i4_avx;
2331 goto store;
2332 }
2333 #endif /* HAVE_AVX */
2334 }
2335 store:
2336 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2337 }
2338
2339 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2340 }
2341
2342 #else /* Just the vanilla function. */
2343
2344 void
2345 matmul_i4 (gfc_array_i4 * const restrict retarray,
2346 gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
2347 int blas_limit, blas_call gemm)
2348 {
2349 const GFC_INTEGER_4 * restrict abase;
2350 const GFC_INTEGER_4 * restrict bbase;
2351 GFC_INTEGER_4 * restrict dest;
2352
2353 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2354 index_type x, y, n, count, xcount, ycount;
2355
2356 assert (GFC_DESCRIPTOR_RANK (a) == 2
2357 || GFC_DESCRIPTOR_RANK (b) == 2);
2358
2359 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2360
2361 Either A or B (but not both) can be rank 1:
2362
2363 o One-dimensional argument A is implicitly treated as a row matrix
2364 dimensioned [1,count], so xcount=1.
2365
2366 o One-dimensional argument B is implicitly treated as a column matrix
2367 dimensioned [count, 1], so ycount=1.
2368 */
2369
2370 if (retarray->base_addr == NULL)
2371 {
2372 if (GFC_DESCRIPTOR_RANK (a) == 1)
2373 {
2374 GFC_DIMENSION_SET(retarray->dim[0], 0,
2375 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2376 }
2377 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2378 {
2379 GFC_DIMENSION_SET(retarray->dim[0], 0,
2380 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2381 }
2382 else
2383 {
2384 GFC_DIMENSION_SET(retarray->dim[0], 0,
2385 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2386
2387 GFC_DIMENSION_SET(retarray->dim[1], 0,
2388 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2389 GFC_DESCRIPTOR_EXTENT(retarray,0));
2390 }
2391
2392 retarray->base_addr
2393 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
2394 retarray->offset = 0;
2395 }
2396 else if (unlikely (compile_options.bounds_check))
2397 {
2398 index_type ret_extent, arg_extent;
2399
2400 if (GFC_DESCRIPTOR_RANK (a) == 1)
2401 {
2402 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2403 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2404 if (arg_extent != ret_extent)
2405 runtime_error ("Incorrect extent in return array in"
2406 " MATMUL intrinsic: is %ld, should be %ld",
2407 (long int) ret_extent, (long int) arg_extent);
2408 }
2409 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2410 {
2411 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2412 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2413 if (arg_extent != ret_extent)
2414 runtime_error ("Incorrect extent in return array in"
2415 " MATMUL intrinsic: is %ld, should be %ld",
2416 (long int) ret_extent, (long int) arg_extent);
2417 }
2418 else
2419 {
2420 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2421 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2422 if (arg_extent != ret_extent)
2423 runtime_error ("Incorrect extent in return array in"
2424 " MATMUL intrinsic for dimension 1:"
2425 " is %ld, should be %ld",
2426 (long int) ret_extent, (long int) arg_extent);
2427
2428 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2429 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2430 if (arg_extent != ret_extent)
2431 runtime_error ("Incorrect extent in return array in"
2432 " MATMUL intrinsic for dimension 2:"
2433 " is %ld, should be %ld",
2434 (long int) ret_extent, (long int) arg_extent);
2435 }
2436 }
2437
2438
2439 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2440 {
2441 /* One-dimensional result may be addressed in the code below
2442 either as a row or a column matrix. We want both cases to
2443 work. */
2444 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2445 }
2446 else
2447 {
2448 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2449 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2450 }
2451
2452
2453 if (GFC_DESCRIPTOR_RANK (a) == 1)
2454 {
2455 /* Treat it as a a row matrix A[1,count]. */
2456 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2457 aystride = 1;
2458
2459 xcount = 1;
2460 count = GFC_DESCRIPTOR_EXTENT(a,0);
2461 }
2462 else
2463 {
2464 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2465 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2466
2467 count = GFC_DESCRIPTOR_EXTENT(a,1);
2468 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2469 }
2470
2471 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2472 {
2473 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2474 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2475 }
2476
2477 if (GFC_DESCRIPTOR_RANK (b) == 1)
2478 {
2479 /* Treat it as a column matrix B[count,1] */
2480 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2481
2482 /* bystride should never be used for 1-dimensional b.
2483 in case it is we want it to cause a segfault, rather than
2484 an incorrect result. */
2485 bystride = 0xDEADBEEF;
2486 ycount = 1;
2487 }
2488 else
2489 {
2490 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2491 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2492 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2493 }
2494
2495 abase = a->base_addr;
2496 bbase = b->base_addr;
2497 dest = retarray->base_addr;
2498
2499 /* Now that everything is set up, we perform the multiplication
2500 itself. */
2501
2502 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2503 #define min(a,b) ((a) <= (b) ? (a) : (b))
2504 #define max(a,b) ((a) >= (b) ? (a) : (b))
2505
2506 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2507 && (bxstride == 1 || bystride == 1)
2508 && (((float) xcount) * ((float) ycount) * ((float) count)
2509 > POW3(blas_limit)))
2510 {
2511 const int m = xcount, n = ycount, k = count, ldc = rystride;
2512 const GFC_INTEGER_4 one = 1, zero = 0;
2513 const int lda = (axstride == 1) ? aystride : axstride,
2514 ldb = (bxstride == 1) ? bystride : bxstride;
2515
2516 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2517 {
2518 assert (gemm != NULL);
2519 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2520 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2521 &ldc, 1, 1);
2522 return;
2523 }
2524 }
2525
2526 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2527 {
2528 /* This block of code implements a tuned matmul, derived from
2529 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2530
2531 Bo Kagstrom and Per Ling
2532 Department of Computing Science
2533 Umea University
2534 S-901 87 Umea, Sweden
2535
2536 from netlib.org, translated to C, and modified for matmul.m4. */
2537
2538 const GFC_INTEGER_4 *a, *b;
2539 GFC_INTEGER_4 *c;
2540 const index_type m = xcount, n = ycount, k = count;
2541
2542 /* System generated locals */
2543 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2544 i1, i2, i3, i4, i5, i6;
2545
2546 /* Local variables */
2547 GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
2548 f13, f14, f23, f24, f33, f34, f43, f44;
2549 index_type i, j, l, ii, jj, ll;
2550 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2551 GFC_INTEGER_4 *t1;
2552
2553 a = abase;
2554 b = bbase;
2555 c = retarray->base_addr;
2556
2557 /* Parameter adjustments */
2558 c_dim1 = rystride;
2559 c_offset = 1 + c_dim1;
2560 c -= c_offset;
2561 a_dim1 = aystride;
2562 a_offset = 1 + a_dim1;
2563 a -= a_offset;
2564 b_dim1 = bystride;
2565 b_offset = 1 + b_dim1;
2566 b -= b_offset;
2567
2568 /* Early exit if possible */
2569 if (m == 0 || n == 0 || k == 0)
2570 return;
2571
2572 /* Adjust size of t1 to what is needed. */
2573 index_type t1_dim;
2574 t1_dim = (a_dim1-1) * 256 + b_dim1;
2575 if (t1_dim > 65536)
2576 t1_dim = 65536;
2577
2578 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
2579
2580 /* Empty c first. */
2581 for (j=1; j<=n; j++)
2582 for (i=1; i<=m; i++)
2583 c[i + j * c_dim1] = (GFC_INTEGER_4)0;
2584
2585 /* Start turning the crank. */
2586 i1 = n;
2587 for (jj = 1; jj <= i1; jj += 512)
2588 {
2589 /* Computing MIN */
2590 i2 = 512;
2591 i3 = n - jj + 1;
2592 jsec = min(i2,i3);
2593 ujsec = jsec - jsec % 4;
2594 i2 = k;
2595 for (ll = 1; ll <= i2; ll += 256)
2596 {
2597 /* Computing MIN */
2598 i3 = 256;
2599 i4 = k - ll + 1;
2600 lsec = min(i3,i4);
2601 ulsec = lsec - lsec % 2;
2602
2603 i3 = m;
2604 for (ii = 1; ii <= i3; ii += 256)
2605 {
2606 /* Computing MIN */
2607 i4 = 256;
2608 i5 = m - ii + 1;
2609 isec = min(i4,i5);
2610 uisec = isec - isec % 2;
2611 i4 = ll + ulsec - 1;
2612 for (l = ll; l <= i4; l += 2)
2613 {
2614 i5 = ii + uisec - 1;
2615 for (i = ii; i <= i5; i += 2)
2616 {
2617 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2618 a[i + l * a_dim1];
2619 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2620 a[i + (l + 1) * a_dim1];
2621 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2622 a[i + 1 + l * a_dim1];
2623 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2624 a[i + 1 + (l + 1) * a_dim1];
2625 }
2626 if (uisec < isec)
2627 {
2628 t1[l - ll + 1 + (isec << 8) - 257] =
2629 a[ii + isec - 1 + l * a_dim1];
2630 t1[l - ll + 2 + (isec << 8) - 257] =
2631 a[ii + isec - 1 + (l + 1) * a_dim1];
2632 }
2633 }
2634 if (ulsec < lsec)
2635 {
2636 i4 = ii + isec - 1;
2637 for (i = ii; i<= i4; ++i)
2638 {
2639 t1[lsec + ((i - ii + 1) << 8) - 257] =
2640 a[i + (ll + lsec - 1) * a_dim1];
2641 }
2642 }
2643
2644 uisec = isec - isec % 4;
2645 i4 = jj + ujsec - 1;
2646 for (j = jj; j <= i4; j += 4)
2647 {
2648 i5 = ii + uisec - 1;
2649 for (i = ii; i <= i5; i += 4)
2650 {
2651 f11 = c[i + j * c_dim1];
2652 f21 = c[i + 1 + j * c_dim1];
2653 f12 = c[i + (j + 1) * c_dim1];
2654 f22 = c[i + 1 + (j + 1) * c_dim1];
2655 f13 = c[i + (j + 2) * c_dim1];
2656 f23 = c[i + 1 + (j + 2) * c_dim1];
2657 f14 = c[i + (j + 3) * c_dim1];
2658 f24 = c[i + 1 + (j + 3) * c_dim1];
2659 f31 = c[i + 2 + j * c_dim1];
2660 f41 = c[i + 3 + j * c_dim1];
2661 f32 = c[i + 2 + (j + 1) * c_dim1];
2662 f42 = c[i + 3 + (j + 1) * c_dim1];
2663 f33 = c[i + 2 + (j + 2) * c_dim1];
2664 f43 = c[i + 3 + (j + 2) * c_dim1];
2665 f34 = c[i + 2 + (j + 3) * c_dim1];
2666 f44 = c[i + 3 + (j + 3) * c_dim1];
2667 i6 = ll + lsec - 1;
2668 for (l = ll; l <= i6; ++l)
2669 {
2670 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2671 * b[l + j * b_dim1];
2672 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2673 * b[l + j * b_dim1];
2674 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2675 * b[l + (j + 1) * b_dim1];
2676 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2677 * b[l + (j + 1) * b_dim1];
2678 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2679 * b[l + (j + 2) * b_dim1];
2680 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2681 * b[l + (j + 2) * b_dim1];
2682 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2683 * b[l + (j + 3) * b_dim1];
2684 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2685 * b[l + (j + 3) * b_dim1];
2686 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2687 * b[l + j * b_dim1];
2688 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2689 * b[l + j * b_dim1];
2690 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2691 * b[l + (j + 1) * b_dim1];
2692 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2693 * b[l + (j + 1) * b_dim1];
2694 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2695 * b[l + (j + 2) * b_dim1];
2696 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2697 * b[l + (j + 2) * b_dim1];
2698 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2699 * b[l + (j + 3) * b_dim1];
2700 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2701 * b[l + (j + 3) * b_dim1];
2702 }
2703 c[i + j * c_dim1] = f11;
2704 c[i + 1 + j * c_dim1] = f21;
2705 c[i + (j + 1) * c_dim1] = f12;
2706 c[i + 1 + (j + 1) * c_dim1] = f22;
2707 c[i + (j + 2) * c_dim1] = f13;
2708 c[i + 1 + (j + 2) * c_dim1] = f23;
2709 c[i + (j + 3) * c_dim1] = f14;
2710 c[i + 1 + (j + 3) * c_dim1] = f24;
2711 c[i + 2 + j * c_dim1] = f31;
2712 c[i + 3 + j * c_dim1] = f41;
2713 c[i + 2 + (j + 1) * c_dim1] = f32;
2714 c[i + 3 + (j + 1) * c_dim1] = f42;
2715 c[i + 2 + (j + 2) * c_dim1] = f33;
2716 c[i + 3 + (j + 2) * c_dim1] = f43;
2717 c[i + 2 + (j + 3) * c_dim1] = f34;
2718 c[i + 3 + (j + 3) * c_dim1] = f44;
2719 }
2720 if (uisec < isec)
2721 {
2722 i5 = ii + isec - 1;
2723 for (i = ii + uisec; i <= i5; ++i)
2724 {
2725 f11 = c[i + j * c_dim1];
2726 f12 = c[i + (j + 1) * c_dim1];
2727 f13 = c[i + (j + 2) * c_dim1];
2728 f14 = c[i + (j + 3) * c_dim1];
2729 i6 = ll + lsec - 1;
2730 for (l = ll; l <= i6; ++l)
2731 {
2732 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2733 257] * b[l + j * b_dim1];
2734 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2735 257] * b[l + (j + 1) * b_dim1];
2736 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2737 257] * b[l + (j + 2) * b_dim1];
2738 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2739 257] * b[l + (j + 3) * b_dim1];
2740 }
2741 c[i + j * c_dim1] = f11;
2742 c[i + (j + 1) * c_dim1] = f12;
2743 c[i + (j + 2) * c_dim1] = f13;
2744 c[i + (j + 3) * c_dim1] = f14;
2745 }
2746 }
2747 }
2748 if (ujsec < jsec)
2749 {
2750 i4 = jj + jsec - 1;
2751 for (j = jj + ujsec; j <= i4; ++j)
2752 {
2753 i5 = ii + uisec - 1;
2754 for (i = ii; i <= i5; i += 4)
2755 {
2756 f11 = c[i + j * c_dim1];
2757 f21 = c[i + 1 + j * c_dim1];
2758 f31 = c[i + 2 + j * c_dim1];
2759 f41 = c[i + 3 + j * c_dim1];
2760 i6 = ll + lsec - 1;
2761 for (l = ll; l <= i6; ++l)
2762 {
2763 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2764 257] * b[l + j * b_dim1];
2765 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2766 257] * b[l + j * b_dim1];
2767 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2768 257] * b[l + j * b_dim1];
2769 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2770 257] * b[l + j * b_dim1];
2771 }
2772 c[i + j * c_dim1] = f11;
2773 c[i + 1 + j * c_dim1] = f21;
2774 c[i + 2 + j * c_dim1] = f31;
2775 c[i + 3 + j * c_dim1] = f41;
2776 }
2777 i5 = ii + isec - 1;
2778 for (i = ii + uisec; i <= i5; ++i)
2779 {
2780 f11 = c[i + j * c_dim1];
2781 i6 = ll + lsec - 1;
2782 for (l = ll; l <= i6; ++l)
2783 {
2784 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2785 257] * b[l + j * b_dim1];
2786 }
2787 c[i + j * c_dim1] = f11;
2788 }
2789 }
2790 }
2791 }
2792 }
2793 }
2794 free(t1);
2795 return;
2796 }
2797 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2798 {
2799 if (GFC_DESCRIPTOR_RANK (a) != 1)
2800 {
2801 const GFC_INTEGER_4 *restrict abase_x;
2802 const GFC_INTEGER_4 *restrict bbase_y;
2803 GFC_INTEGER_4 *restrict dest_y;
2804 GFC_INTEGER_4 s;
2805
2806 for (y = 0; y < ycount; y++)
2807 {
2808 bbase_y = &bbase[y*bystride];
2809 dest_y = &dest[y*rystride];
2810 for (x = 0; x < xcount; x++)
2811 {
2812 abase_x = &abase[x*axstride];
2813 s = (GFC_INTEGER_4) 0;
2814 for (n = 0; n < count; n++)
2815 s += abase_x[n] * bbase_y[n];
2816 dest_y[x] = s;
2817 }
2818 }
2819 }
2820 else
2821 {
2822 const GFC_INTEGER_4 *restrict bbase_y;
2823 GFC_INTEGER_4 s;
2824
2825 for (y = 0; y < ycount; y++)
2826 {
2827 bbase_y = &bbase[y*bystride];
2828 s = (GFC_INTEGER_4) 0;
2829 for (n = 0; n < count; n++)
2830 s += abase[n*axstride] * bbase_y[n];
2831 dest[y*rystride] = s;
2832 }
2833 }
2834 }
2835 else if (axstride < aystride)
2836 {
2837 for (y = 0; y < ycount; y++)
2838 for (x = 0; x < xcount; x++)
2839 dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
2840
2841 for (y = 0; y < ycount; y++)
2842 for (n = 0; n < count; n++)
2843 for (x = 0; x < xcount; x++)
2844 /* dest[x,y] += a[x,n] * b[n,y] */
2845 dest[x*rxstride + y*rystride] +=
2846 abase[x*axstride + n*aystride] *
2847 bbase[n*bxstride + y*bystride];
2848 }
2849 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2850 {
2851 const GFC_INTEGER_4 *restrict bbase_y;
2852 GFC_INTEGER_4 s;
2853
2854 for (y = 0; y < ycount; y++)
2855 {
2856 bbase_y = &bbase[y*bystride];
2857 s = (GFC_INTEGER_4) 0;
2858 for (n = 0; n < count; n++)
2859 s += abase[n*axstride] * bbase_y[n*bxstride];
2860 dest[y*rxstride] = s;
2861 }
2862 }
2863 else
2864 {
2865 const GFC_INTEGER_4 *restrict abase_x;
2866 const GFC_INTEGER_4 *restrict bbase_y;
2867 GFC_INTEGER_4 *restrict dest_y;
2868 GFC_INTEGER_4 s;
2869
2870 for (y = 0; y < ycount; y++)
2871 {
2872 bbase_y = &bbase[y*bystride];
2873 dest_y = &dest[y*rystride];
2874 for (x = 0; x < xcount; x++)
2875 {
2876 abase_x = &abase[x*axstride];
2877 s = (GFC_INTEGER_4) 0;
2878 for (n = 0; n < count; n++)
2879 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2880 dest_y[x*rxstride] = s;
2881 }
2882 }
2883 }
2884 }
2885 #undef POW3
2886 #undef min
2887 #undef max
2888
2889 #endif
2890 #endif
2891