]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmul_i8.c
name-lookup.h (pushdecl_with_scope): Replace with ...
[thirdparty/gcc.git] / libgfortran / generated / matmul_i8.c
CommitLineData
6de9cd9a 1/* Implementation of the MATMUL intrinsic
cbe34bb5 2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
6de9cd9a
DN
3 Contributed by Paul Brook <paul@nowt.org>
4
21d1335b 5This file is part of the GNU Fortran runtime library (libgfortran).
6de9cd9a
DN
6
7Libgfortran is free software; you can redistribute it and/or
57dea9f6 8modify it under the terms of the GNU General Public
6de9cd9a 9License as published by the Free Software Foundation; either
748086b7 10version 3 of the License, or (at your option) any later version.
6de9cd9a
DN
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
57dea9f6 15GNU General Public License for more details.
6de9cd9a 16
748086b7
JJ
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
6de9cd9a 25
36ae8a61 26#include "libgfortran.h"
410d3bba 27#include <string.h>
6de9cd9a 28#include <assert.h>
36ae8a61 29
6de9cd9a 30
644cb69f
FXC
31#if defined (HAVE_GFC_INTEGER_8)
32
5a0aad31 33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07 34 passed to us by the front-end, in which case we call it for large
5a0aad31
FXC
35 matrices. */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
39 const int *, const GFC_INTEGER_8 *, const int *,
40 const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
41 int, int);
42
1524f80b
RS
43/* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
410d3bba
VL
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
1524f80b
RS
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
410d3bba 60 DO I=1,M
1524f80b
RS
61 S = 0
62 DO K=1,COUNT
5a0aad31 63 S = S+A(I,K)*B(K,J)
1524f80b
RS
64 C(I,J) = S
65 ENDIF
410d3bba
VL
66*/
67
5a0aad31
FXC
68/* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
85206901 72extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
5a0aad31
FXC
73 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
7f68c75f 75export_proto(matmul_i8);
7d7b8bfe 76
31cfd832
TK
77/* Put exhaustive list of possible architectures here here, ORed together. */
78
79#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81#ifdef HAVE_AVX
82static void
83matmul_i8_avx (gfc_array_i8 * const restrict retarray,
84 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86static void
87matmul_i8_avx (gfc_array_i8 * const restrict retarray,
88 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90{
91 const GFC_INTEGER_8 * restrict abase;
92 const GFC_INTEGER_8 * restrict bbase;
93 GFC_INTEGER_8 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110*/
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245#define min(a,b) ((a) <= (b) ? (a) : (b))
246#define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_INTEGER_8 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_INTEGER_8 *a, *b;
281 GFC_INTEGER_8 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_INTEGER_8 t1[65536], /* was [256][256] */
290 f11, f12, f21, f22, f31, f32, f41, f42,
291 f13, f14, f23, f24, f33, f34, f43, f44;
292 index_type i, j, l, ii, jj, ll;
293 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
294
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
298
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
309
310 /* Early exit if possible */
311 if (m == 0 || n == 0 || k == 0)
312 return;
313
314 /* Empty c first. */
315 for (j=1; j<=n; j++)
316 for (i=1; i<=m; i++)
317 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
318
319 /* Start turning the crank. */
320 i1 = n;
321 for (jj = 1; jj <= i1; jj += 512)
322 {
323 /* Computing MIN */
324 i2 = 512;
325 i3 = n - jj + 1;
326 jsec = min(i2,i3);
327 ujsec = jsec - jsec % 4;
328 i2 = k;
329 for (ll = 1; ll <= i2; ll += 256)
330 {
331 /* Computing MIN */
332 i3 = 256;
333 i4 = k - ll + 1;
334 lsec = min(i3,i4);
335 ulsec = lsec - lsec % 2;
336
337 i3 = m;
338 for (ii = 1; ii <= i3; ii += 256)
339 {
340 /* Computing MIN */
341 i4 = 256;
342 i5 = m - ii + 1;
343 isec = min(i4,i5);
344 uisec = isec - isec % 2;
345 i4 = ll + ulsec - 1;
346 for (l = ll; l <= i4; l += 2)
347 {
348 i5 = ii + uisec - 1;
349 for (i = ii; i <= i5; i += 2)
350 {
351 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
352 a[i + l * a_dim1];
353 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
354 a[i + (l + 1) * a_dim1];
355 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
356 a[i + 1 + l * a_dim1];
357 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
358 a[i + 1 + (l + 1) * a_dim1];
359 }
360 if (uisec < isec)
361 {
362 t1[l - ll + 1 + (isec << 8) - 257] =
363 a[ii + isec - 1 + l * a_dim1];
364 t1[l - ll + 2 + (isec << 8) - 257] =
365 a[ii + isec - 1 + (l + 1) * a_dim1];
366 }
367 }
368 if (ulsec < lsec)
369 {
370 i4 = ii + isec - 1;
371 for (i = ii; i<= i4; ++i)
372 {
373 t1[lsec + ((i - ii + 1) << 8) - 257] =
374 a[i + (ll + lsec - 1) * a_dim1];
375 }
376 }
377
378 uisec = isec - isec % 4;
379 i4 = jj + ujsec - 1;
380 for (j = jj; j <= i4; j += 4)
381 {
382 i5 = ii + uisec - 1;
383 for (i = ii; i <= i5; i += 4)
384 {
385 f11 = c[i + j * c_dim1];
386 f21 = c[i + 1 + j * c_dim1];
387 f12 = c[i + (j + 1) * c_dim1];
388 f22 = c[i + 1 + (j + 1) * c_dim1];
389 f13 = c[i + (j + 2) * c_dim1];
390 f23 = c[i + 1 + (j + 2) * c_dim1];
391 f14 = c[i + (j + 3) * c_dim1];
392 f24 = c[i + 1 + (j + 3) * c_dim1];
393 f31 = c[i + 2 + j * c_dim1];
394 f41 = c[i + 3 + j * c_dim1];
395 f32 = c[i + 2 + (j + 1) * c_dim1];
396 f42 = c[i + 3 + (j + 1) * c_dim1];
397 f33 = c[i + 2 + (j + 2) * c_dim1];
398 f43 = c[i + 3 + (j + 2) * c_dim1];
399 f34 = c[i + 2 + (j + 3) * c_dim1];
400 f44 = c[i + 3 + (j + 3) * c_dim1];
401 i6 = ll + lsec - 1;
402 for (l = ll; l <= i6; ++l)
403 {
404 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405 * b[l + j * b_dim1];
406 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407 * b[l + j * b_dim1];
408 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409 * b[l + (j + 1) * b_dim1];
410 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411 * b[l + (j + 1) * b_dim1];
412 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
413 * b[l + (j + 2) * b_dim1];
414 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
415 * b[l + (j + 2) * b_dim1];
416 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
417 * b[l + (j + 3) * b_dim1];
418 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
419 * b[l + (j + 3) * b_dim1];
420 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421 * b[l + j * b_dim1];
422 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423 * b[l + j * b_dim1];
424 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425 * b[l + (j + 1) * b_dim1];
426 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427 * b[l + (j + 1) * b_dim1];
428 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
429 * b[l + (j + 2) * b_dim1];
430 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
431 * b[l + (j + 2) * b_dim1];
432 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
433 * b[l + (j + 3) * b_dim1];
434 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
435 * b[l + (j + 3) * b_dim1];
436 }
437 c[i + j * c_dim1] = f11;
438 c[i + 1 + j * c_dim1] = f21;
439 c[i + (j + 1) * c_dim1] = f12;
440 c[i + 1 + (j + 1) * c_dim1] = f22;
441 c[i + (j + 2) * c_dim1] = f13;
442 c[i + 1 + (j + 2) * c_dim1] = f23;
443 c[i + (j + 3) * c_dim1] = f14;
444 c[i + 1 + (j + 3) * c_dim1] = f24;
445 c[i + 2 + j * c_dim1] = f31;
446 c[i + 3 + j * c_dim1] = f41;
447 c[i + 2 + (j + 1) * c_dim1] = f32;
448 c[i + 3 + (j + 1) * c_dim1] = f42;
449 c[i + 2 + (j + 2) * c_dim1] = f33;
450 c[i + 3 + (j + 2) * c_dim1] = f43;
451 c[i + 2 + (j + 3) * c_dim1] = f34;
452 c[i + 3 + (j + 3) * c_dim1] = f44;
453 }
454 if (uisec < isec)
455 {
456 i5 = ii + isec - 1;
457 for (i = ii + uisec; i <= i5; ++i)
458 {
459 f11 = c[i + j * c_dim1];
460 f12 = c[i + (j + 1) * c_dim1];
461 f13 = c[i + (j + 2) * c_dim1];
462 f14 = c[i + (j + 3) * c_dim1];
463 i6 = ll + lsec - 1;
464 for (l = ll; l <= i6; ++l)
465 {
466 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
467 257] * b[l + j * b_dim1];
468 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
469 257] * b[l + (j + 1) * b_dim1];
470 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
471 257] * b[l + (j + 2) * b_dim1];
472 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
473 257] * b[l + (j + 3) * b_dim1];
474 }
475 c[i + j * c_dim1] = f11;
476 c[i + (j + 1) * c_dim1] = f12;
477 c[i + (j + 2) * c_dim1] = f13;
478 c[i + (j + 3) * c_dim1] = f14;
479 }
480 }
481 }
482 if (ujsec < jsec)
483 {
484 i4 = jj + jsec - 1;
485 for (j = jj + ujsec; j <= i4; ++j)
486 {
487 i5 = ii + uisec - 1;
488 for (i = ii; i <= i5; i += 4)
489 {
490 f11 = c[i + j * c_dim1];
491 f21 = c[i + 1 + j * c_dim1];
492 f31 = c[i + 2 + j * c_dim1];
493 f41 = c[i + 3 + j * c_dim1];
494 i6 = ll + lsec - 1;
495 for (l = ll; l <= i6; ++l)
496 {
497 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
498 257] * b[l + j * b_dim1];
499 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
500 257] * b[l + j * b_dim1];
501 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
502 257] * b[l + j * b_dim1];
503 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
504 257] * b[l + j * b_dim1];
505 }
506 c[i + j * c_dim1] = f11;
507 c[i + 1 + j * c_dim1] = f21;
508 c[i + 2 + j * c_dim1] = f31;
509 c[i + 3 + j * c_dim1] = f41;
510 }
511 i5 = ii + isec - 1;
512 for (i = ii + uisec; i <= i5; ++i)
513 {
514 f11 = c[i + j * c_dim1];
515 i6 = ll + lsec - 1;
516 for (l = ll; l <= i6; ++l)
517 {
518 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
519 257] * b[l + j * b_dim1];
520 }
521 c[i + j * c_dim1] = f11;
522 }
523 }
524 }
525 }
526 }
527 }
528 return;
529 }
530 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
531 {
532 if (GFC_DESCRIPTOR_RANK (a) != 1)
533 {
534 const GFC_INTEGER_8 *restrict abase_x;
535 const GFC_INTEGER_8 *restrict bbase_y;
536 GFC_INTEGER_8 *restrict dest_y;
537 GFC_INTEGER_8 s;
538
539 for (y = 0; y < ycount; y++)
540 {
541 bbase_y = &bbase[y*bystride];
542 dest_y = &dest[y*rystride];
543 for (x = 0; x < xcount; x++)
544 {
545 abase_x = &abase[x*axstride];
546 s = (GFC_INTEGER_8) 0;
547 for (n = 0; n < count; n++)
548 s += abase_x[n] * bbase_y[n];
549 dest_y[x] = s;
550 }
551 }
552 }
553 else
554 {
555 const GFC_INTEGER_8 *restrict bbase_y;
556 GFC_INTEGER_8 s;
557
558 for (y = 0; y < ycount; y++)
559 {
560 bbase_y = &bbase[y*bystride];
561 s = (GFC_INTEGER_8) 0;
562 for (n = 0; n < count; n++)
563 s += abase[n*axstride] * bbase_y[n];
564 dest[y*rystride] = s;
565 }
566 }
567 }
568 else if (axstride < aystride)
569 {
570 for (y = 0; y < ycount; y++)
571 for (x = 0; x < xcount; x++)
572 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
573
574 for (y = 0; y < ycount; y++)
575 for (n = 0; n < count; n++)
576 for (x = 0; x < xcount; x++)
577 /* dest[x,y] += a[x,n] * b[n,y] */
578 dest[x*rxstride + y*rystride] +=
579 abase[x*axstride + n*aystride] *
580 bbase[n*bxstride + y*bystride];
581 }
582 else if (GFC_DESCRIPTOR_RANK (a) == 1)
583 {
584 const GFC_INTEGER_8 *restrict bbase_y;
585 GFC_INTEGER_8 s;
586
587 for (y = 0; y < ycount; y++)
588 {
589 bbase_y = &bbase[y*bystride];
590 s = (GFC_INTEGER_8) 0;
591 for (n = 0; n < count; n++)
592 s += abase[n*axstride] * bbase_y[n*bxstride];
593 dest[y*rxstride] = s;
594 }
595 }
596 else
597 {
598 const GFC_INTEGER_8 *restrict abase_x;
599 const GFC_INTEGER_8 *restrict bbase_y;
600 GFC_INTEGER_8 *restrict dest_y;
601 GFC_INTEGER_8 s;
602
603 for (y = 0; y < ycount; y++)
604 {
605 bbase_y = &bbase[y*bystride];
606 dest_y = &dest[y*rystride];
607 for (x = 0; x < xcount; x++)
608 {
609 abase_x = &abase[x*axstride];
610 s = (GFC_INTEGER_8) 0;
611 for (n = 0; n < count; n++)
612 s += abase_x[n*aystride] * bbase_y[n*bxstride];
613 dest_y[x*rxstride] = s;
614 }
615 }
616 }
617}
618#undef POW3
619#undef min
620#undef max
621
622#endif /* HAVE_AVX */
623
624#ifdef HAVE_AVX2
625static void
626matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
627 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
6d03bdcc 628 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
31cfd832
TK
629static void
630matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
631 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
632 int blas_limit, blas_call gemm)
633{
634 const GFC_INTEGER_8 * restrict abase;
635 const GFC_INTEGER_8 * restrict bbase;
636 GFC_INTEGER_8 * restrict dest;
637
638 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
639 index_type x, y, n, count, xcount, ycount;
640
641 assert (GFC_DESCRIPTOR_RANK (a) == 2
642 || GFC_DESCRIPTOR_RANK (b) == 2);
643
644/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
645
646 Either A or B (but not both) can be rank 1:
647
648 o One-dimensional argument A is implicitly treated as a row matrix
649 dimensioned [1,count], so xcount=1.
650
651 o One-dimensional argument B is implicitly treated as a column matrix
652 dimensioned [count, 1], so ycount=1.
653*/
654
655 if (retarray->base_addr == NULL)
656 {
657 if (GFC_DESCRIPTOR_RANK (a) == 1)
658 {
659 GFC_DIMENSION_SET(retarray->dim[0], 0,
660 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
661 }
662 else if (GFC_DESCRIPTOR_RANK (b) == 1)
663 {
664 GFC_DIMENSION_SET(retarray->dim[0], 0,
665 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
666 }
667 else
668 {
669 GFC_DIMENSION_SET(retarray->dim[0], 0,
670 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
671
672 GFC_DIMENSION_SET(retarray->dim[1], 0,
673 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
674 GFC_DESCRIPTOR_EXTENT(retarray,0));
675 }
676
677 retarray->base_addr
678 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
679 retarray->offset = 0;
680 }
681 else if (unlikely (compile_options.bounds_check))
682 {
683 index_type ret_extent, arg_extent;
684
685 if (GFC_DESCRIPTOR_RANK (a) == 1)
686 {
687 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
690 runtime_error ("Incorrect extent in return array in"
691 " MATMUL intrinsic: is %ld, should be %ld",
692 (long int) ret_extent, (long int) arg_extent);
693 }
694 else if (GFC_DESCRIPTOR_RANK (b) == 1)
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Incorrect extent in return array in"
700 " MATMUL intrinsic: is %ld, should be %ld",
701 (long int) ret_extent, (long int) arg_extent);
702 }
703 else
704 {
705 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
706 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
707 if (arg_extent != ret_extent)
708 runtime_error ("Incorrect extent in return array in"
709 " MATMUL intrinsic for dimension 1:"
710 " is %ld, should be %ld",
711 (long int) ret_extent, (long int) arg_extent);
712
713 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
714 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
715 if (arg_extent != ret_extent)
716 runtime_error ("Incorrect extent in return array in"
717 " MATMUL intrinsic for dimension 2:"
718 " is %ld, should be %ld",
719 (long int) ret_extent, (long int) arg_extent);
720 }
721 }
722
723
724 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
725 {
726 /* One-dimensional result may be addressed in the code below
727 either as a row or a column matrix. We want both cases to
728 work. */
729 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
730 }
731 else
732 {
733 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
734 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
735 }
736
737
738 if (GFC_DESCRIPTOR_RANK (a) == 1)
739 {
740 /* Treat it as a a row matrix A[1,count]. */
741 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
742 aystride = 1;
743
744 xcount = 1;
745 count = GFC_DESCRIPTOR_EXTENT(a,0);
746 }
747 else
748 {
749 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
750 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
751
752 count = GFC_DESCRIPTOR_EXTENT(a,1);
753 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
754 }
755
756 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
757 {
758 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
759 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
760 }
761
762 if (GFC_DESCRIPTOR_RANK (b) == 1)
763 {
764 /* Treat it as a column matrix B[count,1] */
765 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
766
767 /* bystride should never be used for 1-dimensional b.
768 in case it is we want it to cause a segfault, rather than
769 an incorrect result. */
770 bystride = 0xDEADBEEF;
771 ycount = 1;
772 }
773 else
774 {
775 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
777 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
778 }
779
780 abase = a->base_addr;
781 bbase = b->base_addr;
782 dest = retarray->base_addr;
783
784 /* Now that everything is set up, we perform the multiplication
785 itself. */
786
787#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
788#define min(a,b) ((a) <= (b) ? (a) : (b))
789#define max(a,b) ((a) >= (b) ? (a) : (b))
790
791 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
792 && (bxstride == 1 || bystride == 1)
793 && (((float) xcount) * ((float) ycount) * ((float) count)
794 > POW3(blas_limit)))
795 {
796 const int m = xcount, n = ycount, k = count, ldc = rystride;
797 const GFC_INTEGER_8 one = 1, zero = 0;
798 const int lda = (axstride == 1) ? aystride : axstride,
799 ldb = (bxstride == 1) ? bystride : bxstride;
800
801 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
802 {
803 assert (gemm != NULL);
804 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
805 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
806 &ldc, 1, 1);
807 return;
808 }
809 }
810
811 if (rxstride == 1 && axstride == 1 && bxstride == 1)
812 {
813 /* This block of code implements a tuned matmul, derived from
814 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
815
816 Bo Kagstrom and Per Ling
817 Department of Computing Science
818 Umea University
819 S-901 87 Umea, Sweden
820
821 from netlib.org, translated to C, and modified for matmul.m4. */
822
823 const GFC_INTEGER_8 *a, *b;
824 GFC_INTEGER_8 *c;
825 const index_type m = xcount, n = ycount, k = count;
826
827 /* System generated locals */
828 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
829 i1, i2, i3, i4, i5, i6;
830
831 /* Local variables */
832 GFC_INTEGER_8 t1[65536], /* was [256][256] */
833 f11, f12, f21, f22, f31, f32, f41, f42,
834 f13, f14, f23, f24, f33, f34, f43, f44;
835 index_type i, j, l, ii, jj, ll;
836 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
837
838 a = abase;
839 b = bbase;
840 c = retarray->base_addr;
841
842 /* Parameter adjustments */
843 c_dim1 = rystride;
844 c_offset = 1 + c_dim1;
845 c -= c_offset;
846 a_dim1 = aystride;
847 a_offset = 1 + a_dim1;
848 a -= a_offset;
849 b_dim1 = bystride;
850 b_offset = 1 + b_dim1;
851 b -= b_offset;
852
853 /* Early exit if possible */
854 if (m == 0 || n == 0 || k == 0)
855 return;
856
857 /* Empty c first. */
858 for (j=1; j<=n; j++)
859 for (i=1; i<=m; i++)
860 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
861
862 /* Start turning the crank. */
863 i1 = n;
864 for (jj = 1; jj <= i1; jj += 512)
865 {
866 /* Computing MIN */
867 i2 = 512;
868 i3 = n - jj + 1;
869 jsec = min(i2,i3);
870 ujsec = jsec - jsec % 4;
871 i2 = k;
872 for (ll = 1; ll <= i2; ll += 256)
873 {
874 /* Computing MIN */
875 i3 = 256;
876 i4 = k - ll + 1;
877 lsec = min(i3,i4);
878 ulsec = lsec - lsec % 2;
879
880 i3 = m;
881 for (ii = 1; ii <= i3; ii += 256)
882 {
883 /* Computing MIN */
884 i4 = 256;
885 i5 = m - ii + 1;
886 isec = min(i4,i5);
887 uisec = isec - isec % 2;
888 i4 = ll + ulsec - 1;
889 for (l = ll; l <= i4; l += 2)
890 {
891 i5 = ii + uisec - 1;
892 for (i = ii; i <= i5; i += 2)
893 {
894 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
895 a[i + l * a_dim1];
896 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
897 a[i + (l + 1) * a_dim1];
898 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
899 a[i + 1 + l * a_dim1];
900 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
901 a[i + 1 + (l + 1) * a_dim1];
902 }
903 if (uisec < isec)
904 {
905 t1[l - ll + 1 + (isec << 8) - 257] =
906 a[ii + isec - 1 + l * a_dim1];
907 t1[l - ll + 2 + (isec << 8) - 257] =
908 a[ii + isec - 1 + (l + 1) * a_dim1];
909 }
910 }
911 if (ulsec < lsec)
912 {
913 i4 = ii + isec - 1;
914 for (i = ii; i<= i4; ++i)
915 {
916 t1[lsec + ((i - ii + 1) << 8) - 257] =
917 a[i + (ll + lsec - 1) * a_dim1];
918 }
919 }
920
921 uisec = isec - isec % 4;
922 i4 = jj + ujsec - 1;
923 for (j = jj; j <= i4; j += 4)
924 {
925 i5 = ii + uisec - 1;
926 for (i = ii; i <= i5; i += 4)
927 {
928 f11 = c[i + j * c_dim1];
929 f21 = c[i + 1 + j * c_dim1];
930 f12 = c[i + (j + 1) * c_dim1];
931 f22 = c[i + 1 + (j + 1) * c_dim1];
932 f13 = c[i + (j + 2) * c_dim1];
933 f23 = c[i + 1 + (j + 2) * c_dim1];
934 f14 = c[i + (j + 3) * c_dim1];
935 f24 = c[i + 1 + (j + 3) * c_dim1];
936 f31 = c[i + 2 + j * c_dim1];
937 f41 = c[i + 3 + j * c_dim1];
938 f32 = c[i + 2 + (j + 1) * c_dim1];
939 f42 = c[i + 3 + (j + 1) * c_dim1];
940 f33 = c[i + 2 + (j + 2) * c_dim1];
941 f43 = c[i + 3 + (j + 2) * c_dim1];
942 f34 = c[i + 2 + (j + 3) * c_dim1];
943 f44 = c[i + 3 + (j + 3) * c_dim1];
944 i6 = ll + lsec - 1;
945 for (l = ll; l <= i6; ++l)
946 {
947 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
948 * b[l + j * b_dim1];
949 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
950 * b[l + j * b_dim1];
951 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
952 * b[l + (j + 1) * b_dim1];
953 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
954 * b[l + (j + 1) * b_dim1];
955 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
956 * b[l + (j + 2) * b_dim1];
957 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
958 * b[l + (j + 2) * b_dim1];
959 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
960 * b[l + (j + 3) * b_dim1];
961 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
962 * b[l + (j + 3) * b_dim1];
963 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
964 * b[l + j * b_dim1];
965 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
966 * b[l + j * b_dim1];
967 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
968 * b[l + (j + 1) * b_dim1];
969 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
970 * b[l + (j + 1) * b_dim1];
971 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
972 * b[l + (j + 2) * b_dim1];
973 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
974 * b[l + (j + 2) * b_dim1];
975 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
976 * b[l + (j + 3) * b_dim1];
977 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
978 * b[l + (j + 3) * b_dim1];
979 }
980 c[i + j * c_dim1] = f11;
981 c[i + 1 + j * c_dim1] = f21;
982 c[i + (j + 1) * c_dim1] = f12;
983 c[i + 1 + (j + 1) * c_dim1] = f22;
984 c[i + (j + 2) * c_dim1] = f13;
985 c[i + 1 + (j + 2) * c_dim1] = f23;
986 c[i + (j + 3) * c_dim1] = f14;
987 c[i + 1 + (j + 3) * c_dim1] = f24;
988 c[i + 2 + j * c_dim1] = f31;
989 c[i + 3 + j * c_dim1] = f41;
990 c[i + 2 + (j + 1) * c_dim1] = f32;
991 c[i + 3 + (j + 1) * c_dim1] = f42;
992 c[i + 2 + (j + 2) * c_dim1] = f33;
993 c[i + 3 + (j + 2) * c_dim1] = f43;
994 c[i + 2 + (j + 3) * c_dim1] = f34;
995 c[i + 3 + (j + 3) * c_dim1] = f44;
996 }
997 if (uisec < isec)
998 {
999 i5 = ii + isec - 1;
1000 for (i = ii + uisec; i <= i5; ++i)
1001 {
1002 f11 = c[i + j * c_dim1];
1003 f12 = c[i + (j + 1) * c_dim1];
1004 f13 = c[i + (j + 2) * c_dim1];
1005 f14 = c[i + (j + 3) * c_dim1];
1006 i6 = ll + lsec - 1;
1007 for (l = ll; l <= i6; ++l)
1008 {
1009 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010 257] * b[l + j * b_dim1];
1011 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012 257] * b[l + (j + 1) * b_dim1];
1013 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014 257] * b[l + (j + 2) * b_dim1];
1015 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016 257] * b[l + (j + 3) * b_dim1];
1017 }
1018 c[i + j * c_dim1] = f11;
1019 c[i + (j + 1) * c_dim1] = f12;
1020 c[i + (j + 2) * c_dim1] = f13;
1021 c[i + (j + 3) * c_dim1] = f14;
1022 }
1023 }
1024 }
1025 if (ujsec < jsec)
1026 {
1027 i4 = jj + jsec - 1;
1028 for (j = jj + ujsec; j <= i4; ++j)
1029 {
1030 i5 = ii + uisec - 1;
1031 for (i = ii; i <= i5; i += 4)
1032 {
1033 f11 = c[i + j * c_dim1];
1034 f21 = c[i + 1 + j * c_dim1];
1035 f31 = c[i + 2 + j * c_dim1];
1036 f41 = c[i + 3 + j * c_dim1];
1037 i6 = ll + lsec - 1;
1038 for (l = ll; l <= i6; ++l)
1039 {
1040 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + j * b_dim1];
1042 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043 257] * b[l + j * b_dim1];
1044 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045 257] * b[l + j * b_dim1];
1046 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047 257] * b[l + j * b_dim1];
1048 }
1049 c[i + j * c_dim1] = f11;
1050 c[i + 1 + j * c_dim1] = f21;
1051 c[i + 2 + j * c_dim1] = f31;
1052 c[i + 3 + j * c_dim1] = f41;
1053 }
1054 i5 = ii + isec - 1;
1055 for (i = ii + uisec; i <= i5; ++i)
1056 {
1057 f11 = c[i + j * c_dim1];
1058 i6 = ll + lsec - 1;
1059 for (l = ll; l <= i6; ++l)
1060 {
1061 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062 257] * b[l + j * b_dim1];
1063 }
1064 c[i + j * c_dim1] = f11;
1065 }
1066 }
1067 }
1068 }
1069 }
1070 }
1071 return;
1072 }
1073 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1074 {
1075 if (GFC_DESCRIPTOR_RANK (a) != 1)
1076 {
1077 const GFC_INTEGER_8 *restrict abase_x;
1078 const GFC_INTEGER_8 *restrict bbase_y;
1079 GFC_INTEGER_8 *restrict dest_y;
1080 GFC_INTEGER_8 s;
1081
1082 for (y = 0; y < ycount; y++)
1083 {
1084 bbase_y = &bbase[y*bystride];
1085 dest_y = &dest[y*rystride];
1086 for (x = 0; x < xcount; x++)
1087 {
1088 abase_x = &abase[x*axstride];
1089 s = (GFC_INTEGER_8) 0;
1090 for (n = 0; n < count; n++)
1091 s += abase_x[n] * bbase_y[n];
1092 dest_y[x] = s;
1093 }
1094 }
1095 }
1096 else
1097 {
1098 const GFC_INTEGER_8 *restrict bbase_y;
1099 GFC_INTEGER_8 s;
1100
1101 for (y = 0; y < ycount; y++)
1102 {
1103 bbase_y = &bbase[y*bystride];
1104 s = (GFC_INTEGER_8) 0;
1105 for (n = 0; n < count; n++)
1106 s += abase[n*axstride] * bbase_y[n];
1107 dest[y*rystride] = s;
1108 }
1109 }
1110 }
1111 else if (axstride < aystride)
1112 {
1113 for (y = 0; y < ycount; y++)
1114 for (x = 0; x < xcount; x++)
1115 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
1116
1117 for (y = 0; y < ycount; y++)
1118 for (n = 0; n < count; n++)
1119 for (x = 0; x < xcount; x++)
1120 /* dest[x,y] += a[x,n] * b[n,y] */
1121 dest[x*rxstride + y*rystride] +=
1122 abase[x*axstride + n*aystride] *
1123 bbase[n*bxstride + y*bystride];
1124 }
1125 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1126 {
1127 const GFC_INTEGER_8 *restrict bbase_y;
1128 GFC_INTEGER_8 s;
1129
1130 for (y = 0; y < ycount; y++)
1131 {
1132 bbase_y = &bbase[y*bystride];
1133 s = (GFC_INTEGER_8) 0;
1134 for (n = 0; n < count; n++)
1135 s += abase[n*axstride] * bbase_y[n*bxstride];
1136 dest[y*rxstride] = s;
1137 }
1138 }
1139 else
1140 {
1141 const GFC_INTEGER_8 *restrict abase_x;
1142 const GFC_INTEGER_8 *restrict bbase_y;
1143 GFC_INTEGER_8 *restrict dest_y;
1144 GFC_INTEGER_8 s;
1145
1146 for (y = 0; y < ycount; y++)
1147 {
1148 bbase_y = &bbase[y*bystride];
1149 dest_y = &dest[y*rystride];
1150 for (x = 0; x < xcount; x++)
1151 {
1152 abase_x = &abase[x*axstride];
1153 s = (GFC_INTEGER_8) 0;
1154 for (n = 0; n < count; n++)
1155 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156 dest_y[x*rxstride] = s;
1157 }
1158 }
1159 }
1160}
1161#undef POW3
1162#undef min
1163#undef max
1164
1165#endif /* HAVE_AVX2 */
1166
1167#ifdef HAVE_AVX512F
1168static void
1169matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
1170 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
1171 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172static void
1173matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
1174 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
1175 int blas_limit, blas_call gemm)
1176{
1177 const GFC_INTEGER_8 * restrict abase;
1178 const GFC_INTEGER_8 * restrict bbase;
1179 GFC_INTEGER_8 * restrict dest;
1180
1181 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182 index_type x, y, n, count, xcount, ycount;
1183
1184 assert (GFC_DESCRIPTOR_RANK (a) == 2
1185 || GFC_DESCRIPTOR_RANK (b) == 2);
1186
1187/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1188
1189 Either A or B (but not both) can be rank 1:
1190
1191 o One-dimensional argument A is implicitly treated as a row matrix
1192 dimensioned [1,count], so xcount=1.
1193
1194 o One-dimensional argument B is implicitly treated as a column matrix
1195 dimensioned [count, 1], so ycount=1.
1196*/
1197
1198 if (retarray->base_addr == NULL)
1199 {
1200 if (GFC_DESCRIPTOR_RANK (a) == 1)
1201 {
1202 GFC_DIMENSION_SET(retarray->dim[0], 0,
1203 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1204 }
1205 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1206 {
1207 GFC_DIMENSION_SET(retarray->dim[0], 0,
1208 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1209 }
1210 else
1211 {
1212 GFC_DIMENSION_SET(retarray->dim[0], 0,
1213 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1214
1215 GFC_DIMENSION_SET(retarray->dim[1], 0,
1216 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217 GFC_DESCRIPTOR_EXTENT(retarray,0));
1218 }
1219
1220 retarray->base_addr
1221 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
1222 retarray->offset = 0;
1223 }
1224 else if (unlikely (compile_options.bounds_check))
1225 {
1226 index_type ret_extent, arg_extent;
1227
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1229 {
1230 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232 if (arg_extent != ret_extent)
1233 runtime_error ("Incorrect extent in return array in"
1234 " MATMUL intrinsic: is %ld, should be %ld",
1235 (long int) ret_extent, (long int) arg_extent);
1236 }
1237 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1238 {
1239 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241 if (arg_extent != ret_extent)
1242 runtime_error ("Incorrect extent in return array in"
1243 " MATMUL intrinsic: is %ld, should be %ld",
1244 (long int) ret_extent, (long int) arg_extent);
1245 }
1246 else
1247 {
1248 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250 if (arg_extent != ret_extent)
1251 runtime_error ("Incorrect extent in return array in"
1252 " MATMUL intrinsic for dimension 1:"
1253 " is %ld, should be %ld",
1254 (long int) ret_extent, (long int) arg_extent);
1255
1256 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258 if (arg_extent != ret_extent)
1259 runtime_error ("Incorrect extent in return array in"
1260 " MATMUL intrinsic for dimension 2:"
1261 " is %ld, should be %ld",
1262 (long int) ret_extent, (long int) arg_extent);
1263 }
1264 }
1265
1266
1267 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1268 {
1269 /* One-dimensional result may be addressed in the code below
1270 either as a row or a column matrix. We want both cases to
1271 work. */
1272 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1273 }
1274 else
1275 {
1276 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1278 }
1279
1280
1281 if (GFC_DESCRIPTOR_RANK (a) == 1)
1282 {
1283 /* Treat it as a a row matrix A[1,count]. */
1284 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285 aystride = 1;
1286
1287 xcount = 1;
1288 count = GFC_DESCRIPTOR_EXTENT(a,0);
1289 }
1290 else
1291 {
1292 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1294
1295 count = GFC_DESCRIPTOR_EXTENT(a,1);
1296 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1297 }
1298
1299 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1300 {
1301 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1303 }
1304
1305 if (GFC_DESCRIPTOR_RANK (b) == 1)
1306 {
1307 /* Treat it as a column matrix B[count,1] */
1308 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1309
1310 /* bystride should never be used for 1-dimensional b.
1311 in case it is we want it to cause a segfault, rather than
1312 an incorrect result. */
1313 bystride = 0xDEADBEEF;
1314 ycount = 1;
1315 }
1316 else
1317 {
1318 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1321 }
1322
1323 abase = a->base_addr;
1324 bbase = b->base_addr;
1325 dest = retarray->base_addr;
1326
1327 /* Now that everything is set up, we perform the multiplication
1328 itself. */
1329
1330#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331#define min(a,b) ((a) <= (b) ? (a) : (b))
1332#define max(a,b) ((a) >= (b) ? (a) : (b))
1333
1334 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335 && (bxstride == 1 || bystride == 1)
1336 && (((float) xcount) * ((float) ycount) * ((float) count)
1337 > POW3(blas_limit)))
1338 {
1339 const int m = xcount, n = ycount, k = count, ldc = rystride;
1340 const GFC_INTEGER_8 one = 1, zero = 0;
1341 const int lda = (axstride == 1) ? aystride : axstride,
1342 ldb = (bxstride == 1) ? bystride : bxstride;
1343
1344 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1345 {
1346 assert (gemm != NULL);
1347 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349 &ldc, 1, 1);
1350 return;
1351 }
1352 }
1353
1354 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1355 {
1356 /* This block of code implements a tuned matmul, derived from
1357 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1358
1359 Bo Kagstrom and Per Ling
1360 Department of Computing Science
1361 Umea University
1362 S-901 87 Umea, Sweden
1363
1364 from netlib.org, translated to C, and modified for matmul.m4. */
1365
1366 const GFC_INTEGER_8 *a, *b;
1367 GFC_INTEGER_8 *c;
1368 const index_type m = xcount, n = ycount, k = count;
1369
1370 /* System generated locals */
1371 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372 i1, i2, i3, i4, i5, i6;
1373
1374 /* Local variables */
1375 GFC_INTEGER_8 t1[65536], /* was [256][256] */
1376 f11, f12, f21, f22, f31, f32, f41, f42,
1377 f13, f14, f23, f24, f33, f34, f43, f44;
1378 index_type i, j, l, ii, jj, ll;
1379 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1380
1381 a = abase;
1382 b = bbase;
1383 c = retarray->base_addr;
1384
1385 /* Parameter adjustments */
1386 c_dim1 = rystride;
1387 c_offset = 1 + c_dim1;
1388 c -= c_offset;
1389 a_dim1 = aystride;
1390 a_offset = 1 + a_dim1;
1391 a -= a_offset;
1392 b_dim1 = bystride;
1393 b_offset = 1 + b_dim1;
1394 b -= b_offset;
1395
1396 /* Early exit if possible */
1397 if (m == 0 || n == 0 || k == 0)
1398 return;
1399
1400 /* Empty c first. */
1401 for (j=1; j<=n; j++)
1402 for (i=1; i<=m; i++)
1403 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
1404
1405 /* Start turning the crank. */
1406 i1 = n;
1407 for (jj = 1; jj <= i1; jj += 512)
1408 {
1409 /* Computing MIN */
1410 i2 = 512;
1411 i3 = n - jj + 1;
1412 jsec = min(i2,i3);
1413 ujsec = jsec - jsec % 4;
1414 i2 = k;
1415 for (ll = 1; ll <= i2; ll += 256)
1416 {
1417 /* Computing MIN */
1418 i3 = 256;
1419 i4 = k - ll + 1;
1420 lsec = min(i3,i4);
1421 ulsec = lsec - lsec % 2;
1422
1423 i3 = m;
1424 for (ii = 1; ii <= i3; ii += 256)
1425 {
1426 /* Computing MIN */
1427 i4 = 256;
1428 i5 = m - ii + 1;
1429 isec = min(i4,i5);
1430 uisec = isec - isec % 2;
1431 i4 = ll + ulsec - 1;
1432 for (l = ll; l <= i4; l += 2)
1433 {
1434 i5 = ii + uisec - 1;
1435 for (i = ii; i <= i5; i += 2)
1436 {
1437 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438 a[i + l * a_dim1];
1439 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440 a[i + (l + 1) * a_dim1];
1441 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442 a[i + 1 + l * a_dim1];
1443 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444 a[i + 1 + (l + 1) * a_dim1];
1445 }
1446 if (uisec < isec)
1447 {
1448 t1[l - ll + 1 + (isec << 8) - 257] =
1449 a[ii + isec - 1 + l * a_dim1];
1450 t1[l - ll + 2 + (isec << 8) - 257] =
1451 a[ii + isec - 1 + (l + 1) * a_dim1];
1452 }
1453 }
1454 if (ulsec < lsec)
1455 {
1456 i4 = ii + isec - 1;
1457 for (i = ii; i<= i4; ++i)
1458 {
1459 t1[lsec + ((i - ii + 1) << 8) - 257] =
1460 a[i + (ll + lsec - 1) * a_dim1];
1461 }
1462 }
1463
1464 uisec = isec - isec % 4;
1465 i4 = jj + ujsec - 1;
1466 for (j = jj; j <= i4; j += 4)
1467 {
1468 i5 = ii + uisec - 1;
1469 for (i = ii; i <= i5; i += 4)
1470 {
1471 f11 = c[i + j * c_dim1];
1472 f21 = c[i + 1 + j * c_dim1];
1473 f12 = c[i + (j + 1) * c_dim1];
1474 f22 = c[i + 1 + (j + 1) * c_dim1];
1475 f13 = c[i + (j + 2) * c_dim1];
1476 f23 = c[i + 1 + (j + 2) * c_dim1];
1477 f14 = c[i + (j + 3) * c_dim1];
1478 f24 = c[i + 1 + (j + 3) * c_dim1];
1479 f31 = c[i + 2 + j * c_dim1];
1480 f41 = c[i + 3 + j * c_dim1];
1481 f32 = c[i + 2 + (j + 1) * c_dim1];
1482 f42 = c[i + 3 + (j + 1) * c_dim1];
1483 f33 = c[i + 2 + (j + 2) * c_dim1];
1484 f43 = c[i + 3 + (j + 2) * c_dim1];
1485 f34 = c[i + 2 + (j + 3) * c_dim1];
1486 f44 = c[i + 3 + (j + 3) * c_dim1];
1487 i6 = ll + lsec - 1;
1488 for (l = ll; l <= i6; ++l)
1489 {
1490 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491 * b[l + j * b_dim1];
1492 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493 * b[l + j * b_dim1];
1494 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495 * b[l + (j + 1) * b_dim1];
1496 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497 * b[l + (j + 1) * b_dim1];
1498 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499 * b[l + (j + 2) * b_dim1];
1500 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501 * b[l + (j + 2) * b_dim1];
1502 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503 * b[l + (j + 3) * b_dim1];
1504 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505 * b[l + (j + 3) * b_dim1];
1506 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507 * b[l + j * b_dim1];
1508 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509 * b[l + j * b_dim1];
1510 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511 * b[l + (j + 1) * b_dim1];
1512 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513 * b[l + (j + 1) * b_dim1];
1514 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515 * b[l + (j + 2) * b_dim1];
1516 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517 * b[l + (j + 2) * b_dim1];
1518 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519 * b[l + (j + 3) * b_dim1];
1520 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521 * b[l + (j + 3) * b_dim1];
1522 }
1523 c[i + j * c_dim1] = f11;
1524 c[i + 1 + j * c_dim1] = f21;
1525 c[i + (j + 1) * c_dim1] = f12;
1526 c[i + 1 + (j + 1) * c_dim1] = f22;
1527 c[i + (j + 2) * c_dim1] = f13;
1528 c[i + 1 + (j + 2) * c_dim1] = f23;
1529 c[i + (j + 3) * c_dim1] = f14;
1530 c[i + 1 + (j + 3) * c_dim1] = f24;
1531 c[i + 2 + j * c_dim1] = f31;
1532 c[i + 3 + j * c_dim1] = f41;
1533 c[i + 2 + (j + 1) * c_dim1] = f32;
1534 c[i + 3 + (j + 1) * c_dim1] = f42;
1535 c[i + 2 + (j + 2) * c_dim1] = f33;
1536 c[i + 3 + (j + 2) * c_dim1] = f43;
1537 c[i + 2 + (j + 3) * c_dim1] = f34;
1538 c[i + 3 + (j + 3) * c_dim1] = f44;
1539 }
1540 if (uisec < isec)
1541 {
1542 i5 = ii + isec - 1;
1543 for (i = ii + uisec; i <= i5; ++i)
1544 {
1545 f11 = c[i + j * c_dim1];
1546 f12 = c[i + (j + 1) * c_dim1];
1547 f13 = c[i + (j + 2) * c_dim1];
1548 f14 = c[i + (j + 3) * c_dim1];
1549 i6 = ll + lsec - 1;
1550 for (l = ll; l <= i6; ++l)
1551 {
1552 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553 257] * b[l + j * b_dim1];
1554 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555 257] * b[l + (j + 1) * b_dim1];
1556 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557 257] * b[l + (j + 2) * b_dim1];
1558 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559 257] * b[l + (j + 3) * b_dim1];
1560 }
1561 c[i + j * c_dim1] = f11;
1562 c[i + (j + 1) * c_dim1] = f12;
1563 c[i + (j + 2) * c_dim1] = f13;
1564 c[i + (j + 3) * c_dim1] = f14;
1565 }
1566 }
1567 }
1568 if (ujsec < jsec)
1569 {
1570 i4 = jj + jsec - 1;
1571 for (j = jj + ujsec; j <= i4; ++j)
1572 {
1573 i5 = ii + uisec - 1;
1574 for (i = ii; i <= i5; i += 4)
1575 {
1576 f11 = c[i + j * c_dim1];
1577 f21 = c[i + 1 + j * c_dim1];
1578 f31 = c[i + 2 + j * c_dim1];
1579 f41 = c[i + 3 + j * c_dim1];
1580 i6 = ll + lsec - 1;
1581 for (l = ll; l <= i6; ++l)
1582 {
1583 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584 257] * b[l + j * b_dim1];
1585 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586 257] * b[l + j * b_dim1];
1587 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588 257] * b[l + j * b_dim1];
1589 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590 257] * b[l + j * b_dim1];
1591 }
1592 c[i + j * c_dim1] = f11;
1593 c[i + 1 + j * c_dim1] = f21;
1594 c[i + 2 + j * c_dim1] = f31;
1595 c[i + 3 + j * c_dim1] = f41;
1596 }
1597 i5 = ii + isec - 1;
1598 for (i = ii + uisec; i <= i5; ++i)
1599 {
1600 f11 = c[i + j * c_dim1];
1601 i6 = ll + lsec - 1;
1602 for (l = ll; l <= i6; ++l)
1603 {
1604 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605 257] * b[l + j * b_dim1];
1606 }
1607 c[i + j * c_dim1] = f11;
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 return;
1615 }
1616 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1617 {
1618 if (GFC_DESCRIPTOR_RANK (a) != 1)
1619 {
1620 const GFC_INTEGER_8 *restrict abase_x;
1621 const GFC_INTEGER_8 *restrict bbase_y;
1622 GFC_INTEGER_8 *restrict dest_y;
1623 GFC_INTEGER_8 s;
1624
1625 for (y = 0; y < ycount; y++)
1626 {
1627 bbase_y = &bbase[y*bystride];
1628 dest_y = &dest[y*rystride];
1629 for (x = 0; x < xcount; x++)
1630 {
1631 abase_x = &abase[x*axstride];
1632 s = (GFC_INTEGER_8) 0;
1633 for (n = 0; n < count; n++)
1634 s += abase_x[n] * bbase_y[n];
1635 dest_y[x] = s;
1636 }
1637 }
1638 }
1639 else
1640 {
1641 const GFC_INTEGER_8 *restrict bbase_y;
1642 GFC_INTEGER_8 s;
1643
1644 for (y = 0; y < ycount; y++)
1645 {
1646 bbase_y = &bbase[y*bystride];
1647 s = (GFC_INTEGER_8) 0;
1648 for (n = 0; n < count; n++)
1649 s += abase[n*axstride] * bbase_y[n];
1650 dest[y*rystride] = s;
1651 }
1652 }
1653 }
1654 else if (axstride < aystride)
1655 {
1656 for (y = 0; y < ycount; y++)
1657 for (x = 0; x < xcount; x++)
1658 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
1659
1660 for (y = 0; y < ycount; y++)
1661 for (n = 0; n < count; n++)
1662 for (x = 0; x < xcount; x++)
1663 /* dest[x,y] += a[x,n] * b[n,y] */
1664 dest[x*rxstride + y*rystride] +=
1665 abase[x*axstride + n*aystride] *
1666 bbase[n*bxstride + y*bystride];
1667 }
1668 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1669 {
1670 const GFC_INTEGER_8 *restrict bbase_y;
1671 GFC_INTEGER_8 s;
1672
1673 for (y = 0; y < ycount; y++)
1674 {
1675 bbase_y = &bbase[y*bystride];
1676 s = (GFC_INTEGER_8) 0;
1677 for (n = 0; n < count; n++)
1678 s += abase[n*axstride] * bbase_y[n*bxstride];
1679 dest[y*rxstride] = s;
1680 }
1681 }
1682 else
1683 {
1684 const GFC_INTEGER_8 *restrict abase_x;
1685 const GFC_INTEGER_8 *restrict bbase_y;
1686 GFC_INTEGER_8 *restrict dest_y;
1687 GFC_INTEGER_8 s;
1688
1689 for (y = 0; y < ycount; y++)
1690 {
1691 bbase_y = &bbase[y*bystride];
1692 dest_y = &dest[y*rystride];
1693 for (x = 0; x < xcount; x++)
1694 {
1695 abase_x = &abase[x*axstride];
1696 s = (GFC_INTEGER_8) 0;
1697 for (n = 0; n < count; n++)
1698 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699 dest_y[x*rxstride] = s;
1700 }
1701 }
1702 }
1703}
1704#undef POW3
1705#undef min
1706#undef max
1707
1708#endif /* HAVE_AVX512F */
1709
1710/* Function to fall back to if there is no special processor-specific version. */
1711static void
1712matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
1713 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
1714 int blas_limit, blas_call gemm)
1715{
1716 const GFC_INTEGER_8 * restrict abase;
1717 const GFC_INTEGER_8 * restrict bbase;
1718 GFC_INTEGER_8 * restrict dest;
1719
1720 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721 index_type x, y, n, count, xcount, ycount;
1722
1723 assert (GFC_DESCRIPTOR_RANK (a) == 2
1724 || GFC_DESCRIPTOR_RANK (b) == 2);
1725
1726/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1727
1728 Either A or B (but not both) can be rank 1:
1729
1730 o One-dimensional argument A is implicitly treated as a row matrix
1731 dimensioned [1,count], so xcount=1.
1732
1733 o One-dimensional argument B is implicitly treated as a column matrix
1734 dimensioned [count, 1], so ycount=1.
1735*/
1736
1737 if (retarray->base_addr == NULL)
1738 {
1739 if (GFC_DESCRIPTOR_RANK (a) == 1)
1740 {
1741 GFC_DIMENSION_SET(retarray->dim[0], 0,
1742 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1743 }
1744 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1745 {
1746 GFC_DIMENSION_SET(retarray->dim[0], 0,
1747 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1748 }
1749 else
1750 {
1751 GFC_DIMENSION_SET(retarray->dim[0], 0,
1752 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1753
1754 GFC_DIMENSION_SET(retarray->dim[1], 0,
1755 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756 GFC_DESCRIPTOR_EXTENT(retarray,0));
1757 }
1758
1759 retarray->base_addr
1760 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
1761 retarray->offset = 0;
1762 }
1763 else if (unlikely (compile_options.bounds_check))
1764 {
1765 index_type ret_extent, arg_extent;
1766
1767 if (GFC_DESCRIPTOR_RANK (a) == 1)
1768 {
1769 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771 if (arg_extent != ret_extent)
1772 runtime_error ("Incorrect extent in return array in"
1773 " MATMUL intrinsic: is %ld, should be %ld",
1774 (long int) ret_extent, (long int) arg_extent);
1775 }
1776 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1777 {
1778 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780 if (arg_extent != ret_extent)
1781 runtime_error ("Incorrect extent in return array in"
1782 " MATMUL intrinsic: is %ld, should be %ld",
1783 (long int) ret_extent, (long int) arg_extent);
1784 }
1785 else
1786 {
1787 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789 if (arg_extent != ret_extent)
1790 runtime_error ("Incorrect extent in return array in"
1791 " MATMUL intrinsic for dimension 1:"
1792 " is %ld, should be %ld",
1793 (long int) ret_extent, (long int) arg_extent);
1794
1795 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797 if (arg_extent != ret_extent)
1798 runtime_error ("Incorrect extent in return array in"
1799 " MATMUL intrinsic for dimension 2:"
1800 " is %ld, should be %ld",
1801 (long int) ret_extent, (long int) arg_extent);
1802 }
1803 }
1804
1805
1806 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1807 {
1808 /* One-dimensional result may be addressed in the code below
1809 either as a row or a column matrix. We want both cases to
1810 work. */
1811 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1812 }
1813 else
1814 {
1815 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1817 }
1818
1819
1820 if (GFC_DESCRIPTOR_RANK (a) == 1)
1821 {
1822 /* Treat it as a a row matrix A[1,count]. */
1823 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824 aystride = 1;
1825
1826 xcount = 1;
1827 count = GFC_DESCRIPTOR_EXTENT(a,0);
1828 }
1829 else
1830 {
1831 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1833
1834 count = GFC_DESCRIPTOR_EXTENT(a,1);
1835 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1836 }
1837
1838 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1839 {
1840 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1842 }
1843
1844 if (GFC_DESCRIPTOR_RANK (b) == 1)
1845 {
1846 /* Treat it as a column matrix B[count,1] */
1847 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1848
1849 /* bystride should never be used for 1-dimensional b.
1850 in case it is we want it to cause a segfault, rather than
1851 an incorrect result. */
1852 bystride = 0xDEADBEEF;
1853 ycount = 1;
1854 }
1855 else
1856 {
1857 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1860 }
1861
1862 abase = a->base_addr;
1863 bbase = b->base_addr;
1864 dest = retarray->base_addr;
1865
1866 /* Now that everything is set up, we perform the multiplication
1867 itself. */
1868
1869#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870#define min(a,b) ((a) <= (b) ? (a) : (b))
1871#define max(a,b) ((a) >= (b) ? (a) : (b))
1872
1873 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874 && (bxstride == 1 || bystride == 1)
1875 && (((float) xcount) * ((float) ycount) * ((float) count)
1876 > POW3(blas_limit)))
1877 {
1878 const int m = xcount, n = ycount, k = count, ldc = rystride;
1879 const GFC_INTEGER_8 one = 1, zero = 0;
1880 const int lda = (axstride == 1) ? aystride : axstride,
1881 ldb = (bxstride == 1) ? bystride : bxstride;
1882
1883 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1884 {
1885 assert (gemm != NULL);
1886 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888 &ldc, 1, 1);
1889 return;
1890 }
1891 }
1892
1893 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1894 {
1895 /* This block of code implements a tuned matmul, derived from
1896 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1897
1898 Bo Kagstrom and Per Ling
1899 Department of Computing Science
1900 Umea University
1901 S-901 87 Umea, Sweden
1902
1903 from netlib.org, translated to C, and modified for matmul.m4. */
1904
1905 const GFC_INTEGER_8 *a, *b;
1906 GFC_INTEGER_8 *c;
1907 const index_type m = xcount, n = ycount, k = count;
1908
1909 /* System generated locals */
1910 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911 i1, i2, i3, i4, i5, i6;
1912
1913 /* Local variables */
1914 GFC_INTEGER_8 t1[65536], /* was [256][256] */
1915 f11, f12, f21, f22, f31, f32, f41, f42,
1916 f13, f14, f23, f24, f33, f34, f43, f44;
1917 index_type i, j, l, ii, jj, ll;
1918 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1919
1920 a = abase;
1921 b = bbase;
1922 c = retarray->base_addr;
1923
1924 /* Parameter adjustments */
1925 c_dim1 = rystride;
1926 c_offset = 1 + c_dim1;
1927 c -= c_offset;
1928 a_dim1 = aystride;
1929 a_offset = 1 + a_dim1;
1930 a -= a_offset;
1931 b_dim1 = bystride;
1932 b_offset = 1 + b_dim1;
1933 b -= b_offset;
1934
1935 /* Early exit if possible */
1936 if (m == 0 || n == 0 || k == 0)
1937 return;
1938
1939 /* Empty c first. */
1940 for (j=1; j<=n; j++)
1941 for (i=1; i<=m; i++)
1942 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
1943
1944 /* Start turning the crank. */
1945 i1 = n;
1946 for (jj = 1; jj <= i1; jj += 512)
1947 {
1948 /* Computing MIN */
1949 i2 = 512;
1950 i3 = n - jj + 1;
1951 jsec = min(i2,i3);
1952 ujsec = jsec - jsec % 4;
1953 i2 = k;
1954 for (ll = 1; ll <= i2; ll += 256)
1955 {
1956 /* Computing MIN */
1957 i3 = 256;
1958 i4 = k - ll + 1;
1959 lsec = min(i3,i4);
1960 ulsec = lsec - lsec % 2;
1961
1962 i3 = m;
1963 for (ii = 1; ii <= i3; ii += 256)
1964 {
1965 /* Computing MIN */
1966 i4 = 256;
1967 i5 = m - ii + 1;
1968 isec = min(i4,i5);
1969 uisec = isec - isec % 2;
1970 i4 = ll + ulsec - 1;
1971 for (l = ll; l <= i4; l += 2)
1972 {
1973 i5 = ii + uisec - 1;
1974 for (i = ii; i <= i5; i += 2)
1975 {
1976 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977 a[i + l * a_dim1];
1978 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979 a[i + (l + 1) * a_dim1];
1980 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981 a[i + 1 + l * a_dim1];
1982 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983 a[i + 1 + (l + 1) * a_dim1];
1984 }
1985 if (uisec < isec)
1986 {
1987 t1[l - ll + 1 + (isec << 8) - 257] =
1988 a[ii + isec - 1 + l * a_dim1];
1989 t1[l - ll + 2 + (isec << 8) - 257] =
1990 a[ii + isec - 1 + (l + 1) * a_dim1];
1991 }
1992 }
1993 if (ulsec < lsec)
1994 {
1995 i4 = ii + isec - 1;
1996 for (i = ii; i<= i4; ++i)
1997 {
1998 t1[lsec + ((i - ii + 1) << 8) - 257] =
1999 a[i + (ll + lsec - 1) * a_dim1];
2000 }
2001 }
2002
2003 uisec = isec - isec % 4;
2004 i4 = jj + ujsec - 1;
2005 for (j = jj; j <= i4; j += 4)
2006 {
2007 i5 = ii + uisec - 1;
2008 for (i = ii; i <= i5; i += 4)
2009 {
2010 f11 = c[i + j * c_dim1];
2011 f21 = c[i + 1 + j * c_dim1];
2012 f12 = c[i + (j + 1) * c_dim1];
2013 f22 = c[i + 1 + (j + 1) * c_dim1];
2014 f13 = c[i + (j + 2) * c_dim1];
2015 f23 = c[i + 1 + (j + 2) * c_dim1];
2016 f14 = c[i + (j + 3) * c_dim1];
2017 f24 = c[i + 1 + (j + 3) * c_dim1];
2018 f31 = c[i + 2 + j * c_dim1];
2019 f41 = c[i + 3 + j * c_dim1];
2020 f32 = c[i + 2 + (j + 1) * c_dim1];
2021 f42 = c[i + 3 + (j + 1) * c_dim1];
2022 f33 = c[i + 2 + (j + 2) * c_dim1];
2023 f43 = c[i + 3 + (j + 2) * c_dim1];
2024 f34 = c[i + 2 + (j + 3) * c_dim1];
2025 f44 = c[i + 3 + (j + 3) * c_dim1];
2026 i6 = ll + lsec - 1;
2027 for (l = ll; l <= i6; ++l)
2028 {
2029 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030 * b[l + j * b_dim1];
2031 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032 * b[l + j * b_dim1];
2033 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034 * b[l + (j + 1) * b_dim1];
2035 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036 * b[l + (j + 1) * b_dim1];
2037 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038 * b[l + (j + 2) * b_dim1];
2039 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040 * b[l + (j + 2) * b_dim1];
2041 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042 * b[l + (j + 3) * b_dim1];
2043 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044 * b[l + (j + 3) * b_dim1];
2045 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046 * b[l + j * b_dim1];
2047 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048 * b[l + j * b_dim1];
2049 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050 * b[l + (j + 1) * b_dim1];
2051 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052 * b[l + (j + 1) * b_dim1];
2053 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054 * b[l + (j + 2) * b_dim1];
2055 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056 * b[l + (j + 2) * b_dim1];
2057 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058 * b[l + (j + 3) * b_dim1];
2059 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060 * b[l + (j + 3) * b_dim1];
2061 }
2062 c[i + j * c_dim1] = f11;
2063 c[i + 1 + j * c_dim1] = f21;
2064 c[i + (j + 1) * c_dim1] = f12;
2065 c[i + 1 + (j + 1) * c_dim1] = f22;
2066 c[i + (j + 2) * c_dim1] = f13;
2067 c[i + 1 + (j + 2) * c_dim1] = f23;
2068 c[i + (j + 3) * c_dim1] = f14;
2069 c[i + 1 + (j + 3) * c_dim1] = f24;
2070 c[i + 2 + j * c_dim1] = f31;
2071 c[i + 3 + j * c_dim1] = f41;
2072 c[i + 2 + (j + 1) * c_dim1] = f32;
2073 c[i + 3 + (j + 1) * c_dim1] = f42;
2074 c[i + 2 + (j + 2) * c_dim1] = f33;
2075 c[i + 3 + (j + 2) * c_dim1] = f43;
2076 c[i + 2 + (j + 3) * c_dim1] = f34;
2077 c[i + 3 + (j + 3) * c_dim1] = f44;
2078 }
2079 if (uisec < isec)
2080 {
2081 i5 = ii + isec - 1;
2082 for (i = ii + uisec; i <= i5; ++i)
2083 {
2084 f11 = c[i + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f13 = c[i + (j + 2) * c_dim1];
2087 f14 = c[i + (j + 3) * c_dim1];
2088 i6 = ll + lsec - 1;
2089 for (l = ll; l <= i6; ++l)
2090 {
2091 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092 257] * b[l + j * b_dim1];
2093 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094 257] * b[l + (j + 1) * b_dim1];
2095 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096 257] * b[l + (j + 2) * b_dim1];
2097 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098 257] * b[l + (j + 3) * b_dim1];
2099 }
2100 c[i + j * c_dim1] = f11;
2101 c[i + (j + 1) * c_dim1] = f12;
2102 c[i + (j + 2) * c_dim1] = f13;
2103 c[i + (j + 3) * c_dim1] = f14;
2104 }
2105 }
2106 }
2107 if (ujsec < jsec)
2108 {
2109 i4 = jj + jsec - 1;
2110 for (j = jj + ujsec; j <= i4; ++j)
2111 {
2112 i5 = ii + uisec - 1;
2113 for (i = ii; i <= i5; i += 4)
2114 {
2115 f11 = c[i + j * c_dim1];
2116 f21 = c[i + 1 + j * c_dim1];
2117 f31 = c[i + 2 + j * c_dim1];
2118 f41 = c[i + 3 + j * c_dim1];
2119 i6 = ll + lsec - 1;
2120 for (l = ll; l <= i6; ++l)
2121 {
2122 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123 257] * b[l + j * b_dim1];
2124 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125 257] * b[l + j * b_dim1];
2126 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127 257] * b[l + j * b_dim1];
2128 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129 257] * b[l + j * b_dim1];
2130 }
2131 c[i + j * c_dim1] = f11;
2132 c[i + 1 + j * c_dim1] = f21;
2133 c[i + 2 + j * c_dim1] = f31;
2134 c[i + 3 + j * c_dim1] = f41;
2135 }
2136 i5 = ii + isec - 1;
2137 for (i = ii + uisec; i <= i5; ++i)
2138 {
2139 f11 = c[i + j * c_dim1];
2140 i6 = ll + lsec - 1;
2141 for (l = ll; l <= i6; ++l)
2142 {
2143 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144 257] * b[l + j * b_dim1];
2145 }
2146 c[i + j * c_dim1] = f11;
2147 }
2148 }
2149 }
2150 }
2151 }
2152 }
2153 return;
2154 }
2155 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2156 {
2157 if (GFC_DESCRIPTOR_RANK (a) != 1)
2158 {
2159 const GFC_INTEGER_8 *restrict abase_x;
2160 const GFC_INTEGER_8 *restrict bbase_y;
2161 GFC_INTEGER_8 *restrict dest_y;
2162 GFC_INTEGER_8 s;
2163
2164 for (y = 0; y < ycount; y++)
2165 {
2166 bbase_y = &bbase[y*bystride];
2167 dest_y = &dest[y*rystride];
2168 for (x = 0; x < xcount; x++)
2169 {
2170 abase_x = &abase[x*axstride];
2171 s = (GFC_INTEGER_8) 0;
2172 for (n = 0; n < count; n++)
2173 s += abase_x[n] * bbase_y[n];
2174 dest_y[x] = s;
2175 }
2176 }
2177 }
2178 else
2179 {
2180 const GFC_INTEGER_8 *restrict bbase_y;
2181 GFC_INTEGER_8 s;
2182
2183 for (y = 0; y < ycount; y++)
2184 {
2185 bbase_y = &bbase[y*bystride];
2186 s = (GFC_INTEGER_8) 0;
2187 for (n = 0; n < count; n++)
2188 s += abase[n*axstride] * bbase_y[n];
2189 dest[y*rystride] = s;
2190 }
2191 }
2192 }
2193 else if (axstride < aystride)
2194 {
2195 for (y = 0; y < ycount; y++)
2196 for (x = 0; x < xcount; x++)
2197 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
2198
2199 for (y = 0; y < ycount; y++)
2200 for (n = 0; n < count; n++)
2201 for (x = 0; x < xcount; x++)
2202 /* dest[x,y] += a[x,n] * b[n,y] */
2203 dest[x*rxstride + y*rystride] +=
2204 abase[x*axstride + n*aystride] *
2205 bbase[n*bxstride + y*bystride];
2206 }
2207 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2208 {
2209 const GFC_INTEGER_8 *restrict bbase_y;
2210 GFC_INTEGER_8 s;
2211
2212 for (y = 0; y < ycount; y++)
2213 {
2214 bbase_y = &bbase[y*bystride];
2215 s = (GFC_INTEGER_8) 0;
2216 for (n = 0; n < count; n++)
2217 s += abase[n*axstride] * bbase_y[n*bxstride];
2218 dest[y*rxstride] = s;
2219 }
2220 }
2221 else
2222 {
2223 const GFC_INTEGER_8 *restrict abase_x;
2224 const GFC_INTEGER_8 *restrict bbase_y;
2225 GFC_INTEGER_8 *restrict dest_y;
2226 GFC_INTEGER_8 s;
2227
2228 for (y = 0; y < ycount; y++)
2229 {
2230 bbase_y = &bbase[y*bystride];
2231 dest_y = &dest[y*rystride];
2232 for (x = 0; x < xcount; x++)
2233 {
2234 abase_x = &abase[x*axstride];
2235 s = (GFC_INTEGER_8) 0;
2236 for (n = 0; n < count; n++)
2237 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238 dest_y[x*rxstride] = s;
2239 }
2240 }
2241 }
2242}
2243#undef POW3
2244#undef min
2245#undef max
2246
2247
2248/* Compiling main function, with selection code for the processor. */
2249
2250/* Currently, this is i386 only. Adjust for other architectures. */
2251
2252#include <config/i386/cpuinfo.h>
2253void matmul_i8 (gfc_array_i8 * const restrict retarray,
2254 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
2255 int blas_limit, blas_call gemm)
2256{
2257 static void (*matmul_p) (gfc_array_i8 * const restrict retarray,
2258 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
f03e9217
TK
2259 int blas_limit, blas_call gemm);
2260
2261 void (*matmul_fn) (gfc_array_i8 * const restrict retarray,
2262 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
2263 int blas_limit, blas_call gemm);
31cfd832 2264
f03e9217
TK
2265 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2266 if (matmul_fn == NULL)
31cfd832 2267 {
f03e9217 2268 matmul_fn = matmul_i8_vanilla;
31cfd832
TK
2269 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2270 {
2271 /* Run down the available processors in order of preference. */
2272#ifdef HAVE_AVX512F
2273 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2274 {
f03e9217
TK
2275 matmul_fn = matmul_i8_avx512f;
2276 goto store;
31cfd832
TK
2277 }
2278
2279#endif /* HAVE_AVX512F */
2280
2281#ifdef HAVE_AVX2
6d03bdcc
TK
2282 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2283 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
31cfd832 2284 {
f03e9217
TK
2285 matmul_fn = matmul_i8_avx2;
2286 goto store;
31cfd832
TK
2287 }
2288
2289#endif
2290
2291#ifdef HAVE_AVX
2292 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2293 {
f03e9217
TK
2294 matmul_fn = matmul_i8_avx;
2295 goto store;
31cfd832
TK
2296 }
2297#endif /* HAVE_AVX */
2298 }
f03e9217
TK
2299 store:
2300 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
31cfd832
TK
2301 }
2302
f03e9217 2303 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
31cfd832
TK
2304}
2305
2306#else /* Just the vanilla function. */
2307
6de9cd9a 2308void
85206901 2309matmul_i8 (gfc_array_i8 * const restrict retarray,
5a0aad31
FXC
2310 gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
2311 int blas_limit, blas_call gemm)
6de9cd9a 2312{
85206901
JB
2313 const GFC_INTEGER_8 * restrict abase;
2314 const GFC_INTEGER_8 * restrict bbase;
2315 GFC_INTEGER_8 * restrict dest;
410d3bba
VL
2316
2317 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2318 index_type x, y, n, count, xcount, ycount;
6de9cd9a
DN
2319
2320 assert (GFC_DESCRIPTOR_RANK (a) == 2
2321 || GFC_DESCRIPTOR_RANK (b) == 2);
883c9d4d 2322
410d3bba
VL
2323/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2324
2325 Either A or B (but not both) can be rank 1:
2326
2327 o One-dimensional argument A is implicitly treated as a row matrix
2328 dimensioned [1,count], so xcount=1.
2329
2330 o One-dimensional argument B is implicitly treated as a column matrix
2331 dimensioned [count, 1], so ycount=1.
5d70ab07 2332*/
410d3bba 2333
21d1335b 2334 if (retarray->base_addr == NULL)
883c9d4d
VL
2335 {
2336 if (GFC_DESCRIPTOR_RANK (a) == 1)
2337 {
dfb55fdc
TK
2338 GFC_DIMENSION_SET(retarray->dim[0], 0,
2339 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
883c9d4d
VL
2340 }
2341 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2342 {
dfb55fdc
TK
2343 GFC_DIMENSION_SET(retarray->dim[0], 0,
2344 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
883c9d4d
VL
2345 }
2346 else
2347 {
dfb55fdc
TK
2348 GFC_DIMENSION_SET(retarray->dim[0], 0,
2349 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
420aa7b8 2350
dfb55fdc
TK
2351 GFC_DIMENSION_SET(retarray->dim[1], 0,
2352 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2353 GFC_DESCRIPTOR_EXTENT(retarray,0));
883c9d4d 2354 }
420aa7b8 2355
21d1335b 2356 retarray->base_addr
92e6f3a4 2357 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
efd4dc1a 2358 retarray->offset = 0;
883c9d4d 2359 }
5d70ab07
JD
2360 else if (unlikely (compile_options.bounds_check))
2361 {
2362 index_type ret_extent, arg_extent;
2363
2364 if (GFC_DESCRIPTOR_RANK (a) == 1)
2365 {
2366 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2367 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2368 if (arg_extent != ret_extent)
2369 runtime_error ("Incorrect extent in return array in"
2370 " MATMUL intrinsic: is %ld, should be %ld",
2371 (long int) ret_extent, (long int) arg_extent);
2372 }
2373 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2374 {
2375 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2376 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2377 if (arg_extent != ret_extent)
2378 runtime_error ("Incorrect extent in return array in"
2379 " MATMUL intrinsic: is %ld, should be %ld",
2380 (long int) ret_extent, (long int) arg_extent);
2381 }
2382 else
2383 {
2384 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2385 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2386 if (arg_extent != ret_extent)
2387 runtime_error ("Incorrect extent in return array in"
2388 " MATMUL intrinsic for dimension 1:"
2389 " is %ld, should be %ld",
2390 (long int) ret_extent, (long int) arg_extent);
2391
2392 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2393 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2394 if (arg_extent != ret_extent)
2395 runtime_error ("Incorrect extent in return array in"
2396 " MATMUL intrinsic for dimension 2:"
2397 " is %ld, should be %ld",
2398 (long int) ret_extent, (long int) arg_extent);
2399 }
2400 }
883c9d4d 2401
6de9cd9a
DN
2402
2403 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2404 {
410d3bba
VL
2405 /* One-dimensional result may be addressed in the code below
2406 either as a row or a column matrix. We want both cases to
2407 work. */
dfb55fdc 2408 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
6de9cd9a
DN
2409 }
2410 else
2411 {
dfb55fdc
TK
2412 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2413 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
6de9cd9a
DN
2414 }
2415
410d3bba 2416
6de9cd9a
DN
2417 if (GFC_DESCRIPTOR_RANK (a) == 1)
2418 {
410d3bba 2419 /* Treat it as a a row matrix A[1,count]. */
dfb55fdc 2420 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
410d3bba
VL
2421 aystride = 1;
2422
6de9cd9a 2423 xcount = 1;
dfb55fdc 2424 count = GFC_DESCRIPTOR_EXTENT(a,0);
6de9cd9a
DN
2425 }
2426 else
2427 {
dfb55fdc
TK
2428 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2429 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
410d3bba 2430
dfb55fdc
TK
2431 count = GFC_DESCRIPTOR_EXTENT(a,1);
2432 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
6de9cd9a 2433 }
410d3bba 2434
dfb55fdc 2435 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4 2436 {
dfb55fdc 2437 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4
TK
2438 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2439 }
410d3bba 2440
6de9cd9a
DN
2441 if (GFC_DESCRIPTOR_RANK (b) == 1)
2442 {
410d3bba 2443 /* Treat it as a column matrix B[count,1] */
dfb55fdc 2444 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
410d3bba
VL
2445
2446 /* bystride should never be used for 1-dimensional b.
2447 in case it is we want it to cause a segfault, rather than
2448 an incorrect result. */
420aa7b8 2449 bystride = 0xDEADBEEF;
6de9cd9a
DN
2450 ycount = 1;
2451 }
2452 else
2453 {
dfb55fdc
TK
2454 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2455 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2456 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
6de9cd9a
DN
2457 }
2458
21d1335b
TB
2459 abase = a->base_addr;
2460 bbase = b->base_addr;
2461 dest = retarray->base_addr;
410d3bba 2462
5d70ab07 2463 /* Now that everything is set up, we perform the multiplication
5a0aad31
FXC
2464 itself. */
2465
2466#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07
JD
2467#define min(a,b) ((a) <= (b) ? (a) : (b))
2468#define max(a,b) ((a) >= (b) ? (a) : (b))
5a0aad31
FXC
2469
2470 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2471 && (bxstride == 1 || bystride == 1)
2472 && (((float) xcount) * ((float) ycount) * ((float) count)
2473 > POW3(blas_limit)))
6de9cd9a 2474 {
5d70ab07
JD
2475 const int m = xcount, n = ycount, k = count, ldc = rystride;
2476 const GFC_INTEGER_8 one = 1, zero = 0;
2477 const int lda = (axstride == 1) ? aystride : axstride,
2478 ldb = (bxstride == 1) ? bystride : bxstride;
410d3bba 2479
5d70ab07 2480 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
ae740cce 2481 {
5d70ab07
JD
2482 assert (gemm != NULL);
2483 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2484 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2485 &ldc, 1, 1);
2486 return;
ae740cce 2487 }
5d70ab07 2488 }
410d3bba 2489
5d70ab07
JD
2490 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2491 {
2492 /* This block of code implements a tuned matmul, derived from
2493 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2494
2495 Bo Kagstrom and Per Ling
2496 Department of Computing Science
2497 Umea University
2498 S-901 87 Umea, Sweden
2499
2500 from netlib.org, translated to C, and modified for matmul.m4. */
2501
2502 const GFC_INTEGER_8 *a, *b;
2503 GFC_INTEGER_8 *c;
2504 const index_type m = xcount, n = ycount, k = count;
2505
2506 /* System generated locals */
2507 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2508 i1, i2, i3, i4, i5, i6;
2509
2510 /* Local variables */
2511 GFC_INTEGER_8 t1[65536], /* was [256][256] */
2512 f11, f12, f21, f22, f31, f32, f41, f42,
2513 f13, f14, f23, f24, f33, f34, f43, f44;
2514 index_type i, j, l, ii, jj, ll;
2515 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2516
2517 a = abase;
2518 b = bbase;
2519 c = retarray->base_addr;
2520
2521 /* Parameter adjustments */
2522 c_dim1 = rystride;
2523 c_offset = 1 + c_dim1;
2524 c -= c_offset;
2525 a_dim1 = aystride;
2526 a_offset = 1 + a_dim1;
2527 a -= a_offset;
2528 b_dim1 = bystride;
2529 b_offset = 1 + b_dim1;
2530 b -= b_offset;
2531
2532 /* Early exit if possible */
2533 if (m == 0 || n == 0 || k == 0)
2534 return;
2535
2536 /* Empty c first. */
2537 for (j=1; j<=n; j++)
2538 for (i=1; i<=m; i++)
2539 c[i + j * c_dim1] = (GFC_INTEGER_8)0;
2540
2541 /* Start turning the crank. */
2542 i1 = n;
2543 for (jj = 1; jj <= i1; jj += 512)
410d3bba 2544 {
5d70ab07
JD
2545 /* Computing MIN */
2546 i2 = 512;
2547 i3 = n - jj + 1;
2548 jsec = min(i2,i3);
2549 ujsec = jsec - jsec % 4;
2550 i2 = k;
2551 for (ll = 1; ll <= i2; ll += 256)
410d3bba 2552 {
5d70ab07
JD
2553 /* Computing MIN */
2554 i3 = 256;
2555 i4 = k - ll + 1;
2556 lsec = min(i3,i4);
2557 ulsec = lsec - lsec % 2;
2558
2559 i3 = m;
2560 for (ii = 1; ii <= i3; ii += 256)
410d3bba 2561 {
5d70ab07
JD
2562 /* Computing MIN */
2563 i4 = 256;
2564 i5 = m - ii + 1;
2565 isec = min(i4,i5);
2566 uisec = isec - isec % 2;
2567 i4 = ll + ulsec - 1;
2568 for (l = ll; l <= i4; l += 2)
2569 {
2570 i5 = ii + uisec - 1;
2571 for (i = ii; i <= i5; i += 2)
2572 {
2573 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2574 a[i + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2576 a[i + (l + 1) * a_dim1];
2577 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2578 a[i + 1 + l * a_dim1];
2579 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2580 a[i + 1 + (l + 1) * a_dim1];
2581 }
2582 if (uisec < isec)
2583 {
2584 t1[l - ll + 1 + (isec << 8) - 257] =
2585 a[ii + isec - 1 + l * a_dim1];
2586 t1[l - ll + 2 + (isec << 8) - 257] =
2587 a[ii + isec - 1 + (l + 1) * a_dim1];
2588 }
2589 }
2590 if (ulsec < lsec)
2591 {
2592 i4 = ii + isec - 1;
2593 for (i = ii; i<= i4; ++i)
2594 {
2595 t1[lsec + ((i - ii + 1) << 8) - 257] =
2596 a[i + (ll + lsec - 1) * a_dim1];
2597 }
2598 }
2599
2600 uisec = isec - isec % 4;
2601 i4 = jj + ujsec - 1;
2602 for (j = jj; j <= i4; j += 4)
2603 {
2604 i5 = ii + uisec - 1;
2605 for (i = ii; i <= i5; i += 4)
2606 {
2607 f11 = c[i + j * c_dim1];
2608 f21 = c[i + 1 + j * c_dim1];
2609 f12 = c[i + (j + 1) * c_dim1];
2610 f22 = c[i + 1 + (j + 1) * c_dim1];
2611 f13 = c[i + (j + 2) * c_dim1];
2612 f23 = c[i + 1 + (j + 2) * c_dim1];
2613 f14 = c[i + (j + 3) * c_dim1];
2614 f24 = c[i + 1 + (j + 3) * c_dim1];
2615 f31 = c[i + 2 + j * c_dim1];
2616 f41 = c[i + 3 + j * c_dim1];
2617 f32 = c[i + 2 + (j + 1) * c_dim1];
2618 f42 = c[i + 3 + (j + 1) * c_dim1];
2619 f33 = c[i + 2 + (j + 2) * c_dim1];
2620 f43 = c[i + 3 + (j + 2) * c_dim1];
2621 f34 = c[i + 2 + (j + 3) * c_dim1];
2622 f44 = c[i + 3 + (j + 3) * c_dim1];
2623 i6 = ll + lsec - 1;
2624 for (l = ll; l <= i6; ++l)
2625 {
2626 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + j * b_dim1];
2628 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + j * b_dim1];
2630 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 1) * b_dim1];
2632 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 1) * b_dim1];
2634 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 2) * b_dim1];
2636 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 2) * b_dim1];
2638 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2639 * b[l + (j + 3) * b_dim1];
2640 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2641 * b[l + (j + 3) * b_dim1];
2642 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + j * b_dim1];
2644 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + j * b_dim1];
2646 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 1) * b_dim1];
2648 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 1) * b_dim1];
2650 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 2) * b_dim1];
2652 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 2) * b_dim1];
2654 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2655 * b[l + (j + 3) * b_dim1];
2656 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2657 * b[l + (j + 3) * b_dim1];
2658 }
2659 c[i + j * c_dim1] = f11;
2660 c[i + 1 + j * c_dim1] = f21;
2661 c[i + (j + 1) * c_dim1] = f12;
2662 c[i + 1 + (j + 1) * c_dim1] = f22;
2663 c[i + (j + 2) * c_dim1] = f13;
2664 c[i + 1 + (j + 2) * c_dim1] = f23;
2665 c[i + (j + 3) * c_dim1] = f14;
2666 c[i + 1 + (j + 3) * c_dim1] = f24;
2667 c[i + 2 + j * c_dim1] = f31;
2668 c[i + 3 + j * c_dim1] = f41;
2669 c[i + 2 + (j + 1) * c_dim1] = f32;
2670 c[i + 3 + (j + 1) * c_dim1] = f42;
2671 c[i + 2 + (j + 2) * c_dim1] = f33;
2672 c[i + 3 + (j + 2) * c_dim1] = f43;
2673 c[i + 2 + (j + 3) * c_dim1] = f34;
2674 c[i + 3 + (j + 3) * c_dim1] = f44;
2675 }
2676 if (uisec < isec)
2677 {
2678 i5 = ii + isec - 1;
2679 for (i = ii + uisec; i <= i5; ++i)
2680 {
2681 f11 = c[i + j * c_dim1];
2682 f12 = c[i + (j + 1) * c_dim1];
2683 f13 = c[i + (j + 2) * c_dim1];
2684 f14 = c[i + (j + 3) * c_dim1];
2685 i6 = ll + lsec - 1;
2686 for (l = ll; l <= i6; ++l)
2687 {
2688 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + j * b_dim1];
2690 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 1) * b_dim1];
2692 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2693 257] * b[l + (j + 2) * b_dim1];
2694 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2695 257] * b[l + (j + 3) * b_dim1];
2696 }
2697 c[i + j * c_dim1] = f11;
2698 c[i + (j + 1) * c_dim1] = f12;
2699 c[i + (j + 2) * c_dim1] = f13;
2700 c[i + (j + 3) * c_dim1] = f14;
2701 }
2702 }
2703 }
2704 if (ujsec < jsec)
2705 {
2706 i4 = jj + jsec - 1;
2707 for (j = jj + ujsec; j <= i4; ++j)
2708 {
2709 i5 = ii + uisec - 1;
2710 for (i = ii; i <= i5; i += 4)
2711 {
2712 f11 = c[i + j * c_dim1];
2713 f21 = c[i + 1 + j * c_dim1];
2714 f31 = c[i + 2 + j * c_dim1];
2715 f41 = c[i + 3 + j * c_dim1];
2716 i6 = ll + lsec - 1;
2717 for (l = ll; l <= i6; ++l)
2718 {
2719 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2722 257] * b[l + j * b_dim1];
2723 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2724 257] * b[l + j * b_dim1];
2725 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2726 257] * b[l + j * b_dim1];
2727 }
2728 c[i + j * c_dim1] = f11;
2729 c[i + 1 + j * c_dim1] = f21;
2730 c[i + 2 + j * c_dim1] = f31;
2731 c[i + 3 + j * c_dim1] = f41;
2732 }
2733 i5 = ii + isec - 1;
2734 for (i = ii + uisec; i <= i5; ++i)
2735 {
2736 f11 = c[i + j * c_dim1];
2737 i6 = ll + lsec - 1;
2738 for (l = ll; l <= i6; ++l)
2739 {
2740 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741 257] * b[l + j * b_dim1];
2742 }
2743 c[i + j * c_dim1] = f11;
2744 }
2745 }
2746 }
410d3bba
VL
2747 }
2748 }
2749 }
5d70ab07 2750 return;
410d3bba 2751 }
1524f80b
RS
2752 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2753 {
a4a11197
PT
2754 if (GFC_DESCRIPTOR_RANK (a) != 1)
2755 {
2756 const GFC_INTEGER_8 *restrict abase_x;
2757 const GFC_INTEGER_8 *restrict bbase_y;
2758 GFC_INTEGER_8 *restrict dest_y;
2759 GFC_INTEGER_8 s;
1524f80b 2760
a4a11197
PT
2761 for (y = 0; y < ycount; y++)
2762 {
2763 bbase_y = &bbase[y*bystride];
2764 dest_y = &dest[y*rystride];
2765 for (x = 0; x < xcount; x++)
2766 {
2767 abase_x = &abase[x*axstride];
2768 s = (GFC_INTEGER_8) 0;
2769 for (n = 0; n < count; n++)
2770 s += abase_x[n] * bbase_y[n];
2771 dest_y[x] = s;
2772 }
2773 }
2774 }
2775 else
1524f80b 2776 {
a4a11197
PT
2777 const GFC_INTEGER_8 *restrict bbase_y;
2778 GFC_INTEGER_8 s;
2779
2780 for (y = 0; y < ycount; y++)
1524f80b 2781 {
a4a11197 2782 bbase_y = &bbase[y*bystride];
1524f80b
RS
2783 s = (GFC_INTEGER_8) 0;
2784 for (n = 0; n < count; n++)
a4a11197
PT
2785 s += abase[n*axstride] * bbase_y[n];
2786 dest[y*rystride] = s;
1524f80b
RS
2787 }
2788 }
2789 }
2790 else if (axstride < aystride)
410d3bba
VL
2791 {
2792 for (y = 0; y < ycount; y++)
2793 for (x = 0; x < xcount; x++)
2794 dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
2795
2796 for (y = 0; y < ycount; y++)
2797 for (n = 0; n < count; n++)
2798 for (x = 0; x < xcount; x++)
2799 /* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07
JD
2800 dest[x*rxstride + y*rystride] +=
2801 abase[x*axstride + n*aystride] *
2802 bbase[n*bxstride + y*bystride];
6de9cd9a 2803 }
f0e871d6
PT
2804 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2805 {
2806 const GFC_INTEGER_8 *restrict bbase_y;
2807 GFC_INTEGER_8 s;
2808
2809 for (y = 0; y < ycount; y++)
2810 {
2811 bbase_y = &bbase[y*bystride];
2812 s = (GFC_INTEGER_8) 0;
2813 for (n = 0; n < count; n++)
2814 s += abase[n*axstride] * bbase_y[n*bxstride];
2815 dest[y*rxstride] = s;
2816 }
2817 }
1524f80b
RS
2818 else
2819 {
2820 const GFC_INTEGER_8 *restrict abase_x;
2821 const GFC_INTEGER_8 *restrict bbase_y;
2822 GFC_INTEGER_8 *restrict dest_y;
2823 GFC_INTEGER_8 s;
2824
2825 for (y = 0; y < ycount; y++)
2826 {
2827 bbase_y = &bbase[y*bystride];
2828 dest_y = &dest[y*rystride];
2829 for (x = 0; x < xcount; x++)
2830 {
2831 abase_x = &abase[x*axstride];
2832 s = (GFC_INTEGER_8) 0;
2833 for (n = 0; n < count; n++)
2834 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2835 dest_y[x*rxstride] = s;
2836 }
2837 }
2838 }
6de9cd9a 2839}
31cfd832
TK
2840#undef POW3
2841#undef min
2842#undef max
2843
644cb69f 2844#endif
31cfd832
TK
2845#endif
2846