]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgfortran/generated/matmul_i2.c
gthr-win32.h: Define NOGDI before windows.h include to prevent w32api CC_NONE macro...
[thirdparty/gcc.git] / libgfortran / generated / matmul_i2.c
CommitLineData
567c915b 1/* Implementation of the MATMUL intrinsic
cbe34bb5 2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
567c915b
TK
3 Contributed by Paul Brook <paul@nowt.org>
4
21d1335b 5This file is part of the GNU Fortran runtime library (libgfortran).
567c915b
TK
6
7Libgfortran is free software; you can redistribute it and/or
8modify it under the terms of the GNU General Public
9License as published by the Free Software Foundation; either
748086b7 10version 3 of the License, or (at your option) any later version.
567c915b
TK
11
12Libgfortran is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
748086b7
JJ
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24<http://www.gnu.org/licenses/>. */
567c915b 25
36ae8a61 26#include "libgfortran.h"
567c915b
TK
27#include <string.h>
28#include <assert.h>
36ae8a61 29
567c915b
TK
30
31#if defined (HAVE_GFC_INTEGER_2)
32
33/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
5d70ab07 34 passed to us by the front-end, in which case we call it for large
567c915b
TK
35 matrices. */
36
37typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
39 const int *, const GFC_INTEGER_2 *, const int *,
40 const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
41 int, int);
42
43/* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66*/
67
68/* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
73 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75export_proto(matmul_i2);
76
31cfd832
TK
77
78
79
80/* Put exhaustive list of possible architectures here here, ORed together. */
81
82#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83
84#ifdef HAVE_AVX
85static void
86matmul_i2_avx (gfc_array_i2 * const restrict retarray,
87 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
88 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
89static void
90matmul_i2_avx (gfc_array_i2 * const restrict retarray,
91 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
92 int blas_limit, blas_call gemm)
93{
94 const GFC_INTEGER_2 * restrict abase;
95 const GFC_INTEGER_2 * restrict bbase;
96 GFC_INTEGER_2 * restrict dest;
97
98 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
99 index_type x, y, n, count, xcount, ycount;
100
101 assert (GFC_DESCRIPTOR_RANK (a) == 2
102 || GFC_DESCRIPTOR_RANK (b) == 2);
103
104/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
105
106 Either A or B (but not both) can be rank 1:
107
108 o One-dimensional argument A is implicitly treated as a row matrix
109 dimensioned [1,count], so xcount=1.
110
111 o One-dimensional argument B is implicitly treated as a column matrix
112 dimensioned [count, 1], so ycount=1.
113*/
114
115 if (retarray->base_addr == NULL)
116 {
117 if (GFC_DESCRIPTOR_RANK (a) == 1)
118 {
119 GFC_DIMENSION_SET(retarray->dim[0], 0,
120 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
121 }
122 else if (GFC_DESCRIPTOR_RANK (b) == 1)
123 {
124 GFC_DIMENSION_SET(retarray->dim[0], 0,
125 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
126 }
127 else
128 {
129 GFC_DIMENSION_SET(retarray->dim[0], 0,
130 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
131
132 GFC_DIMENSION_SET(retarray->dim[1], 0,
133 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
134 GFC_DESCRIPTOR_EXTENT(retarray,0));
135 }
136
137 retarray->base_addr
138 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
139 retarray->offset = 0;
140 }
141 else if (unlikely (compile_options.bounds_check))
142 {
143 index_type ret_extent, arg_extent;
144
145 if (GFC_DESCRIPTOR_RANK (a) == 1)
146 {
147 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
148 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
149 if (arg_extent != ret_extent)
150 runtime_error ("Incorrect extent in return array in"
151 " MATMUL intrinsic: is %ld, should be %ld",
152 (long int) ret_extent, (long int) arg_extent);
153 }
154 else if (GFC_DESCRIPTOR_RANK (b) == 1)
155 {
156 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
157 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
158 if (arg_extent != ret_extent)
159 runtime_error ("Incorrect extent in return array in"
160 " MATMUL intrinsic: is %ld, should be %ld",
161 (long int) ret_extent, (long int) arg_extent);
162 }
163 else
164 {
165 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
166 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
167 if (arg_extent != ret_extent)
168 runtime_error ("Incorrect extent in return array in"
169 " MATMUL intrinsic for dimension 1:"
170 " is %ld, should be %ld",
171 (long int) ret_extent, (long int) arg_extent);
172
173 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
174 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
175 if (arg_extent != ret_extent)
176 runtime_error ("Incorrect extent in return array in"
177 " MATMUL intrinsic for dimension 2:"
178 " is %ld, should be %ld",
179 (long int) ret_extent, (long int) arg_extent);
180 }
181 }
182
183
184 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
185 {
186 /* One-dimensional result may be addressed in the code below
187 either as a row or a column matrix. We want both cases to
188 work. */
189 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
190 }
191 else
192 {
193 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
194 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 }
196
197
198 if (GFC_DESCRIPTOR_RANK (a) == 1)
199 {
200 /* Treat it as a a row matrix A[1,count]. */
201 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
202 aystride = 1;
203
204 xcount = 1;
205 count = GFC_DESCRIPTOR_EXTENT(a,0);
206 }
207 else
208 {
209 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
210 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
211
212 count = GFC_DESCRIPTOR_EXTENT(a,1);
213 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
214 }
215
216 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
217 {
218 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
219 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
220 }
221
222 if (GFC_DESCRIPTOR_RANK (b) == 1)
223 {
224 /* Treat it as a column matrix B[count,1] */
225 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
226
227 /* bystride should never be used for 1-dimensional b.
228 in case it is we want it to cause a segfault, rather than
229 an incorrect result. */
230 bystride = 0xDEADBEEF;
231 ycount = 1;
232 }
233 else
234 {
235 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
236 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
237 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
238 }
239
240 abase = a->base_addr;
241 bbase = b->base_addr;
242 dest = retarray->base_addr;
243
244 /* Now that everything is set up, we perform the multiplication
245 itself. */
246
247#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
248#define min(a,b) ((a) <= (b) ? (a) : (b))
249#define max(a,b) ((a) >= (b) ? (a) : (b))
250
251 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
252 && (bxstride == 1 || bystride == 1)
253 && (((float) xcount) * ((float) ycount) * ((float) count)
254 > POW3(blas_limit)))
255 {
256 const int m = xcount, n = ycount, k = count, ldc = rystride;
257 const GFC_INTEGER_2 one = 1, zero = 0;
258 const int lda = (axstride == 1) ? aystride : axstride,
259 ldb = (bxstride == 1) ? bystride : bxstride;
260
261 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
262 {
263 assert (gemm != NULL);
264 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
265 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
266 &ldc, 1, 1);
267 return;
268 }
269 }
270
271 if (rxstride == 1 && axstride == 1 && bxstride == 1)
272 {
273 /* This block of code implements a tuned matmul, derived from
274 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
275
276 Bo Kagstrom and Per Ling
277 Department of Computing Science
278 Umea University
279 S-901 87 Umea, Sweden
280
281 from netlib.org, translated to C, and modified for matmul.m4. */
282
283 const GFC_INTEGER_2 *a, *b;
284 GFC_INTEGER_2 *c;
285 const index_type m = xcount, n = ycount, k = count;
286
287 /* System generated locals */
288 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
289 i1, i2, i3, i4, i5, i6;
290
291 /* Local variables */
292 GFC_INTEGER_2 t1[65536], /* was [256][256] */
293 f11, f12, f21, f22, f31, f32, f41, f42,
294 f13, f14, f23, f24, f33, f34, f43, f44;
295 index_type i, j, l, ii, jj, ll;
296 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
297
298 a = abase;
299 b = bbase;
300 c = retarray->base_addr;
301
302 /* Parameter adjustments */
303 c_dim1 = rystride;
304 c_offset = 1 + c_dim1;
305 c -= c_offset;
306 a_dim1 = aystride;
307 a_offset = 1 + a_dim1;
308 a -= a_offset;
309 b_dim1 = bystride;
310 b_offset = 1 + b_dim1;
311 b -= b_offset;
312
313 /* Early exit if possible */
314 if (m == 0 || n == 0 || k == 0)
315 return;
316
317 /* Empty c first. */
318 for (j=1; j<=n; j++)
319 for (i=1; i<=m; i++)
320 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
321
322 /* Start turning the crank. */
323 i1 = n;
324 for (jj = 1; jj <= i1; jj += 512)
325 {
326 /* Computing MIN */
327 i2 = 512;
328 i3 = n - jj + 1;
329 jsec = min(i2,i3);
330 ujsec = jsec - jsec % 4;
331 i2 = k;
332 for (ll = 1; ll <= i2; ll += 256)
333 {
334 /* Computing MIN */
335 i3 = 256;
336 i4 = k - ll + 1;
337 lsec = min(i3,i4);
338 ulsec = lsec - lsec % 2;
339
340 i3 = m;
341 for (ii = 1; ii <= i3; ii += 256)
342 {
343 /* Computing MIN */
344 i4 = 256;
345 i5 = m - ii + 1;
346 isec = min(i4,i5);
347 uisec = isec - isec % 2;
348 i4 = ll + ulsec - 1;
349 for (l = ll; l <= i4; l += 2)
350 {
351 i5 = ii + uisec - 1;
352 for (i = ii; i <= i5; i += 2)
353 {
354 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
355 a[i + l * a_dim1];
356 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
357 a[i + (l + 1) * a_dim1];
358 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
359 a[i + 1 + l * a_dim1];
360 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
361 a[i + 1 + (l + 1) * a_dim1];
362 }
363 if (uisec < isec)
364 {
365 t1[l - ll + 1 + (isec << 8) - 257] =
366 a[ii + isec - 1 + l * a_dim1];
367 t1[l - ll + 2 + (isec << 8) - 257] =
368 a[ii + isec - 1 + (l + 1) * a_dim1];
369 }
370 }
371 if (ulsec < lsec)
372 {
373 i4 = ii + isec - 1;
374 for (i = ii; i<= i4; ++i)
375 {
376 t1[lsec + ((i - ii + 1) << 8) - 257] =
377 a[i + (ll + lsec - 1) * a_dim1];
378 }
379 }
380
381 uisec = isec - isec % 4;
382 i4 = jj + ujsec - 1;
383 for (j = jj; j <= i4; j += 4)
384 {
385 i5 = ii + uisec - 1;
386 for (i = ii; i <= i5; i += 4)
387 {
388 f11 = c[i + j * c_dim1];
389 f21 = c[i + 1 + j * c_dim1];
390 f12 = c[i + (j + 1) * c_dim1];
391 f22 = c[i + 1 + (j + 1) * c_dim1];
392 f13 = c[i + (j + 2) * c_dim1];
393 f23 = c[i + 1 + (j + 2) * c_dim1];
394 f14 = c[i + (j + 3) * c_dim1];
395 f24 = c[i + 1 + (j + 3) * c_dim1];
396 f31 = c[i + 2 + j * c_dim1];
397 f41 = c[i + 3 + j * c_dim1];
398 f32 = c[i + 2 + (j + 1) * c_dim1];
399 f42 = c[i + 3 + (j + 1) * c_dim1];
400 f33 = c[i + 2 + (j + 2) * c_dim1];
401 f43 = c[i + 3 + (j + 2) * c_dim1];
402 f34 = c[i + 2 + (j + 3) * c_dim1];
403 f44 = c[i + 3 + (j + 3) * c_dim1];
404 i6 = ll + lsec - 1;
405 for (l = ll; l <= i6; ++l)
406 {
407 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
408 * b[l + j * b_dim1];
409 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
410 * b[l + j * b_dim1];
411 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
412 * b[l + (j + 1) * b_dim1];
413 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
414 * b[l + (j + 1) * b_dim1];
415 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
416 * b[l + (j + 2) * b_dim1];
417 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
418 * b[l + (j + 2) * b_dim1];
419 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
420 * b[l + (j + 3) * b_dim1];
421 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
422 * b[l + (j + 3) * b_dim1];
423 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
424 * b[l + j * b_dim1];
425 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
426 * b[l + j * b_dim1];
427 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
428 * b[l + (j + 1) * b_dim1];
429 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
430 * b[l + (j + 1) * b_dim1];
431 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
432 * b[l + (j + 2) * b_dim1];
433 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
434 * b[l + (j + 2) * b_dim1];
435 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
436 * b[l + (j + 3) * b_dim1];
437 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
438 * b[l + (j + 3) * b_dim1];
439 }
440 c[i + j * c_dim1] = f11;
441 c[i + 1 + j * c_dim1] = f21;
442 c[i + (j + 1) * c_dim1] = f12;
443 c[i + 1 + (j + 1) * c_dim1] = f22;
444 c[i + (j + 2) * c_dim1] = f13;
445 c[i + 1 + (j + 2) * c_dim1] = f23;
446 c[i + (j + 3) * c_dim1] = f14;
447 c[i + 1 + (j + 3) * c_dim1] = f24;
448 c[i + 2 + j * c_dim1] = f31;
449 c[i + 3 + j * c_dim1] = f41;
450 c[i + 2 + (j + 1) * c_dim1] = f32;
451 c[i + 3 + (j + 1) * c_dim1] = f42;
452 c[i + 2 + (j + 2) * c_dim1] = f33;
453 c[i + 3 + (j + 2) * c_dim1] = f43;
454 c[i + 2 + (j + 3) * c_dim1] = f34;
455 c[i + 3 + (j + 3) * c_dim1] = f44;
456 }
457 if (uisec < isec)
458 {
459 i5 = ii + isec - 1;
460 for (i = ii + uisec; i <= i5; ++i)
461 {
462 f11 = c[i + j * c_dim1];
463 f12 = c[i + (j + 1) * c_dim1];
464 f13 = c[i + (j + 2) * c_dim1];
465 f14 = c[i + (j + 3) * c_dim1];
466 i6 = ll + lsec - 1;
467 for (l = ll; l <= i6; ++l)
468 {
469 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
470 257] * b[l + j * b_dim1];
471 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
472 257] * b[l + (j + 1) * b_dim1];
473 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
474 257] * b[l + (j + 2) * b_dim1];
475 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
476 257] * b[l + (j + 3) * b_dim1];
477 }
478 c[i + j * c_dim1] = f11;
479 c[i + (j + 1) * c_dim1] = f12;
480 c[i + (j + 2) * c_dim1] = f13;
481 c[i + (j + 3) * c_dim1] = f14;
482 }
483 }
484 }
485 if (ujsec < jsec)
486 {
487 i4 = jj + jsec - 1;
488 for (j = jj + ujsec; j <= i4; ++j)
489 {
490 i5 = ii + uisec - 1;
491 for (i = ii; i <= i5; i += 4)
492 {
493 f11 = c[i + j * c_dim1];
494 f21 = c[i + 1 + j * c_dim1];
495 f31 = c[i + 2 + j * c_dim1];
496 f41 = c[i + 3 + j * c_dim1];
497 i6 = ll + lsec - 1;
498 for (l = ll; l <= i6; ++l)
499 {
500 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
501 257] * b[l + j * b_dim1];
502 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
503 257] * b[l + j * b_dim1];
504 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
505 257] * b[l + j * b_dim1];
506 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
507 257] * b[l + j * b_dim1];
508 }
509 c[i + j * c_dim1] = f11;
510 c[i + 1 + j * c_dim1] = f21;
511 c[i + 2 + j * c_dim1] = f31;
512 c[i + 3 + j * c_dim1] = f41;
513 }
514 i5 = ii + isec - 1;
515 for (i = ii + uisec; i <= i5; ++i)
516 {
517 f11 = c[i + j * c_dim1];
518 i6 = ll + lsec - 1;
519 for (l = ll; l <= i6; ++l)
520 {
521 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
522 257] * b[l + j * b_dim1];
523 }
524 c[i + j * c_dim1] = f11;
525 }
526 }
527 }
528 }
529 }
530 }
531 return;
532 }
533 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
534 {
535 if (GFC_DESCRIPTOR_RANK (a) != 1)
536 {
537 const GFC_INTEGER_2 *restrict abase_x;
538 const GFC_INTEGER_2 *restrict bbase_y;
539 GFC_INTEGER_2 *restrict dest_y;
540 GFC_INTEGER_2 s;
541
542 for (y = 0; y < ycount; y++)
543 {
544 bbase_y = &bbase[y*bystride];
545 dest_y = &dest[y*rystride];
546 for (x = 0; x < xcount; x++)
547 {
548 abase_x = &abase[x*axstride];
549 s = (GFC_INTEGER_2) 0;
550 for (n = 0; n < count; n++)
551 s += abase_x[n] * bbase_y[n];
552 dest_y[x] = s;
553 }
554 }
555 }
556 else
557 {
558 const GFC_INTEGER_2 *restrict bbase_y;
559 GFC_INTEGER_2 s;
560
561 for (y = 0; y < ycount; y++)
562 {
563 bbase_y = &bbase[y*bystride];
564 s = (GFC_INTEGER_2) 0;
565 for (n = 0; n < count; n++)
566 s += abase[n*axstride] * bbase_y[n];
567 dest[y*rystride] = s;
568 }
569 }
570 }
571 else if (axstride < aystride)
572 {
573 for (y = 0; y < ycount; y++)
574 for (x = 0; x < xcount; x++)
575 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
576
577 for (y = 0; y < ycount; y++)
578 for (n = 0; n < count; n++)
579 for (x = 0; x < xcount; x++)
580 /* dest[x,y] += a[x,n] * b[n,y] */
581 dest[x*rxstride + y*rystride] +=
582 abase[x*axstride + n*aystride] *
583 bbase[n*bxstride + y*bystride];
584 }
585 else if (GFC_DESCRIPTOR_RANK (a) == 1)
586 {
587 const GFC_INTEGER_2 *restrict bbase_y;
588 GFC_INTEGER_2 s;
589
590 for (y = 0; y < ycount; y++)
591 {
592 bbase_y = &bbase[y*bystride];
593 s = (GFC_INTEGER_2) 0;
594 for (n = 0; n < count; n++)
595 s += abase[n*axstride] * bbase_y[n*bxstride];
596 dest[y*rxstride] = s;
597 }
598 }
599 else
600 {
601 const GFC_INTEGER_2 *restrict abase_x;
602 const GFC_INTEGER_2 *restrict bbase_y;
603 GFC_INTEGER_2 *restrict dest_y;
604 GFC_INTEGER_2 s;
605
606 for (y = 0; y < ycount; y++)
607 {
608 bbase_y = &bbase[y*bystride];
609 dest_y = &dest[y*rystride];
610 for (x = 0; x < xcount; x++)
611 {
612 abase_x = &abase[x*axstride];
613 s = (GFC_INTEGER_2) 0;
614 for (n = 0; n < count; n++)
615 s += abase_x[n*aystride] * bbase_y[n*bxstride];
616 dest_y[x*rxstride] = s;
617 }
618 }
619 }
620}
621#undef POW3
622#undef min
623#undef max
624
625#endif /* HAVE_AVX */
626
627#ifdef HAVE_AVX2
628static void
629matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
630 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
631 int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
632static void
633matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
634 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
635 int blas_limit, blas_call gemm)
636{
637 const GFC_INTEGER_2 * restrict abase;
638 const GFC_INTEGER_2 * restrict bbase;
639 GFC_INTEGER_2 * restrict dest;
640
641 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
642 index_type x, y, n, count, xcount, ycount;
643
644 assert (GFC_DESCRIPTOR_RANK (a) == 2
645 || GFC_DESCRIPTOR_RANK (b) == 2);
646
647/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
648
649 Either A or B (but not both) can be rank 1:
650
651 o One-dimensional argument A is implicitly treated as a row matrix
652 dimensioned [1,count], so xcount=1.
653
654 o One-dimensional argument B is implicitly treated as a column matrix
655 dimensioned [count, 1], so ycount=1.
656*/
657
658 if (retarray->base_addr == NULL)
659 {
660 if (GFC_DESCRIPTOR_RANK (a) == 1)
661 {
662 GFC_DIMENSION_SET(retarray->dim[0], 0,
663 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
664 }
665 else if (GFC_DESCRIPTOR_RANK (b) == 1)
666 {
667 GFC_DIMENSION_SET(retarray->dim[0], 0,
668 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
669 }
670 else
671 {
672 GFC_DIMENSION_SET(retarray->dim[0], 0,
673 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
674
675 GFC_DIMENSION_SET(retarray->dim[1], 0,
676 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
677 GFC_DESCRIPTOR_EXTENT(retarray,0));
678 }
679
680 retarray->base_addr
681 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
682 retarray->offset = 0;
683 }
684 else if (unlikely (compile_options.bounds_check))
685 {
686 index_type ret_extent, arg_extent;
687
688 if (GFC_DESCRIPTOR_RANK (a) == 1)
689 {
690 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
691 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
692 if (arg_extent != ret_extent)
693 runtime_error ("Incorrect extent in return array in"
694 " MATMUL intrinsic: is %ld, should be %ld",
695 (long int) ret_extent, (long int) arg_extent);
696 }
697 else if (GFC_DESCRIPTOR_RANK (b) == 1)
698 {
699 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
700 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
701 if (arg_extent != ret_extent)
702 runtime_error ("Incorrect extent in return array in"
703 " MATMUL intrinsic: is %ld, should be %ld",
704 (long int) ret_extent, (long int) arg_extent);
705 }
706 else
707 {
708 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
709 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
710 if (arg_extent != ret_extent)
711 runtime_error ("Incorrect extent in return array in"
712 " MATMUL intrinsic for dimension 1:"
713 " is %ld, should be %ld",
714 (long int) ret_extent, (long int) arg_extent);
715
716 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
717 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
718 if (arg_extent != ret_extent)
719 runtime_error ("Incorrect extent in return array in"
720 " MATMUL intrinsic for dimension 2:"
721 " is %ld, should be %ld",
722 (long int) ret_extent, (long int) arg_extent);
723 }
724 }
725
726
727 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
728 {
729 /* One-dimensional result may be addressed in the code below
730 either as a row or a column matrix. We want both cases to
731 work. */
732 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
733 }
734 else
735 {
736 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
737 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
738 }
739
740
741 if (GFC_DESCRIPTOR_RANK (a) == 1)
742 {
743 /* Treat it as a a row matrix A[1,count]. */
744 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
745 aystride = 1;
746
747 xcount = 1;
748 count = GFC_DESCRIPTOR_EXTENT(a,0);
749 }
750 else
751 {
752 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
753 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
754
755 count = GFC_DESCRIPTOR_EXTENT(a,1);
756 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
757 }
758
759 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
760 {
761 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
762 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
763 }
764
765 if (GFC_DESCRIPTOR_RANK (b) == 1)
766 {
767 /* Treat it as a column matrix B[count,1] */
768 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
769
770 /* bystride should never be used for 1-dimensional b.
771 in case it is we want it to cause a segfault, rather than
772 an incorrect result. */
773 bystride = 0xDEADBEEF;
774 ycount = 1;
775 }
776 else
777 {
778 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
779 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
780 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
781 }
782
783 abase = a->base_addr;
784 bbase = b->base_addr;
785 dest = retarray->base_addr;
786
787 /* Now that everything is set up, we perform the multiplication
788 itself. */
789
790#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
791#define min(a,b) ((a) <= (b) ? (a) : (b))
792#define max(a,b) ((a) >= (b) ? (a) : (b))
793
794 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
795 && (bxstride == 1 || bystride == 1)
796 && (((float) xcount) * ((float) ycount) * ((float) count)
797 > POW3(blas_limit)))
798 {
799 const int m = xcount, n = ycount, k = count, ldc = rystride;
800 const GFC_INTEGER_2 one = 1, zero = 0;
801 const int lda = (axstride == 1) ? aystride : axstride,
802 ldb = (bxstride == 1) ? bystride : bxstride;
803
804 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
805 {
806 assert (gemm != NULL);
807 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
808 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
809 &ldc, 1, 1);
810 return;
811 }
812 }
813
814 if (rxstride == 1 && axstride == 1 && bxstride == 1)
815 {
816 /* This block of code implements a tuned matmul, derived from
817 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
818
819 Bo Kagstrom and Per Ling
820 Department of Computing Science
821 Umea University
822 S-901 87 Umea, Sweden
823
824 from netlib.org, translated to C, and modified for matmul.m4. */
825
826 const GFC_INTEGER_2 *a, *b;
827 GFC_INTEGER_2 *c;
828 const index_type m = xcount, n = ycount, k = count;
829
830 /* System generated locals */
831 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
832 i1, i2, i3, i4, i5, i6;
833
834 /* Local variables */
835 GFC_INTEGER_2 t1[65536], /* was [256][256] */
836 f11, f12, f21, f22, f31, f32, f41, f42,
837 f13, f14, f23, f24, f33, f34, f43, f44;
838 index_type i, j, l, ii, jj, ll;
839 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
840
841 a = abase;
842 b = bbase;
843 c = retarray->base_addr;
844
845 /* Parameter adjustments */
846 c_dim1 = rystride;
847 c_offset = 1 + c_dim1;
848 c -= c_offset;
849 a_dim1 = aystride;
850 a_offset = 1 + a_dim1;
851 a -= a_offset;
852 b_dim1 = bystride;
853 b_offset = 1 + b_dim1;
854 b -= b_offset;
855
856 /* Early exit if possible */
857 if (m == 0 || n == 0 || k == 0)
858 return;
859
860 /* Empty c first. */
861 for (j=1; j<=n; j++)
862 for (i=1; i<=m; i++)
863 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
864
865 /* Start turning the crank. */
866 i1 = n;
867 for (jj = 1; jj <= i1; jj += 512)
868 {
869 /* Computing MIN */
870 i2 = 512;
871 i3 = n - jj + 1;
872 jsec = min(i2,i3);
873 ujsec = jsec - jsec % 4;
874 i2 = k;
875 for (ll = 1; ll <= i2; ll += 256)
876 {
877 /* Computing MIN */
878 i3 = 256;
879 i4 = k - ll + 1;
880 lsec = min(i3,i4);
881 ulsec = lsec - lsec % 2;
882
883 i3 = m;
884 for (ii = 1; ii <= i3; ii += 256)
885 {
886 /* Computing MIN */
887 i4 = 256;
888 i5 = m - ii + 1;
889 isec = min(i4,i5);
890 uisec = isec - isec % 2;
891 i4 = ll + ulsec - 1;
892 for (l = ll; l <= i4; l += 2)
893 {
894 i5 = ii + uisec - 1;
895 for (i = ii; i <= i5; i += 2)
896 {
897 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
898 a[i + l * a_dim1];
899 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
900 a[i + (l + 1) * a_dim1];
901 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
902 a[i + 1 + l * a_dim1];
903 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
904 a[i + 1 + (l + 1) * a_dim1];
905 }
906 if (uisec < isec)
907 {
908 t1[l - ll + 1 + (isec << 8) - 257] =
909 a[ii + isec - 1 + l * a_dim1];
910 t1[l - ll + 2 + (isec << 8) - 257] =
911 a[ii + isec - 1 + (l + 1) * a_dim1];
912 }
913 }
914 if (ulsec < lsec)
915 {
916 i4 = ii + isec - 1;
917 for (i = ii; i<= i4; ++i)
918 {
919 t1[lsec + ((i - ii + 1) << 8) - 257] =
920 a[i + (ll + lsec - 1) * a_dim1];
921 }
922 }
923
924 uisec = isec - isec % 4;
925 i4 = jj + ujsec - 1;
926 for (j = jj; j <= i4; j += 4)
927 {
928 i5 = ii + uisec - 1;
929 for (i = ii; i <= i5; i += 4)
930 {
931 f11 = c[i + j * c_dim1];
932 f21 = c[i + 1 + j * c_dim1];
933 f12 = c[i + (j + 1) * c_dim1];
934 f22 = c[i + 1 + (j + 1) * c_dim1];
935 f13 = c[i + (j + 2) * c_dim1];
936 f23 = c[i + 1 + (j + 2) * c_dim1];
937 f14 = c[i + (j + 3) * c_dim1];
938 f24 = c[i + 1 + (j + 3) * c_dim1];
939 f31 = c[i + 2 + j * c_dim1];
940 f41 = c[i + 3 + j * c_dim1];
941 f32 = c[i + 2 + (j + 1) * c_dim1];
942 f42 = c[i + 3 + (j + 1) * c_dim1];
943 f33 = c[i + 2 + (j + 2) * c_dim1];
944 f43 = c[i + 3 + (j + 2) * c_dim1];
945 f34 = c[i + 2 + (j + 3) * c_dim1];
946 f44 = c[i + 3 + (j + 3) * c_dim1];
947 i6 = ll + lsec - 1;
948 for (l = ll; l <= i6; ++l)
949 {
950 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
951 * b[l + j * b_dim1];
952 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
953 * b[l + j * b_dim1];
954 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
955 * b[l + (j + 1) * b_dim1];
956 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
957 * b[l + (j + 1) * b_dim1];
958 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
959 * b[l + (j + 2) * b_dim1];
960 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
961 * b[l + (j + 2) * b_dim1];
962 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + (j + 3) * b_dim1];
964 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + (j + 3) * b_dim1];
966 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
967 * b[l + j * b_dim1];
968 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
969 * b[l + j * b_dim1];
970 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
971 * b[l + (j + 1) * b_dim1];
972 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
973 * b[l + (j + 1) * b_dim1];
974 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
975 * b[l + (j + 2) * b_dim1];
976 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
977 * b[l + (j + 2) * b_dim1];
978 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + (j + 3) * b_dim1];
980 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + (j + 3) * b_dim1];
982 }
983 c[i + j * c_dim1] = f11;
984 c[i + 1 + j * c_dim1] = f21;
985 c[i + (j + 1) * c_dim1] = f12;
986 c[i + 1 + (j + 1) * c_dim1] = f22;
987 c[i + (j + 2) * c_dim1] = f13;
988 c[i + 1 + (j + 2) * c_dim1] = f23;
989 c[i + (j + 3) * c_dim1] = f14;
990 c[i + 1 + (j + 3) * c_dim1] = f24;
991 c[i + 2 + j * c_dim1] = f31;
992 c[i + 3 + j * c_dim1] = f41;
993 c[i + 2 + (j + 1) * c_dim1] = f32;
994 c[i + 3 + (j + 1) * c_dim1] = f42;
995 c[i + 2 + (j + 2) * c_dim1] = f33;
996 c[i + 3 + (j + 2) * c_dim1] = f43;
997 c[i + 2 + (j + 3) * c_dim1] = f34;
998 c[i + 3 + (j + 3) * c_dim1] = f44;
999 }
1000 if (uisec < isec)
1001 {
1002 i5 = ii + isec - 1;
1003 for (i = ii + uisec; i <= i5; ++i)
1004 {
1005 f11 = c[i + j * c_dim1];
1006 f12 = c[i + (j + 1) * c_dim1];
1007 f13 = c[i + (j + 2) * c_dim1];
1008 f14 = c[i + (j + 3) * c_dim1];
1009 i6 = ll + lsec - 1;
1010 for (l = ll; l <= i6; ++l)
1011 {
1012 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1013 257] * b[l + j * b_dim1];
1014 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1015 257] * b[l + (j + 1) * b_dim1];
1016 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1017 257] * b[l + (j + 2) * b_dim1];
1018 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1019 257] * b[l + (j + 3) * b_dim1];
1020 }
1021 c[i + j * c_dim1] = f11;
1022 c[i + (j + 1) * c_dim1] = f12;
1023 c[i + (j + 2) * c_dim1] = f13;
1024 c[i + (j + 3) * c_dim1] = f14;
1025 }
1026 }
1027 }
1028 if (ujsec < jsec)
1029 {
1030 i4 = jj + jsec - 1;
1031 for (j = jj + ujsec; j <= i4; ++j)
1032 {
1033 i5 = ii + uisec - 1;
1034 for (i = ii; i <= i5; i += 4)
1035 {
1036 f11 = c[i + j * c_dim1];
1037 f21 = c[i + 1 + j * c_dim1];
1038 f31 = c[i + 2 + j * c_dim1];
1039 f41 = c[i + 3 + j * c_dim1];
1040 i6 = ll + lsec - 1;
1041 for (l = ll; l <= i6; ++l)
1042 {
1043 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1044 257] * b[l + j * b_dim1];
1045 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1046 257] * b[l + j * b_dim1];
1047 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1048 257] * b[l + j * b_dim1];
1049 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1050 257] * b[l + j * b_dim1];
1051 }
1052 c[i + j * c_dim1] = f11;
1053 c[i + 1 + j * c_dim1] = f21;
1054 c[i + 2 + j * c_dim1] = f31;
1055 c[i + 3 + j * c_dim1] = f41;
1056 }
1057 i5 = ii + isec - 1;
1058 for (i = ii + uisec; i <= i5; ++i)
1059 {
1060 f11 = c[i + j * c_dim1];
1061 i6 = ll + lsec - 1;
1062 for (l = ll; l <= i6; ++l)
1063 {
1064 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1065 257] * b[l + j * b_dim1];
1066 }
1067 c[i + j * c_dim1] = f11;
1068 }
1069 }
1070 }
1071 }
1072 }
1073 }
1074 return;
1075 }
1076 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1077 {
1078 if (GFC_DESCRIPTOR_RANK (a) != 1)
1079 {
1080 const GFC_INTEGER_2 *restrict abase_x;
1081 const GFC_INTEGER_2 *restrict bbase_y;
1082 GFC_INTEGER_2 *restrict dest_y;
1083 GFC_INTEGER_2 s;
1084
1085 for (y = 0; y < ycount; y++)
1086 {
1087 bbase_y = &bbase[y*bystride];
1088 dest_y = &dest[y*rystride];
1089 for (x = 0; x < xcount; x++)
1090 {
1091 abase_x = &abase[x*axstride];
1092 s = (GFC_INTEGER_2) 0;
1093 for (n = 0; n < count; n++)
1094 s += abase_x[n] * bbase_y[n];
1095 dest_y[x] = s;
1096 }
1097 }
1098 }
1099 else
1100 {
1101 const GFC_INTEGER_2 *restrict bbase_y;
1102 GFC_INTEGER_2 s;
1103
1104 for (y = 0; y < ycount; y++)
1105 {
1106 bbase_y = &bbase[y*bystride];
1107 s = (GFC_INTEGER_2) 0;
1108 for (n = 0; n < count; n++)
1109 s += abase[n*axstride] * bbase_y[n];
1110 dest[y*rystride] = s;
1111 }
1112 }
1113 }
1114 else if (axstride < aystride)
1115 {
1116 for (y = 0; y < ycount; y++)
1117 for (x = 0; x < xcount; x++)
1118 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1119
1120 for (y = 0; y < ycount; y++)
1121 for (n = 0; n < count; n++)
1122 for (x = 0; x < xcount; x++)
1123 /* dest[x,y] += a[x,n] * b[n,y] */
1124 dest[x*rxstride + y*rystride] +=
1125 abase[x*axstride + n*aystride] *
1126 bbase[n*bxstride + y*bystride];
1127 }
1128 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1129 {
1130 const GFC_INTEGER_2 *restrict bbase_y;
1131 GFC_INTEGER_2 s;
1132
1133 for (y = 0; y < ycount; y++)
1134 {
1135 bbase_y = &bbase[y*bystride];
1136 s = (GFC_INTEGER_2) 0;
1137 for (n = 0; n < count; n++)
1138 s += abase[n*axstride] * bbase_y[n*bxstride];
1139 dest[y*rxstride] = s;
1140 }
1141 }
1142 else
1143 {
1144 const GFC_INTEGER_2 *restrict abase_x;
1145 const GFC_INTEGER_2 *restrict bbase_y;
1146 GFC_INTEGER_2 *restrict dest_y;
1147 GFC_INTEGER_2 s;
1148
1149 for (y = 0; y < ycount; y++)
1150 {
1151 bbase_y = &bbase[y*bystride];
1152 dest_y = &dest[y*rystride];
1153 for (x = 0; x < xcount; x++)
1154 {
1155 abase_x = &abase[x*axstride];
1156 s = (GFC_INTEGER_2) 0;
1157 for (n = 0; n < count; n++)
1158 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1159 dest_y[x*rxstride] = s;
1160 }
1161 }
1162 }
1163}
1164#undef POW3
1165#undef min
1166#undef max
1167
1168#endif /* HAVE_AVX2 */
1169
1170#ifdef HAVE_AVX512F
1171static void
1172matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1173 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1174 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1175static void
1176matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
1177 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1178 int blas_limit, blas_call gemm)
1179{
1180 const GFC_INTEGER_2 * restrict abase;
1181 const GFC_INTEGER_2 * restrict bbase;
1182 GFC_INTEGER_2 * restrict dest;
1183
1184 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1185 index_type x, y, n, count, xcount, ycount;
1186
1187 assert (GFC_DESCRIPTOR_RANK (a) == 2
1188 || GFC_DESCRIPTOR_RANK (b) == 2);
1189
1190/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1191
1192 Either A or B (but not both) can be rank 1:
1193
1194 o One-dimensional argument A is implicitly treated as a row matrix
1195 dimensioned [1,count], so xcount=1.
1196
1197 o One-dimensional argument B is implicitly treated as a column matrix
1198 dimensioned [count, 1], so ycount=1.
1199*/
1200
1201 if (retarray->base_addr == NULL)
1202 {
1203 if (GFC_DESCRIPTOR_RANK (a) == 1)
1204 {
1205 GFC_DIMENSION_SET(retarray->dim[0], 0,
1206 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1207 }
1208 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1209 {
1210 GFC_DIMENSION_SET(retarray->dim[0], 0,
1211 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1212 }
1213 else
1214 {
1215 GFC_DIMENSION_SET(retarray->dim[0], 0,
1216 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1217
1218 GFC_DIMENSION_SET(retarray->dim[1], 0,
1219 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1220 GFC_DESCRIPTOR_EXTENT(retarray,0));
1221 }
1222
1223 retarray->base_addr
1224 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1225 retarray->offset = 0;
1226 }
1227 else if (unlikely (compile_options.bounds_check))
1228 {
1229 index_type ret_extent, arg_extent;
1230
1231 if (GFC_DESCRIPTOR_RANK (a) == 1)
1232 {
1233 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1234 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1235 if (arg_extent != ret_extent)
1236 runtime_error ("Incorrect extent in return array in"
1237 " MATMUL intrinsic: is %ld, should be %ld",
1238 (long int) ret_extent, (long int) arg_extent);
1239 }
1240 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1241 {
1242 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1243 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1244 if (arg_extent != ret_extent)
1245 runtime_error ("Incorrect extent in return array in"
1246 " MATMUL intrinsic: is %ld, should be %ld",
1247 (long int) ret_extent, (long int) arg_extent);
1248 }
1249 else
1250 {
1251 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1252 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1253 if (arg_extent != ret_extent)
1254 runtime_error ("Incorrect extent in return array in"
1255 " MATMUL intrinsic for dimension 1:"
1256 " is %ld, should be %ld",
1257 (long int) ret_extent, (long int) arg_extent);
1258
1259 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1260 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1261 if (arg_extent != ret_extent)
1262 runtime_error ("Incorrect extent in return array in"
1263 " MATMUL intrinsic for dimension 2:"
1264 " is %ld, should be %ld",
1265 (long int) ret_extent, (long int) arg_extent);
1266 }
1267 }
1268
1269
1270 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1271 {
1272 /* One-dimensional result may be addressed in the code below
1273 either as a row or a column matrix. We want both cases to
1274 work. */
1275 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1276 }
1277 else
1278 {
1279 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1280 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1281 }
1282
1283
1284 if (GFC_DESCRIPTOR_RANK (a) == 1)
1285 {
1286 /* Treat it as a a row matrix A[1,count]. */
1287 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1288 aystride = 1;
1289
1290 xcount = 1;
1291 count = GFC_DESCRIPTOR_EXTENT(a,0);
1292 }
1293 else
1294 {
1295 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1296 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1297
1298 count = GFC_DESCRIPTOR_EXTENT(a,1);
1299 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1300 }
1301
1302 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1303 {
1304 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1305 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1306 }
1307
1308 if (GFC_DESCRIPTOR_RANK (b) == 1)
1309 {
1310 /* Treat it as a column matrix B[count,1] */
1311 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1312
1313 /* bystride should never be used for 1-dimensional b.
1314 in case it is we want it to cause a segfault, rather than
1315 an incorrect result. */
1316 bystride = 0xDEADBEEF;
1317 ycount = 1;
1318 }
1319 else
1320 {
1321 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1322 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1323 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1324 }
1325
1326 abase = a->base_addr;
1327 bbase = b->base_addr;
1328 dest = retarray->base_addr;
1329
1330 /* Now that everything is set up, we perform the multiplication
1331 itself. */
1332
1333#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1334#define min(a,b) ((a) <= (b) ? (a) : (b))
1335#define max(a,b) ((a) >= (b) ? (a) : (b))
1336
1337 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1338 && (bxstride == 1 || bystride == 1)
1339 && (((float) xcount) * ((float) ycount) * ((float) count)
1340 > POW3(blas_limit)))
1341 {
1342 const int m = xcount, n = ycount, k = count, ldc = rystride;
1343 const GFC_INTEGER_2 one = 1, zero = 0;
1344 const int lda = (axstride == 1) ? aystride : axstride,
1345 ldb = (bxstride == 1) ? bystride : bxstride;
1346
1347 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1348 {
1349 assert (gemm != NULL);
1350 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1351 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1352 &ldc, 1, 1);
1353 return;
1354 }
1355 }
1356
1357 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1358 {
1359 /* This block of code implements a tuned matmul, derived from
1360 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1361
1362 Bo Kagstrom and Per Ling
1363 Department of Computing Science
1364 Umea University
1365 S-901 87 Umea, Sweden
1366
1367 from netlib.org, translated to C, and modified for matmul.m4. */
1368
1369 const GFC_INTEGER_2 *a, *b;
1370 GFC_INTEGER_2 *c;
1371 const index_type m = xcount, n = ycount, k = count;
1372
1373 /* System generated locals */
1374 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1375 i1, i2, i3, i4, i5, i6;
1376
1377 /* Local variables */
1378 GFC_INTEGER_2 t1[65536], /* was [256][256] */
1379 f11, f12, f21, f22, f31, f32, f41, f42,
1380 f13, f14, f23, f24, f33, f34, f43, f44;
1381 index_type i, j, l, ii, jj, ll;
1382 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1383
1384 a = abase;
1385 b = bbase;
1386 c = retarray->base_addr;
1387
1388 /* Parameter adjustments */
1389 c_dim1 = rystride;
1390 c_offset = 1 + c_dim1;
1391 c -= c_offset;
1392 a_dim1 = aystride;
1393 a_offset = 1 + a_dim1;
1394 a -= a_offset;
1395 b_dim1 = bystride;
1396 b_offset = 1 + b_dim1;
1397 b -= b_offset;
1398
1399 /* Early exit if possible */
1400 if (m == 0 || n == 0 || k == 0)
1401 return;
1402
1403 /* Empty c first. */
1404 for (j=1; j<=n; j++)
1405 for (i=1; i<=m; i++)
1406 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
1407
1408 /* Start turning the crank. */
1409 i1 = n;
1410 for (jj = 1; jj <= i1; jj += 512)
1411 {
1412 /* Computing MIN */
1413 i2 = 512;
1414 i3 = n - jj + 1;
1415 jsec = min(i2,i3);
1416 ujsec = jsec - jsec % 4;
1417 i2 = k;
1418 for (ll = 1; ll <= i2; ll += 256)
1419 {
1420 /* Computing MIN */
1421 i3 = 256;
1422 i4 = k - ll + 1;
1423 lsec = min(i3,i4);
1424 ulsec = lsec - lsec % 2;
1425
1426 i3 = m;
1427 for (ii = 1; ii <= i3; ii += 256)
1428 {
1429 /* Computing MIN */
1430 i4 = 256;
1431 i5 = m - ii + 1;
1432 isec = min(i4,i5);
1433 uisec = isec - isec % 2;
1434 i4 = ll + ulsec - 1;
1435 for (l = ll; l <= i4; l += 2)
1436 {
1437 i5 = ii + uisec - 1;
1438 for (i = ii; i <= i5; i += 2)
1439 {
1440 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1441 a[i + l * a_dim1];
1442 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1443 a[i + (l + 1) * a_dim1];
1444 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1445 a[i + 1 + l * a_dim1];
1446 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1447 a[i + 1 + (l + 1) * a_dim1];
1448 }
1449 if (uisec < isec)
1450 {
1451 t1[l - ll + 1 + (isec << 8) - 257] =
1452 a[ii + isec - 1 + l * a_dim1];
1453 t1[l - ll + 2 + (isec << 8) - 257] =
1454 a[ii + isec - 1 + (l + 1) * a_dim1];
1455 }
1456 }
1457 if (ulsec < lsec)
1458 {
1459 i4 = ii + isec - 1;
1460 for (i = ii; i<= i4; ++i)
1461 {
1462 t1[lsec + ((i - ii + 1) << 8) - 257] =
1463 a[i + (ll + lsec - 1) * a_dim1];
1464 }
1465 }
1466
1467 uisec = isec - isec % 4;
1468 i4 = jj + ujsec - 1;
1469 for (j = jj; j <= i4; j += 4)
1470 {
1471 i5 = ii + uisec - 1;
1472 for (i = ii; i <= i5; i += 4)
1473 {
1474 f11 = c[i + j * c_dim1];
1475 f21 = c[i + 1 + j * c_dim1];
1476 f12 = c[i + (j + 1) * c_dim1];
1477 f22 = c[i + 1 + (j + 1) * c_dim1];
1478 f13 = c[i + (j + 2) * c_dim1];
1479 f23 = c[i + 1 + (j + 2) * c_dim1];
1480 f14 = c[i + (j + 3) * c_dim1];
1481 f24 = c[i + 1 + (j + 3) * c_dim1];
1482 f31 = c[i + 2 + j * c_dim1];
1483 f41 = c[i + 3 + j * c_dim1];
1484 f32 = c[i + 2 + (j + 1) * c_dim1];
1485 f42 = c[i + 3 + (j + 1) * c_dim1];
1486 f33 = c[i + 2 + (j + 2) * c_dim1];
1487 f43 = c[i + 3 + (j + 2) * c_dim1];
1488 f34 = c[i + 2 + (j + 3) * c_dim1];
1489 f44 = c[i + 3 + (j + 3) * c_dim1];
1490 i6 = ll + lsec - 1;
1491 for (l = ll; l <= i6; ++l)
1492 {
1493 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1494 * b[l + j * b_dim1];
1495 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1496 * b[l + j * b_dim1];
1497 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1498 * b[l + (j + 1) * b_dim1];
1499 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1500 * b[l + (j + 1) * b_dim1];
1501 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1502 * b[l + (j + 2) * b_dim1];
1503 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1504 * b[l + (j + 2) * b_dim1];
1505 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1506 * b[l + (j + 3) * b_dim1];
1507 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1508 * b[l + (j + 3) * b_dim1];
1509 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1510 * b[l + j * b_dim1];
1511 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1512 * b[l + j * b_dim1];
1513 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1514 * b[l + (j + 1) * b_dim1];
1515 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1516 * b[l + (j + 1) * b_dim1];
1517 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1518 * b[l + (j + 2) * b_dim1];
1519 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1520 * b[l + (j + 2) * b_dim1];
1521 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1522 * b[l + (j + 3) * b_dim1];
1523 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1524 * b[l + (j + 3) * b_dim1];
1525 }
1526 c[i + j * c_dim1] = f11;
1527 c[i + 1 + j * c_dim1] = f21;
1528 c[i + (j + 1) * c_dim1] = f12;
1529 c[i + 1 + (j + 1) * c_dim1] = f22;
1530 c[i + (j + 2) * c_dim1] = f13;
1531 c[i + 1 + (j + 2) * c_dim1] = f23;
1532 c[i + (j + 3) * c_dim1] = f14;
1533 c[i + 1 + (j + 3) * c_dim1] = f24;
1534 c[i + 2 + j * c_dim1] = f31;
1535 c[i + 3 + j * c_dim1] = f41;
1536 c[i + 2 + (j + 1) * c_dim1] = f32;
1537 c[i + 3 + (j + 1) * c_dim1] = f42;
1538 c[i + 2 + (j + 2) * c_dim1] = f33;
1539 c[i + 3 + (j + 2) * c_dim1] = f43;
1540 c[i + 2 + (j + 3) * c_dim1] = f34;
1541 c[i + 3 + (j + 3) * c_dim1] = f44;
1542 }
1543 if (uisec < isec)
1544 {
1545 i5 = ii + isec - 1;
1546 for (i = ii + uisec; i <= i5; ++i)
1547 {
1548 f11 = c[i + j * c_dim1];
1549 f12 = c[i + (j + 1) * c_dim1];
1550 f13 = c[i + (j + 2) * c_dim1];
1551 f14 = c[i + (j + 3) * c_dim1];
1552 i6 = ll + lsec - 1;
1553 for (l = ll; l <= i6; ++l)
1554 {
1555 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1556 257] * b[l + j * b_dim1];
1557 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1558 257] * b[l + (j + 1) * b_dim1];
1559 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1560 257] * b[l + (j + 2) * b_dim1];
1561 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1562 257] * b[l + (j + 3) * b_dim1];
1563 }
1564 c[i + j * c_dim1] = f11;
1565 c[i + (j + 1) * c_dim1] = f12;
1566 c[i + (j + 2) * c_dim1] = f13;
1567 c[i + (j + 3) * c_dim1] = f14;
1568 }
1569 }
1570 }
1571 if (ujsec < jsec)
1572 {
1573 i4 = jj + jsec - 1;
1574 for (j = jj + ujsec; j <= i4; ++j)
1575 {
1576 i5 = ii + uisec - 1;
1577 for (i = ii; i <= i5; i += 4)
1578 {
1579 f11 = c[i + j * c_dim1];
1580 f21 = c[i + 1 + j * c_dim1];
1581 f31 = c[i + 2 + j * c_dim1];
1582 f41 = c[i + 3 + j * c_dim1];
1583 i6 = ll + lsec - 1;
1584 for (l = ll; l <= i6; ++l)
1585 {
1586 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1587 257] * b[l + j * b_dim1];
1588 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1589 257] * b[l + j * b_dim1];
1590 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1591 257] * b[l + j * b_dim1];
1592 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1593 257] * b[l + j * b_dim1];
1594 }
1595 c[i + j * c_dim1] = f11;
1596 c[i + 1 + j * c_dim1] = f21;
1597 c[i + 2 + j * c_dim1] = f31;
1598 c[i + 3 + j * c_dim1] = f41;
1599 }
1600 i5 = ii + isec - 1;
1601 for (i = ii + uisec; i <= i5; ++i)
1602 {
1603 f11 = c[i + j * c_dim1];
1604 i6 = ll + lsec - 1;
1605 for (l = ll; l <= i6; ++l)
1606 {
1607 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1608 257] * b[l + j * b_dim1];
1609 }
1610 c[i + j * c_dim1] = f11;
1611 }
1612 }
1613 }
1614 }
1615 }
1616 }
1617 return;
1618 }
1619 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1620 {
1621 if (GFC_DESCRIPTOR_RANK (a) != 1)
1622 {
1623 const GFC_INTEGER_2 *restrict abase_x;
1624 const GFC_INTEGER_2 *restrict bbase_y;
1625 GFC_INTEGER_2 *restrict dest_y;
1626 GFC_INTEGER_2 s;
1627
1628 for (y = 0; y < ycount; y++)
1629 {
1630 bbase_y = &bbase[y*bystride];
1631 dest_y = &dest[y*rystride];
1632 for (x = 0; x < xcount; x++)
1633 {
1634 abase_x = &abase[x*axstride];
1635 s = (GFC_INTEGER_2) 0;
1636 for (n = 0; n < count; n++)
1637 s += abase_x[n] * bbase_y[n];
1638 dest_y[x] = s;
1639 }
1640 }
1641 }
1642 else
1643 {
1644 const GFC_INTEGER_2 *restrict bbase_y;
1645 GFC_INTEGER_2 s;
1646
1647 for (y = 0; y < ycount; y++)
1648 {
1649 bbase_y = &bbase[y*bystride];
1650 s = (GFC_INTEGER_2) 0;
1651 for (n = 0; n < count; n++)
1652 s += abase[n*axstride] * bbase_y[n];
1653 dest[y*rystride] = s;
1654 }
1655 }
1656 }
1657 else if (axstride < aystride)
1658 {
1659 for (y = 0; y < ycount; y++)
1660 for (x = 0; x < xcount; x++)
1661 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1662
1663 for (y = 0; y < ycount; y++)
1664 for (n = 0; n < count; n++)
1665 for (x = 0; x < xcount; x++)
1666 /* dest[x,y] += a[x,n] * b[n,y] */
1667 dest[x*rxstride + y*rystride] +=
1668 abase[x*axstride + n*aystride] *
1669 bbase[n*bxstride + y*bystride];
1670 }
1671 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1672 {
1673 const GFC_INTEGER_2 *restrict bbase_y;
1674 GFC_INTEGER_2 s;
1675
1676 for (y = 0; y < ycount; y++)
1677 {
1678 bbase_y = &bbase[y*bystride];
1679 s = (GFC_INTEGER_2) 0;
1680 for (n = 0; n < count; n++)
1681 s += abase[n*axstride] * bbase_y[n*bxstride];
1682 dest[y*rxstride] = s;
1683 }
1684 }
1685 else
1686 {
1687 const GFC_INTEGER_2 *restrict abase_x;
1688 const GFC_INTEGER_2 *restrict bbase_y;
1689 GFC_INTEGER_2 *restrict dest_y;
1690 GFC_INTEGER_2 s;
1691
1692 for (y = 0; y < ycount; y++)
1693 {
1694 bbase_y = &bbase[y*bystride];
1695 dest_y = &dest[y*rystride];
1696 for (x = 0; x < xcount; x++)
1697 {
1698 abase_x = &abase[x*axstride];
1699 s = (GFC_INTEGER_2) 0;
1700 for (n = 0; n < count; n++)
1701 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1702 dest_y[x*rxstride] = s;
1703 }
1704 }
1705 }
1706}
1707#undef POW3
1708#undef min
1709#undef max
1710
1711#endif /* HAVE_AVX512F */
1712
1713/* Function to fall back to if there is no special processor-specific version. */
1714static void
1715matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
1716 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
1717 int blas_limit, blas_call gemm)
1718{
1719 const GFC_INTEGER_2 * restrict abase;
1720 const GFC_INTEGER_2 * restrict bbase;
1721 GFC_INTEGER_2 * restrict dest;
1722
1723 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1724 index_type x, y, n, count, xcount, ycount;
1725
1726 assert (GFC_DESCRIPTOR_RANK (a) == 2
1727 || GFC_DESCRIPTOR_RANK (b) == 2);
1728
1729/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1730
1731 Either A or B (but not both) can be rank 1:
1732
1733 o One-dimensional argument A is implicitly treated as a row matrix
1734 dimensioned [1,count], so xcount=1.
1735
1736 o One-dimensional argument B is implicitly treated as a column matrix
1737 dimensioned [count, 1], so ycount=1.
1738*/
1739
1740 if (retarray->base_addr == NULL)
1741 {
1742 if (GFC_DESCRIPTOR_RANK (a) == 1)
1743 {
1744 GFC_DIMENSION_SET(retarray->dim[0], 0,
1745 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1746 }
1747 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1748 {
1749 GFC_DIMENSION_SET(retarray->dim[0], 0,
1750 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1751 }
1752 else
1753 {
1754 GFC_DIMENSION_SET(retarray->dim[0], 0,
1755 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1756
1757 GFC_DIMENSION_SET(retarray->dim[1], 0,
1758 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1759 GFC_DESCRIPTOR_EXTENT(retarray,0));
1760 }
1761
1762 retarray->base_addr
1763 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
1764 retarray->offset = 0;
1765 }
1766 else if (unlikely (compile_options.bounds_check))
1767 {
1768 index_type ret_extent, arg_extent;
1769
1770 if (GFC_DESCRIPTOR_RANK (a) == 1)
1771 {
1772 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1773 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1774 if (arg_extent != ret_extent)
1775 runtime_error ("Incorrect extent in return array in"
1776 " MATMUL intrinsic: is %ld, should be %ld",
1777 (long int) ret_extent, (long int) arg_extent);
1778 }
1779 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1780 {
1781 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1782 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1783 if (arg_extent != ret_extent)
1784 runtime_error ("Incorrect extent in return array in"
1785 " MATMUL intrinsic: is %ld, should be %ld",
1786 (long int) ret_extent, (long int) arg_extent);
1787 }
1788 else
1789 {
1790 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1791 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1792 if (arg_extent != ret_extent)
1793 runtime_error ("Incorrect extent in return array in"
1794 " MATMUL intrinsic for dimension 1:"
1795 " is %ld, should be %ld",
1796 (long int) ret_extent, (long int) arg_extent);
1797
1798 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1799 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1800 if (arg_extent != ret_extent)
1801 runtime_error ("Incorrect extent in return array in"
1802 " MATMUL intrinsic for dimension 2:"
1803 " is %ld, should be %ld",
1804 (long int) ret_extent, (long int) arg_extent);
1805 }
1806 }
1807
1808
1809 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1810 {
1811 /* One-dimensional result may be addressed in the code below
1812 either as a row or a column matrix. We want both cases to
1813 work. */
1814 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1815 }
1816 else
1817 {
1818 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1819 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1820 }
1821
1822
1823 if (GFC_DESCRIPTOR_RANK (a) == 1)
1824 {
1825 /* Treat it as a a row matrix A[1,count]. */
1826 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1827 aystride = 1;
1828
1829 xcount = 1;
1830 count = GFC_DESCRIPTOR_EXTENT(a,0);
1831 }
1832 else
1833 {
1834 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1835 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1836
1837 count = GFC_DESCRIPTOR_EXTENT(a,1);
1838 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1839 }
1840
1841 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1842 {
1843 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1844 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1845 }
1846
1847 if (GFC_DESCRIPTOR_RANK (b) == 1)
1848 {
1849 /* Treat it as a column matrix B[count,1] */
1850 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1851
1852 /* bystride should never be used for 1-dimensional b.
1853 in case it is we want it to cause a segfault, rather than
1854 an incorrect result. */
1855 bystride = 0xDEADBEEF;
1856 ycount = 1;
1857 }
1858 else
1859 {
1860 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1861 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1862 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1863 }
1864
1865 abase = a->base_addr;
1866 bbase = b->base_addr;
1867 dest = retarray->base_addr;
1868
1869 /* Now that everything is set up, we perform the multiplication
1870 itself. */
1871
1872#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1873#define min(a,b) ((a) <= (b) ? (a) : (b))
1874#define max(a,b) ((a) >= (b) ? (a) : (b))
1875
1876 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1877 && (bxstride == 1 || bystride == 1)
1878 && (((float) xcount) * ((float) ycount) * ((float) count)
1879 > POW3(blas_limit)))
1880 {
1881 const int m = xcount, n = ycount, k = count, ldc = rystride;
1882 const GFC_INTEGER_2 one = 1, zero = 0;
1883 const int lda = (axstride == 1) ? aystride : axstride,
1884 ldb = (bxstride == 1) ? bystride : bxstride;
1885
1886 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1887 {
1888 assert (gemm != NULL);
1889 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1890 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1891 &ldc, 1, 1);
1892 return;
1893 }
1894 }
1895
1896 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1897 {
1898 /* This block of code implements a tuned matmul, derived from
1899 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1900
1901 Bo Kagstrom and Per Ling
1902 Department of Computing Science
1903 Umea University
1904 S-901 87 Umea, Sweden
1905
1906 from netlib.org, translated to C, and modified for matmul.m4. */
1907
1908 const GFC_INTEGER_2 *a, *b;
1909 GFC_INTEGER_2 *c;
1910 const index_type m = xcount, n = ycount, k = count;
1911
1912 /* System generated locals */
1913 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1914 i1, i2, i3, i4, i5, i6;
1915
1916 /* Local variables */
1917 GFC_INTEGER_2 t1[65536], /* was [256][256] */
1918 f11, f12, f21, f22, f31, f32, f41, f42,
1919 f13, f14, f23, f24, f33, f34, f43, f44;
1920 index_type i, j, l, ii, jj, ll;
1921 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1922
1923 a = abase;
1924 b = bbase;
1925 c = retarray->base_addr;
1926
1927 /* Parameter adjustments */
1928 c_dim1 = rystride;
1929 c_offset = 1 + c_dim1;
1930 c -= c_offset;
1931 a_dim1 = aystride;
1932 a_offset = 1 + a_dim1;
1933 a -= a_offset;
1934 b_dim1 = bystride;
1935 b_offset = 1 + b_dim1;
1936 b -= b_offset;
1937
1938 /* Early exit if possible */
1939 if (m == 0 || n == 0 || k == 0)
1940 return;
1941
1942 /* Empty c first. */
1943 for (j=1; j<=n; j++)
1944 for (i=1; i<=m; i++)
1945 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
1946
1947 /* Start turning the crank. */
1948 i1 = n;
1949 for (jj = 1; jj <= i1; jj += 512)
1950 {
1951 /* Computing MIN */
1952 i2 = 512;
1953 i3 = n - jj + 1;
1954 jsec = min(i2,i3);
1955 ujsec = jsec - jsec % 4;
1956 i2 = k;
1957 for (ll = 1; ll <= i2; ll += 256)
1958 {
1959 /* Computing MIN */
1960 i3 = 256;
1961 i4 = k - ll + 1;
1962 lsec = min(i3,i4);
1963 ulsec = lsec - lsec % 2;
1964
1965 i3 = m;
1966 for (ii = 1; ii <= i3; ii += 256)
1967 {
1968 /* Computing MIN */
1969 i4 = 256;
1970 i5 = m - ii + 1;
1971 isec = min(i4,i5);
1972 uisec = isec - isec % 2;
1973 i4 = ll + ulsec - 1;
1974 for (l = ll; l <= i4; l += 2)
1975 {
1976 i5 = ii + uisec - 1;
1977 for (i = ii; i <= i5; i += 2)
1978 {
1979 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1980 a[i + l * a_dim1];
1981 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1982 a[i + (l + 1) * a_dim1];
1983 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1984 a[i + 1 + l * a_dim1];
1985 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1986 a[i + 1 + (l + 1) * a_dim1];
1987 }
1988 if (uisec < isec)
1989 {
1990 t1[l - ll + 1 + (isec << 8) - 257] =
1991 a[ii + isec - 1 + l * a_dim1];
1992 t1[l - ll + 2 + (isec << 8) - 257] =
1993 a[ii + isec - 1 + (l + 1) * a_dim1];
1994 }
1995 }
1996 if (ulsec < lsec)
1997 {
1998 i4 = ii + isec - 1;
1999 for (i = ii; i<= i4; ++i)
2000 {
2001 t1[lsec + ((i - ii + 1) << 8) - 257] =
2002 a[i + (ll + lsec - 1) * a_dim1];
2003 }
2004 }
2005
2006 uisec = isec - isec % 4;
2007 i4 = jj + ujsec - 1;
2008 for (j = jj; j <= i4; j += 4)
2009 {
2010 i5 = ii + uisec - 1;
2011 for (i = ii; i <= i5; i += 4)
2012 {
2013 f11 = c[i + j * c_dim1];
2014 f21 = c[i + 1 + j * c_dim1];
2015 f12 = c[i + (j + 1) * c_dim1];
2016 f22 = c[i + 1 + (j + 1) * c_dim1];
2017 f13 = c[i + (j + 2) * c_dim1];
2018 f23 = c[i + 1 + (j + 2) * c_dim1];
2019 f14 = c[i + (j + 3) * c_dim1];
2020 f24 = c[i + 1 + (j + 3) * c_dim1];
2021 f31 = c[i + 2 + j * c_dim1];
2022 f41 = c[i + 3 + j * c_dim1];
2023 f32 = c[i + 2 + (j + 1) * c_dim1];
2024 f42 = c[i + 3 + (j + 1) * c_dim1];
2025 f33 = c[i + 2 + (j + 2) * c_dim1];
2026 f43 = c[i + 3 + (j + 2) * c_dim1];
2027 f34 = c[i + 2 + (j + 3) * c_dim1];
2028 f44 = c[i + 3 + (j + 3) * c_dim1];
2029 i6 = ll + lsec - 1;
2030 for (l = ll; l <= i6; ++l)
2031 {
2032 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2033 * b[l + j * b_dim1];
2034 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2035 * b[l + j * b_dim1];
2036 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2037 * b[l + (j + 1) * b_dim1];
2038 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2039 * b[l + (j + 1) * b_dim1];
2040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2041 * b[l + (j + 2) * b_dim1];
2042 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2043 * b[l + (j + 2) * b_dim1];
2044 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2045 * b[l + (j + 3) * b_dim1];
2046 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2047 * b[l + (j + 3) * b_dim1];
2048 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2049 * b[l + j * b_dim1];
2050 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2051 * b[l + j * b_dim1];
2052 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2053 * b[l + (j + 1) * b_dim1];
2054 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2055 * b[l + (j + 1) * b_dim1];
2056 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2057 * b[l + (j + 2) * b_dim1];
2058 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2059 * b[l + (j + 2) * b_dim1];
2060 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2061 * b[l + (j + 3) * b_dim1];
2062 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2063 * b[l + (j + 3) * b_dim1];
2064 }
2065 c[i + j * c_dim1] = f11;
2066 c[i + 1 + j * c_dim1] = f21;
2067 c[i + (j + 1) * c_dim1] = f12;
2068 c[i + 1 + (j + 1) * c_dim1] = f22;
2069 c[i + (j + 2) * c_dim1] = f13;
2070 c[i + 1 + (j + 2) * c_dim1] = f23;
2071 c[i + (j + 3) * c_dim1] = f14;
2072 c[i + 1 + (j + 3) * c_dim1] = f24;
2073 c[i + 2 + j * c_dim1] = f31;
2074 c[i + 3 + j * c_dim1] = f41;
2075 c[i + 2 + (j + 1) * c_dim1] = f32;
2076 c[i + 3 + (j + 1) * c_dim1] = f42;
2077 c[i + 2 + (j + 2) * c_dim1] = f33;
2078 c[i + 3 + (j + 2) * c_dim1] = f43;
2079 c[i + 2 + (j + 3) * c_dim1] = f34;
2080 c[i + 3 + (j + 3) * c_dim1] = f44;
2081 }
2082 if (uisec < isec)
2083 {
2084 i5 = ii + isec - 1;
2085 for (i = ii + uisec; i <= i5; ++i)
2086 {
2087 f11 = c[i + j * c_dim1];
2088 f12 = c[i + (j + 1) * c_dim1];
2089 f13 = c[i + (j + 2) * c_dim1];
2090 f14 = c[i + (j + 3) * c_dim1];
2091 i6 = ll + lsec - 1;
2092 for (l = ll; l <= i6; ++l)
2093 {
2094 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2095 257] * b[l + j * b_dim1];
2096 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2097 257] * b[l + (j + 1) * b_dim1];
2098 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2099 257] * b[l + (j + 2) * b_dim1];
2100 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2101 257] * b[l + (j + 3) * b_dim1];
2102 }
2103 c[i + j * c_dim1] = f11;
2104 c[i + (j + 1) * c_dim1] = f12;
2105 c[i + (j + 2) * c_dim1] = f13;
2106 c[i + (j + 3) * c_dim1] = f14;
2107 }
2108 }
2109 }
2110 if (ujsec < jsec)
2111 {
2112 i4 = jj + jsec - 1;
2113 for (j = jj + ujsec; j <= i4; ++j)
2114 {
2115 i5 = ii + uisec - 1;
2116 for (i = ii; i <= i5; i += 4)
2117 {
2118 f11 = c[i + j * c_dim1];
2119 f21 = c[i + 1 + j * c_dim1];
2120 f31 = c[i + 2 + j * c_dim1];
2121 f41 = c[i + 3 + j * c_dim1];
2122 i6 = ll + lsec - 1;
2123 for (l = ll; l <= i6; ++l)
2124 {
2125 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2126 257] * b[l + j * b_dim1];
2127 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2128 257] * b[l + j * b_dim1];
2129 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2130 257] * b[l + j * b_dim1];
2131 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2132 257] * b[l + j * b_dim1];
2133 }
2134 c[i + j * c_dim1] = f11;
2135 c[i + 1 + j * c_dim1] = f21;
2136 c[i + 2 + j * c_dim1] = f31;
2137 c[i + 3 + j * c_dim1] = f41;
2138 }
2139 i5 = ii + isec - 1;
2140 for (i = ii + uisec; i <= i5; ++i)
2141 {
2142 f11 = c[i + j * c_dim1];
2143 i6 = ll + lsec - 1;
2144 for (l = ll; l <= i6; ++l)
2145 {
2146 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2147 257] * b[l + j * b_dim1];
2148 }
2149 c[i + j * c_dim1] = f11;
2150 }
2151 }
2152 }
2153 }
2154 }
2155 }
2156 return;
2157 }
2158 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2159 {
2160 if (GFC_DESCRIPTOR_RANK (a) != 1)
2161 {
2162 const GFC_INTEGER_2 *restrict abase_x;
2163 const GFC_INTEGER_2 *restrict bbase_y;
2164 GFC_INTEGER_2 *restrict dest_y;
2165 GFC_INTEGER_2 s;
2166
2167 for (y = 0; y < ycount; y++)
2168 {
2169 bbase_y = &bbase[y*bystride];
2170 dest_y = &dest[y*rystride];
2171 for (x = 0; x < xcount; x++)
2172 {
2173 abase_x = &abase[x*axstride];
2174 s = (GFC_INTEGER_2) 0;
2175 for (n = 0; n < count; n++)
2176 s += abase_x[n] * bbase_y[n];
2177 dest_y[x] = s;
2178 }
2179 }
2180 }
2181 else
2182 {
2183 const GFC_INTEGER_2 *restrict bbase_y;
2184 GFC_INTEGER_2 s;
2185
2186 for (y = 0; y < ycount; y++)
2187 {
2188 bbase_y = &bbase[y*bystride];
2189 s = (GFC_INTEGER_2) 0;
2190 for (n = 0; n < count; n++)
2191 s += abase[n*axstride] * bbase_y[n];
2192 dest[y*rystride] = s;
2193 }
2194 }
2195 }
2196 else if (axstride < aystride)
2197 {
2198 for (y = 0; y < ycount; y++)
2199 for (x = 0; x < xcount; x++)
2200 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2201
2202 for (y = 0; y < ycount; y++)
2203 for (n = 0; n < count; n++)
2204 for (x = 0; x < xcount; x++)
2205 /* dest[x,y] += a[x,n] * b[n,y] */
2206 dest[x*rxstride + y*rystride] +=
2207 abase[x*axstride + n*aystride] *
2208 bbase[n*bxstride + y*bystride];
2209 }
2210 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2211 {
2212 const GFC_INTEGER_2 *restrict bbase_y;
2213 GFC_INTEGER_2 s;
2214
2215 for (y = 0; y < ycount; y++)
2216 {
2217 bbase_y = &bbase[y*bystride];
2218 s = (GFC_INTEGER_2) 0;
2219 for (n = 0; n < count; n++)
2220 s += abase[n*axstride] * bbase_y[n*bxstride];
2221 dest[y*rxstride] = s;
2222 }
2223 }
2224 else
2225 {
2226 const GFC_INTEGER_2 *restrict abase_x;
2227 const GFC_INTEGER_2 *restrict bbase_y;
2228 GFC_INTEGER_2 *restrict dest_y;
2229 GFC_INTEGER_2 s;
2230
2231 for (y = 0; y < ycount; y++)
2232 {
2233 bbase_y = &bbase[y*bystride];
2234 dest_y = &dest[y*rystride];
2235 for (x = 0; x < xcount; x++)
2236 {
2237 abase_x = &abase[x*axstride];
2238 s = (GFC_INTEGER_2) 0;
2239 for (n = 0; n < count; n++)
2240 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2241 dest_y[x*rxstride] = s;
2242 }
2243 }
2244 }
2245}
2246#undef POW3
2247#undef min
2248#undef max
2249
2250
2251/* Compiling main function, with selection code for the processor. */
2252
2253/* Currently, this is i386 only. Adjust for other architectures. */
2254
2255#include <config/i386/cpuinfo.h>
2256void matmul_i2 (gfc_array_i2 * const restrict retarray,
2257 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2258 int blas_limit, blas_call gemm)
2259{
2260 static void (*matmul_p) (gfc_array_i2 * const restrict retarray,
2261 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2262 int blas_limit, blas_call gemm) = NULL;
2263
2264 if (matmul_p == NULL)
2265 {
2266 matmul_p = matmul_i2_vanilla;
2267 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2268 {
2269 /* Run down the available processors in order of preference. */
2270#ifdef HAVE_AVX512F
2271 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2272 {
2273 matmul_p = matmul_i2_avx512f;
2274 goto tailcall;
2275 }
2276
2277#endif /* HAVE_AVX512F */
2278
2279#ifdef HAVE_AVX2
2280 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2281 {
2282 matmul_p = matmul_i2_avx2;
2283 goto tailcall;
2284 }
2285
2286#endif
2287
2288#ifdef HAVE_AVX
2289 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2290 {
2291 matmul_p = matmul_i2_avx;
2292 goto tailcall;
2293 }
2294#endif /* HAVE_AVX */
2295 }
2296 }
2297
2298tailcall:
2299 (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
2300}
2301
2302#else /* Just the vanilla function. */
2303
567c915b
TK
2304void
2305matmul_i2 (gfc_array_i2 * const restrict retarray,
2306 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
2307 int blas_limit, blas_call gemm)
2308{
2309 const GFC_INTEGER_2 * restrict abase;
2310 const GFC_INTEGER_2 * restrict bbase;
2311 GFC_INTEGER_2 * restrict dest;
2312
2313 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2314 index_type x, y, n, count, xcount, ycount;
2315
2316 assert (GFC_DESCRIPTOR_RANK (a) == 2
2317 || GFC_DESCRIPTOR_RANK (b) == 2);
2318
2319/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2320
2321 Either A or B (but not both) can be rank 1:
2322
2323 o One-dimensional argument A is implicitly treated as a row matrix
2324 dimensioned [1,count], so xcount=1.
2325
2326 o One-dimensional argument B is implicitly treated as a column matrix
2327 dimensioned [count, 1], so ycount=1.
5d70ab07 2328*/
567c915b 2329
21d1335b 2330 if (retarray->base_addr == NULL)
567c915b
TK
2331 {
2332 if (GFC_DESCRIPTOR_RANK (a) == 1)
2333 {
dfb55fdc
TK
2334 GFC_DIMENSION_SET(retarray->dim[0], 0,
2335 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
567c915b
TK
2336 }
2337 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2338 {
dfb55fdc
TK
2339 GFC_DIMENSION_SET(retarray->dim[0], 0,
2340 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b
TK
2341 }
2342 else
2343 {
dfb55fdc
TK
2344 GFC_DIMENSION_SET(retarray->dim[0], 0,
2345 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
567c915b 2346
dfb55fdc
TK
2347 GFC_DIMENSION_SET(retarray->dim[1], 0,
2348 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2349 GFC_DESCRIPTOR_EXTENT(retarray,0));
567c915b
TK
2350 }
2351
21d1335b 2352 retarray->base_addr
92e6f3a4 2353 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
567c915b
TK
2354 retarray->offset = 0;
2355 }
5d70ab07
JD
2356 else if (unlikely (compile_options.bounds_check))
2357 {
2358 index_type ret_extent, arg_extent;
2359
2360 if (GFC_DESCRIPTOR_RANK (a) == 1)
2361 {
2362 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2363 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2364 if (arg_extent != ret_extent)
2365 runtime_error ("Incorrect extent in return array in"
2366 " MATMUL intrinsic: is %ld, should be %ld",
2367 (long int) ret_extent, (long int) arg_extent);
2368 }
2369 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2370 {
2371 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2372 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2373 if (arg_extent != ret_extent)
2374 runtime_error ("Incorrect extent in return array in"
2375 " MATMUL intrinsic: is %ld, should be %ld",
2376 (long int) ret_extent, (long int) arg_extent);
2377 }
2378 else
2379 {
2380 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2381 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2382 if (arg_extent != ret_extent)
2383 runtime_error ("Incorrect extent in return array in"
2384 " MATMUL intrinsic for dimension 1:"
2385 " is %ld, should be %ld",
2386 (long int) ret_extent, (long int) arg_extent);
2387
2388 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2389 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2390 if (arg_extent != ret_extent)
2391 runtime_error ("Incorrect extent in return array in"
2392 " MATMUL intrinsic for dimension 2:"
2393 " is %ld, should be %ld",
2394 (long int) ret_extent, (long int) arg_extent);
2395 }
2396 }
567c915b
TK
2397
2398
2399 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2400 {
2401 /* One-dimensional result may be addressed in the code below
2402 either as a row or a column matrix. We want both cases to
2403 work. */
dfb55fdc 2404 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
567c915b
TK
2405 }
2406 else
2407 {
dfb55fdc
TK
2408 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2409 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
567c915b
TK
2410 }
2411
2412
2413 if (GFC_DESCRIPTOR_RANK (a) == 1)
2414 {
2415 /* Treat it as a a row matrix A[1,count]. */
dfb55fdc 2416 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
567c915b
TK
2417 aystride = 1;
2418
2419 xcount = 1;
dfb55fdc 2420 count = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b
TK
2421 }
2422 else
2423 {
dfb55fdc
TK
2424 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2425 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
567c915b 2426
dfb55fdc
TK
2427 count = GFC_DESCRIPTOR_EXTENT(a,1);
2428 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
567c915b
TK
2429 }
2430
dfb55fdc 2431 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
7edc89d4 2432 {
dfb55fdc 2433 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
7edc89d4
TK
2434 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2435 }
567c915b
TK
2436
2437 if (GFC_DESCRIPTOR_RANK (b) == 1)
2438 {
2439 /* Treat it as a column matrix B[count,1] */
dfb55fdc 2440 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
567c915b
TK
2441
2442 /* bystride should never be used for 1-dimensional b.
2443 in case it is we want it to cause a segfault, rather than
2444 an incorrect result. */
2445 bystride = 0xDEADBEEF;
2446 ycount = 1;
2447 }
2448 else
2449 {
dfb55fdc
TK
2450 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2451 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2452 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
567c915b
TK
2453 }
2454
21d1335b
TB
2455 abase = a->base_addr;
2456 bbase = b->base_addr;
2457 dest = retarray->base_addr;
567c915b 2458
5d70ab07 2459 /* Now that everything is set up, we perform the multiplication
567c915b
TK
2460 itself. */
2461
2462#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
5d70ab07
JD
2463#define min(a,b) ((a) <= (b) ? (a) : (b))
2464#define max(a,b) ((a) >= (b) ? (a) : (b))
567c915b
TK
2465
2466 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2467 && (bxstride == 1 || bystride == 1)
2468 && (((float) xcount) * ((float) ycount) * ((float) count)
2469 > POW3(blas_limit)))
567c915b 2470 {
5d70ab07
JD
2471 const int m = xcount, n = ycount, k = count, ldc = rystride;
2472 const GFC_INTEGER_2 one = 1, zero = 0;
2473 const int lda = (axstride == 1) ? aystride : axstride,
2474 ldb = (bxstride == 1) ? bystride : bxstride;
567c915b 2475
5d70ab07 2476 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
567c915b 2477 {
5d70ab07
JD
2478 assert (gemm != NULL);
2479 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2480 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2481 &ldc, 1, 1);
2482 return;
567c915b 2483 }
5d70ab07 2484 }
567c915b 2485
5d70ab07
JD
2486 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2487 {
2488 /* This block of code implements a tuned matmul, derived from
2489 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2490
2491 Bo Kagstrom and Per Ling
2492 Department of Computing Science
2493 Umea University
2494 S-901 87 Umea, Sweden
2495
2496 from netlib.org, translated to C, and modified for matmul.m4. */
2497
2498 const GFC_INTEGER_2 *a, *b;
2499 GFC_INTEGER_2 *c;
2500 const index_type m = xcount, n = ycount, k = count;
2501
2502 /* System generated locals */
2503 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2504 i1, i2, i3, i4, i5, i6;
2505
2506 /* Local variables */
2507 GFC_INTEGER_2 t1[65536], /* was [256][256] */
2508 f11, f12, f21, f22, f31, f32, f41, f42,
2509 f13, f14, f23, f24, f33, f34, f43, f44;
2510 index_type i, j, l, ii, jj, ll;
2511 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2512
2513 a = abase;
2514 b = bbase;
2515 c = retarray->base_addr;
2516
2517 /* Parameter adjustments */
2518 c_dim1 = rystride;
2519 c_offset = 1 + c_dim1;
2520 c -= c_offset;
2521 a_dim1 = aystride;
2522 a_offset = 1 + a_dim1;
2523 a -= a_offset;
2524 b_dim1 = bystride;
2525 b_offset = 1 + b_dim1;
2526 b -= b_offset;
2527
2528 /* Early exit if possible */
2529 if (m == 0 || n == 0 || k == 0)
2530 return;
2531
2532 /* Empty c first. */
2533 for (j=1; j<=n; j++)
2534 for (i=1; i<=m; i++)
2535 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
2536
2537 /* Start turning the crank. */
2538 i1 = n;
2539 for (jj = 1; jj <= i1; jj += 512)
567c915b 2540 {
5d70ab07
JD
2541 /* Computing MIN */
2542 i2 = 512;
2543 i3 = n - jj + 1;
2544 jsec = min(i2,i3);
2545 ujsec = jsec - jsec % 4;
2546 i2 = k;
2547 for (ll = 1; ll <= i2; ll += 256)
567c915b 2548 {
5d70ab07
JD
2549 /* Computing MIN */
2550 i3 = 256;
2551 i4 = k - ll + 1;
2552 lsec = min(i3,i4);
2553 ulsec = lsec - lsec % 2;
2554
2555 i3 = m;
2556 for (ii = 1; ii <= i3; ii += 256)
567c915b 2557 {
5d70ab07
JD
2558 /* Computing MIN */
2559 i4 = 256;
2560 i5 = m - ii + 1;
2561 isec = min(i4,i5);
2562 uisec = isec - isec % 2;
2563 i4 = ll + ulsec - 1;
2564 for (l = ll; l <= i4; l += 2)
2565 {
2566 i5 = ii + uisec - 1;
2567 for (i = ii; i <= i5; i += 2)
2568 {
2569 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2570 a[i + l * a_dim1];
2571 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2572 a[i + (l + 1) * a_dim1];
2573 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2574 a[i + 1 + l * a_dim1];
2575 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2576 a[i + 1 + (l + 1) * a_dim1];
2577 }
2578 if (uisec < isec)
2579 {
2580 t1[l - ll + 1 + (isec << 8) - 257] =
2581 a[ii + isec - 1 + l * a_dim1];
2582 t1[l - ll + 2 + (isec << 8) - 257] =
2583 a[ii + isec - 1 + (l + 1) * a_dim1];
2584 }
2585 }
2586 if (ulsec < lsec)
2587 {
2588 i4 = ii + isec - 1;
2589 for (i = ii; i<= i4; ++i)
2590 {
2591 t1[lsec + ((i - ii + 1) << 8) - 257] =
2592 a[i + (ll + lsec - 1) * a_dim1];
2593 }
2594 }
2595
2596 uisec = isec - isec % 4;
2597 i4 = jj + ujsec - 1;
2598 for (j = jj; j <= i4; j += 4)
2599 {
2600 i5 = ii + uisec - 1;
2601 for (i = ii; i <= i5; i += 4)
2602 {
2603 f11 = c[i + j * c_dim1];
2604 f21 = c[i + 1 + j * c_dim1];
2605 f12 = c[i + (j + 1) * c_dim1];
2606 f22 = c[i + 1 + (j + 1) * c_dim1];
2607 f13 = c[i + (j + 2) * c_dim1];
2608 f23 = c[i + 1 + (j + 2) * c_dim1];
2609 f14 = c[i + (j + 3) * c_dim1];
2610 f24 = c[i + 1 + (j + 3) * c_dim1];
2611 f31 = c[i + 2 + j * c_dim1];
2612 f41 = c[i + 3 + j * c_dim1];
2613 f32 = c[i + 2 + (j + 1) * c_dim1];
2614 f42 = c[i + 3 + (j + 1) * c_dim1];
2615 f33 = c[i + 2 + (j + 2) * c_dim1];
2616 f43 = c[i + 3 + (j + 2) * c_dim1];
2617 f34 = c[i + 2 + (j + 3) * c_dim1];
2618 f44 = c[i + 3 + (j + 3) * c_dim1];
2619 i6 = ll + lsec - 1;
2620 for (l = ll; l <= i6; ++l)
2621 {
2622 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2623 * b[l + j * b_dim1];
2624 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2625 * b[l + j * b_dim1];
2626 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627 * b[l + (j + 1) * b_dim1];
2628 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629 * b[l + (j + 1) * b_dim1];
2630 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631 * b[l + (j + 2) * b_dim1];
2632 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633 * b[l + (j + 2) * b_dim1];
2634 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635 * b[l + (j + 3) * b_dim1];
2636 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637 * b[l + (j + 3) * b_dim1];
2638 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2639 * b[l + j * b_dim1];
2640 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2641 * b[l + j * b_dim1];
2642 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643 * b[l + (j + 1) * b_dim1];
2644 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645 * b[l + (j + 1) * b_dim1];
2646 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647 * b[l + (j + 2) * b_dim1];
2648 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649 * b[l + (j + 2) * b_dim1];
2650 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651 * b[l + (j + 3) * b_dim1];
2652 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653 * b[l + (j + 3) * b_dim1];
2654 }
2655 c[i + j * c_dim1] = f11;
2656 c[i + 1 + j * c_dim1] = f21;
2657 c[i + (j + 1) * c_dim1] = f12;
2658 c[i + 1 + (j + 1) * c_dim1] = f22;
2659 c[i + (j + 2) * c_dim1] = f13;
2660 c[i + 1 + (j + 2) * c_dim1] = f23;
2661 c[i + (j + 3) * c_dim1] = f14;
2662 c[i + 1 + (j + 3) * c_dim1] = f24;
2663 c[i + 2 + j * c_dim1] = f31;
2664 c[i + 3 + j * c_dim1] = f41;
2665 c[i + 2 + (j + 1) * c_dim1] = f32;
2666 c[i + 3 + (j + 1) * c_dim1] = f42;
2667 c[i + 2 + (j + 2) * c_dim1] = f33;
2668 c[i + 3 + (j + 2) * c_dim1] = f43;
2669 c[i + 2 + (j + 3) * c_dim1] = f34;
2670 c[i + 3 + (j + 3) * c_dim1] = f44;
2671 }
2672 if (uisec < isec)
2673 {
2674 i5 = ii + isec - 1;
2675 for (i = ii + uisec; i <= i5; ++i)
2676 {
2677 f11 = c[i + j * c_dim1];
2678 f12 = c[i + (j + 1) * c_dim1];
2679 f13 = c[i + (j + 2) * c_dim1];
2680 f14 = c[i + (j + 3) * c_dim1];
2681 i6 = ll + lsec - 1;
2682 for (l = ll; l <= i6; ++l)
2683 {
2684 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2685 257] * b[l + j * b_dim1];
2686 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2687 257] * b[l + (j + 1) * b_dim1];
2688 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689 257] * b[l + (j + 2) * b_dim1];
2690 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691 257] * b[l + (j + 3) * b_dim1];
2692 }
2693 c[i + j * c_dim1] = f11;
2694 c[i + (j + 1) * c_dim1] = f12;
2695 c[i + (j + 2) * c_dim1] = f13;
2696 c[i + (j + 3) * c_dim1] = f14;
2697 }
2698 }
2699 }
2700 if (ujsec < jsec)
2701 {
2702 i4 = jj + jsec - 1;
2703 for (j = jj + ujsec; j <= i4; ++j)
2704 {
2705 i5 = ii + uisec - 1;
2706 for (i = ii; i <= i5; i += 4)
2707 {
2708 f11 = c[i + j * c_dim1];
2709 f21 = c[i + 1 + j * c_dim1];
2710 f31 = c[i + 2 + j * c_dim1];
2711 f41 = c[i + 3 + j * c_dim1];
2712 i6 = ll + lsec - 1;
2713 for (l = ll; l <= i6; ++l)
2714 {
2715 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2716 257] * b[l + j * b_dim1];
2717 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2718 257] * b[l + j * b_dim1];
2719 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2720 257] * b[l + j * b_dim1];
2721 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2722 257] * b[l + j * b_dim1];
2723 }
2724 c[i + j * c_dim1] = f11;
2725 c[i + 1 + j * c_dim1] = f21;
2726 c[i + 2 + j * c_dim1] = f31;
2727 c[i + 3 + j * c_dim1] = f41;
2728 }
2729 i5 = ii + isec - 1;
2730 for (i = ii + uisec; i <= i5; ++i)
2731 {
2732 f11 = c[i + j * c_dim1];
2733 i6 = ll + lsec - 1;
2734 for (l = ll; l <= i6; ++l)
2735 {
2736 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2737 257] * b[l + j * b_dim1];
2738 }
2739 c[i + j * c_dim1] = f11;
2740 }
2741 }
2742 }
567c915b
TK
2743 }
2744 }
2745 }
5d70ab07 2746 return;
567c915b
TK
2747 }
2748 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2749 {
2750 if (GFC_DESCRIPTOR_RANK (a) != 1)
2751 {
2752 const GFC_INTEGER_2 *restrict abase_x;
2753 const GFC_INTEGER_2 *restrict bbase_y;
2754 GFC_INTEGER_2 *restrict dest_y;
2755 GFC_INTEGER_2 s;
2756
2757 for (y = 0; y < ycount; y++)
2758 {
2759 bbase_y = &bbase[y*bystride];
2760 dest_y = &dest[y*rystride];
2761 for (x = 0; x < xcount; x++)
2762 {
2763 abase_x = &abase[x*axstride];
2764 s = (GFC_INTEGER_2) 0;
2765 for (n = 0; n < count; n++)
2766 s += abase_x[n] * bbase_y[n];
2767 dest_y[x] = s;
2768 }
2769 }
2770 }
2771 else
2772 {
2773 const GFC_INTEGER_2 *restrict bbase_y;
2774 GFC_INTEGER_2 s;
2775
2776 for (y = 0; y < ycount; y++)
2777 {
2778 bbase_y = &bbase[y*bystride];
2779 s = (GFC_INTEGER_2) 0;
2780 for (n = 0; n < count; n++)
2781 s += abase[n*axstride] * bbase_y[n];
2782 dest[y*rystride] = s;
2783 }
2784 }
2785 }
2786 else if (axstride < aystride)
2787 {
2788 for (y = 0; y < ycount; y++)
2789 for (x = 0; x < xcount; x++)
2790 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
2791
2792 for (y = 0; y < ycount; y++)
2793 for (n = 0; n < count; n++)
2794 for (x = 0; x < xcount; x++)
2795 /* dest[x,y] += a[x,n] * b[n,y] */
5d70ab07
JD
2796 dest[x*rxstride + y*rystride] +=
2797 abase[x*axstride + n*aystride] *
2798 bbase[n*bxstride + y*bystride];
567c915b
TK
2799 }
2800 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2801 {
2802 const GFC_INTEGER_2 *restrict bbase_y;
2803 GFC_INTEGER_2 s;
2804
2805 for (y = 0; y < ycount; y++)
2806 {
2807 bbase_y = &bbase[y*bystride];
2808 s = (GFC_INTEGER_2) 0;
2809 for (n = 0; n < count; n++)
2810 s += abase[n*axstride] * bbase_y[n*bxstride];
2811 dest[y*rxstride] = s;
2812 }
2813 }
2814 else
2815 {
2816 const GFC_INTEGER_2 *restrict abase_x;
2817 const GFC_INTEGER_2 *restrict bbase_y;
2818 GFC_INTEGER_2 *restrict dest_y;
2819 GFC_INTEGER_2 s;
2820
2821 for (y = 0; y < ycount; y++)
2822 {
2823 bbase_y = &bbase[y*bystride];
2824 dest_y = &dest[y*rystride];
2825 for (x = 0; x < xcount; x++)
2826 {
2827 abase_x = &abase[x*axstride];
2828 s = (GFC_INTEGER_2) 0;
2829 for (n = 0; n < count; n++)
2830 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2831 dest_y[x*rxstride] = s;
2832 }
2833 }
2834 }
2835}
31cfd832
TK
2836#undef POW3
2837#undef min
2838#undef max
2839
567c915b 2840#endif
31cfd832
TK
2841#endif
2842