]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/m4/matmul.m4
[multiple changes]
[thirdparty/gcc.git] / libgfortran / m4 / matmul.m4
1 `/* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <stdlib.h>
28 #include <string.h>
29 #include <assert.h>'
30
31 include(iparm.m4)dnl
32
33 `#if defined (HAVE_'rtype_name`)
34
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36 passed to us by the front-end, in which case we call it for large
37 matrices. */
38
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40 const int *, const 'rtype_name` *, const 'rtype_name` *,
41 const int *, const 'rtype_name` *, const int *,
42 const 'rtype_name` *, 'rtype_name` *, const int *,
43 int, int);
44
45 /* The order of loops is different in the case of plain matrix
46 multiplication C=MATMUL(A,B), and in the frequent special case where
47 the argument A is the temporary result of a TRANSPOSE intrinsic:
48 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
49 looking at their strides.
50
51 The equivalent Fortran pseudo-code is:
52
53 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
54 IF (.NOT.IS_TRANSPOSED(A)) THEN
55 C = 0
56 DO J=1,N
57 DO K=1,COUNT
58 DO I=1,M
59 C(I,J) = C(I,J)+A(I,K)*B(K,J)
60 ELSE
61 DO J=1,N
62 DO I=1,M
63 S = 0
64 DO K=1,COUNT
65 S = S+A(I,K)*B(K,J)
66 C(I,J) = S
67 ENDIF
68 */
69
70 /* If try_blas is set to a nonzero value, then the matmul function will
71 see if there is a way to perform the matrix multiplication by a call
72 to the BLAS gemm function. */
73
74 extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
75 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
76 int blas_limit, blas_call gemm);
77 export_proto(matmul_'rtype_code`);
78
79 void
80 matmul_'rtype_code` ('rtype` * const restrict retarray,
81 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
82 int blas_limit, blas_call gemm)
83 {
84 const 'rtype_name` * restrict abase;
85 const 'rtype_name` * restrict bbase;
86 'rtype_name` * restrict dest;
87
88 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
89 index_type x, y, n, count, xcount, ycount;
90
91 assert (GFC_DESCRIPTOR_RANK (a) == 2
92 || GFC_DESCRIPTOR_RANK (b) == 2);
93
94 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
95
96 Either A or B (but not both) can be rank 1:
97
98 o One-dimensional argument A is implicitly treated as a row matrix
99 dimensioned [1,count], so xcount=1.
100
101 o One-dimensional argument B is implicitly treated as a column matrix
102 dimensioned [count, 1], so ycount=1.
103 */
104
105 if (retarray->base_addr == NULL)
106 {
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 {
109 GFC_DIMENSION_SET(retarray->dim[0], 0,
110 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
111 }
112 else if (GFC_DESCRIPTOR_RANK (b) == 1)
113 {
114 GFC_DIMENSION_SET(retarray->dim[0], 0,
115 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
116 }
117 else
118 {
119 GFC_DIMENSION_SET(retarray->dim[0], 0,
120 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
121
122 GFC_DIMENSION_SET(retarray->dim[1], 0,
123 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
124 GFC_DESCRIPTOR_EXTENT(retarray,0));
125 }
126
127 retarray->base_addr
128 = xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
129 retarray->offset = 0;
130 }
131 else if (unlikely (compile_options.bounds_check))
132 {
133 index_type ret_extent, arg_extent;
134
135 if (GFC_DESCRIPTOR_RANK (a) == 1)
136 {
137 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
138 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
139 if (arg_extent != ret_extent)
140 runtime_error ("Incorrect extent in return array in"
141 " MATMUL intrinsic: is %ld, should be %ld",
142 (long int) ret_extent, (long int) arg_extent);
143 }
144 else if (GFC_DESCRIPTOR_RANK (b) == 1)
145 {
146 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
147 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
148 if (arg_extent != ret_extent)
149 runtime_error ("Incorrect extent in return array in"
150 " MATMUL intrinsic: is %ld, should be %ld",
151 (long int) ret_extent, (long int) arg_extent);
152 }
153 else
154 {
155 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
156 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
157 if (arg_extent != ret_extent)
158 runtime_error ("Incorrect extent in return array in"
159 " MATMUL intrinsic for dimension 1:"
160 " is %ld, should be %ld",
161 (long int) ret_extent, (long int) arg_extent);
162
163 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
164 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
165 if (arg_extent != ret_extent)
166 runtime_error ("Incorrect extent in return array in"
167 " MATMUL intrinsic for dimension 2:"
168 " is %ld, should be %ld",
169 (long int) ret_extent, (long int) arg_extent);
170 }
171 }
172 '
173 sinclude(`matmul_asm_'rtype_code`.m4')dnl
174 `
175 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
176 {
177 /* One-dimensional result may be addressed in the code below
178 either as a row or a column matrix. We want both cases to
179 work. */
180 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
181 }
182 else
183 {
184 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
185 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
186 }
187
188
189 if (GFC_DESCRIPTOR_RANK (a) == 1)
190 {
191 /* Treat it as a a row matrix A[1,count]. */
192 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
193 aystride = 1;
194
195 xcount = 1;
196 count = GFC_DESCRIPTOR_EXTENT(a,0);
197 }
198 else
199 {
200 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
201 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
202
203 count = GFC_DESCRIPTOR_EXTENT(a,1);
204 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
205 }
206
207 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
208 {
209 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
210 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
211 }
212
213 if (GFC_DESCRIPTOR_RANK (b) == 1)
214 {
215 /* Treat it as a column matrix B[count,1] */
216 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
217
218 /* bystride should never be used for 1-dimensional b.
219 in case it is we want it to cause a segfault, rather than
220 an incorrect result. */
221 bystride = 0xDEADBEEF;
222 ycount = 1;
223 }
224 else
225 {
226 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
227 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
228 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
229 }
230
231 abase = a->base_addr;
232 bbase = b->base_addr;
233 dest = retarray->base_addr;
234
235 /* Now that everything is set up, we perform the multiplication
236 itself. */
237
238 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
239 #define min(a,b) ((a) <= (b) ? (a) : (b))
240 #define max(a,b) ((a) >= (b) ? (a) : (b))
241
242 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
243 && (bxstride == 1 || bystride == 1)
244 && (((float) xcount) * ((float) ycount) * ((float) count)
245 > POW3(blas_limit)))
246 {
247 const int m = xcount, n = ycount, k = count, ldc = rystride;
248 const 'rtype_name` one = 1, zero = 0;
249 const int lda = (axstride == 1) ? aystride : axstride,
250 ldb = (bxstride == 1) ? bystride : bxstride;
251
252 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
253 {
254 assert (gemm != NULL);
255 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
256 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
257 &ldc, 1, 1);
258 return;
259 }
260 }
261
262 if (rxstride == 1 && axstride == 1 && bxstride == 1)
263 {
264 /* This block of code implements a tuned matmul, derived from
265 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
266
267 Bo Kagstrom and Per Ling
268 Department of Computing Science
269 Umea University
270 S-901 87 Umea, Sweden
271
272 from netlib.org, translated to C, and modified for matmul.m4. */
273
274 const 'rtype_name` *a, *b;
275 'rtype_name` *c;
276 const index_type m = xcount, n = ycount, k = count;
277
278 /* System generated locals */
279 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
280 i1, i2, i3, i4, i5, i6;
281
282 /* Local variables */
283 'rtype_name` t1[65536], /* was [256][256] */
284 f11, f12, f21, f22, f31, f32, f41, f42,
285 f13, f14, f23, f24, f33, f34, f43, f44;
286 index_type i, j, l, ii, jj, ll;
287 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
288
289 a = abase;
290 b = bbase;
291 c = retarray->base_addr;
292
293 /* Parameter adjustments */
294 c_dim1 = rystride;
295 c_offset = 1 + c_dim1;
296 c -= c_offset;
297 a_dim1 = aystride;
298 a_offset = 1 + a_dim1;
299 a -= a_offset;
300 b_dim1 = bystride;
301 b_offset = 1 + b_dim1;
302 b -= b_offset;
303
304 /* Early exit if possible */
305 if (m == 0 || n == 0 || k == 0)
306 return;
307
308 /* Empty c first. */
309 for (j=1; j<=n; j++)
310 for (i=1; i<=m; i++)
311 c[i + j * c_dim1] = ('rtype_name`)0;
312
313 /* Start turning the crank. */
314 i1 = n;
315 for (jj = 1; jj <= i1; jj += 512)
316 {
317 /* Computing MIN */
318 i2 = 512;
319 i3 = n - jj + 1;
320 jsec = min(i2,i3);
321 ujsec = jsec - jsec % 4;
322 i2 = k;
323 for (ll = 1; ll <= i2; ll += 256)
324 {
325 /* Computing MIN */
326 i3 = 256;
327 i4 = k - ll + 1;
328 lsec = min(i3,i4);
329 ulsec = lsec - lsec % 2;
330
331 i3 = m;
332 for (ii = 1; ii <= i3; ii += 256)
333 {
334 /* Computing MIN */
335 i4 = 256;
336 i5 = m - ii + 1;
337 isec = min(i4,i5);
338 uisec = isec - isec % 2;
339 i4 = ll + ulsec - 1;
340 for (l = ll; l <= i4; l += 2)
341 {
342 i5 = ii + uisec - 1;
343 for (i = ii; i <= i5; i += 2)
344 {
345 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
346 a[i + l * a_dim1];
347 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
348 a[i + (l + 1) * a_dim1];
349 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
350 a[i + 1 + l * a_dim1];
351 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
352 a[i + 1 + (l + 1) * a_dim1];
353 }
354 if (uisec < isec)
355 {
356 t1[l - ll + 1 + (isec << 8) - 257] =
357 a[ii + isec - 1 + l * a_dim1];
358 t1[l - ll + 2 + (isec << 8) - 257] =
359 a[ii + isec - 1 + (l + 1) * a_dim1];
360 }
361 }
362 if (ulsec < lsec)
363 {
364 i4 = ii + isec - 1;
365 for (i = ii; i<= i4; ++i)
366 {
367 t1[lsec + ((i - ii + 1) << 8) - 257] =
368 a[i + (ll + lsec - 1) * a_dim1];
369 }
370 }
371
372 uisec = isec - isec % 4;
373 i4 = jj + ujsec - 1;
374 for (j = jj; j <= i4; j += 4)
375 {
376 i5 = ii + uisec - 1;
377 for (i = ii; i <= i5; i += 4)
378 {
379 f11 = c[i + j * c_dim1];
380 f21 = c[i + 1 + j * c_dim1];
381 f12 = c[i + (j + 1) * c_dim1];
382 f22 = c[i + 1 + (j + 1) * c_dim1];
383 f13 = c[i + (j + 2) * c_dim1];
384 f23 = c[i + 1 + (j + 2) * c_dim1];
385 f14 = c[i + (j + 3) * c_dim1];
386 f24 = c[i + 1 + (j + 3) * c_dim1];
387 f31 = c[i + 2 + j * c_dim1];
388 f41 = c[i + 3 + j * c_dim1];
389 f32 = c[i + 2 + (j + 1) * c_dim1];
390 f42 = c[i + 3 + (j + 1) * c_dim1];
391 f33 = c[i + 2 + (j + 2) * c_dim1];
392 f43 = c[i + 3 + (j + 2) * c_dim1];
393 f34 = c[i + 2 + (j + 3) * c_dim1];
394 f44 = c[i + 3 + (j + 3) * c_dim1];
395 i6 = ll + lsec - 1;
396 for (l = ll; l <= i6; ++l)
397 {
398 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
399 * b[l + j * b_dim1];
400 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
401 * b[l + j * b_dim1];
402 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
403 * b[l + (j + 1) * b_dim1];
404 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
405 * b[l + (j + 1) * b_dim1];
406 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
407 * b[l + (j + 2) * b_dim1];
408 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
409 * b[l + (j + 2) * b_dim1];
410 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
411 * b[l + (j + 3) * b_dim1];
412 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
413 * b[l + (j + 3) * b_dim1];
414 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
415 * b[l + j * b_dim1];
416 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
417 * b[l + j * b_dim1];
418 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
421 * b[l + (j + 1) * b_dim1];
422 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
425 * b[l + (j + 2) * b_dim1];
426 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
428 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
429 * b[l + (j + 3) * b_dim1];
430 }
431 c[i + j * c_dim1] = f11;
432 c[i + 1 + j * c_dim1] = f21;
433 c[i + (j + 1) * c_dim1] = f12;
434 c[i + 1 + (j + 1) * c_dim1] = f22;
435 c[i + (j + 2) * c_dim1] = f13;
436 c[i + 1 + (j + 2) * c_dim1] = f23;
437 c[i + (j + 3) * c_dim1] = f14;
438 c[i + 1 + (j + 3) * c_dim1] = f24;
439 c[i + 2 + j * c_dim1] = f31;
440 c[i + 3 + j * c_dim1] = f41;
441 c[i + 2 + (j + 1) * c_dim1] = f32;
442 c[i + 3 + (j + 1) * c_dim1] = f42;
443 c[i + 2 + (j + 2) * c_dim1] = f33;
444 c[i + 3 + (j + 2) * c_dim1] = f43;
445 c[i + 2 + (j + 3) * c_dim1] = f34;
446 c[i + 3 + (j + 3) * c_dim1] = f44;
447 }
448 if (uisec < isec)
449 {
450 i5 = ii + isec - 1;
451 for (i = ii + uisec; i <= i5; ++i)
452 {
453 f11 = c[i + j * c_dim1];
454 f12 = c[i + (j + 1) * c_dim1];
455 f13 = c[i + (j + 2) * c_dim1];
456 f14 = c[i + (j + 3) * c_dim1];
457 i6 = ll + lsec - 1;
458 for (l = ll; l <= i6; ++l)
459 {
460 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461 257] * b[l + j * b_dim1];
462 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463 257] * b[l + (j + 1) * b_dim1];
464 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
465 257] * b[l + (j + 2) * b_dim1];
466 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
467 257] * b[l + (j + 3) * b_dim1];
468 }
469 c[i + j * c_dim1] = f11;
470 c[i + (j + 1) * c_dim1] = f12;
471 c[i + (j + 2) * c_dim1] = f13;
472 c[i + (j + 3) * c_dim1] = f14;
473 }
474 }
475 }
476 if (ujsec < jsec)
477 {
478 i4 = jj + jsec - 1;
479 for (j = jj + ujsec; j <= i4; ++j)
480 {
481 i5 = ii + uisec - 1;
482 for (i = ii; i <= i5; i += 4)
483 {
484 f11 = c[i + j * c_dim1];
485 f21 = c[i + 1 + j * c_dim1];
486 f31 = c[i + 2 + j * c_dim1];
487 f41 = c[i + 3 + j * c_dim1];
488 i6 = ll + lsec - 1;
489 for (l = ll; l <= i6; ++l)
490 {
491 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
492 257] * b[l + j * b_dim1];
493 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
494 257] * b[l + j * b_dim1];
495 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
496 257] * b[l + j * b_dim1];
497 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
498 257] * b[l + j * b_dim1];
499 }
500 c[i + j * c_dim1] = f11;
501 c[i + 1 + j * c_dim1] = f21;
502 c[i + 2 + j * c_dim1] = f31;
503 c[i + 3 + j * c_dim1] = f41;
504 }
505 i5 = ii + isec - 1;
506 for (i = ii + uisec; i <= i5; ++i)
507 {
508 f11 = c[i + j * c_dim1];
509 i6 = ll + lsec - 1;
510 for (l = ll; l <= i6; ++l)
511 {
512 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
513 257] * b[l + j * b_dim1];
514 }
515 c[i + j * c_dim1] = f11;
516 }
517 }
518 }
519 }
520 }
521 }
522 return;
523 }
524 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
525 {
526 if (GFC_DESCRIPTOR_RANK (a) != 1)
527 {
528 const 'rtype_name` *restrict abase_x;
529 const 'rtype_name` *restrict bbase_y;
530 'rtype_name` *restrict dest_y;
531 'rtype_name` s;
532
533 for (y = 0; y < ycount; y++)
534 {
535 bbase_y = &bbase[y*bystride];
536 dest_y = &dest[y*rystride];
537 for (x = 0; x < xcount; x++)
538 {
539 abase_x = &abase[x*axstride];
540 s = ('rtype_name`) 0;
541 for (n = 0; n < count; n++)
542 s += abase_x[n] * bbase_y[n];
543 dest_y[x] = s;
544 }
545 }
546 }
547 else
548 {
549 const 'rtype_name` *restrict bbase_y;
550 'rtype_name` s;
551
552 for (y = 0; y < ycount; y++)
553 {
554 bbase_y = &bbase[y*bystride];
555 s = ('rtype_name`) 0;
556 for (n = 0; n < count; n++)
557 s += abase[n*axstride] * bbase_y[n];
558 dest[y*rystride] = s;
559 }
560 }
561 }
562 else if (axstride < aystride)
563 {
564 for (y = 0; y < ycount; y++)
565 for (x = 0; x < xcount; x++)
566 dest[x*rxstride + y*rystride] = ('rtype_name`)0;
567
568 for (y = 0; y < ycount; y++)
569 for (n = 0; n < count; n++)
570 for (x = 0; x < xcount; x++)
571 /* dest[x,y] += a[x,n] * b[n,y] */
572 dest[x*rxstride + y*rystride] +=
573 abase[x*axstride + n*aystride] *
574 bbase[n*bxstride + y*bystride];
575 }
576 else if (GFC_DESCRIPTOR_RANK (a) == 1)
577 {
578 const 'rtype_name` *restrict bbase_y;
579 'rtype_name` s;
580
581 for (y = 0; y < ycount; y++)
582 {
583 bbase_y = &bbase[y*bystride];
584 s = ('rtype_name`) 0;
585 for (n = 0; n < count; n++)
586 s += abase[n*axstride] * bbase_y[n*bxstride];
587 dest[y*rxstride] = s;
588 }
589 }
590 else
591 {
592 const 'rtype_name` *restrict abase_x;
593 const 'rtype_name` *restrict bbase_y;
594 'rtype_name` *restrict dest_y;
595 'rtype_name` s;
596
597 for (y = 0; y < ycount; y++)
598 {
599 bbase_y = &bbase[y*bystride];
600 dest_y = &dest[y*rystride];
601 for (x = 0; x < xcount; x++)
602 {
603 abase_x = &abase[x*axstride];
604 s = ('rtype_name`) 0;
605 for (n = 0; n < count; n++)
606 s += abase_x[n*aystride] * bbase_y[n*bxstride];
607 dest_y[x*rxstride] = s;
608 }
609 }
610 }
611 }'
612 #endif