]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgfortran/generated/matmulavx128_i2.c
Update copyright years.
[thirdparty/gcc.git] / libgfortran / generated / matmulavx128_i2.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
3 Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 /* These are the specific versions of matmul with -mprefer-avx128. */
32
33 #if defined (HAVE_GFC_INTEGER_2)
34
35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
36 passed to us by the front-end, in which case we call it for large
37 matrices. */
38
39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
40 const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
41 const int *, const GFC_INTEGER_2 *, const int *,
42 const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
43 int, int);
44
45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
46 void
47 matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
48 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
49 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
50 internal_proto(matmul_i2_avx128_fma3);
51 void
52 matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
53 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
54 int blas_limit, blas_call gemm)
55 {
56 const GFC_INTEGER_2 * restrict abase;
57 const GFC_INTEGER_2 * restrict bbase;
58 GFC_INTEGER_2 * restrict dest;
59
60 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
61 index_type x, y, n, count, xcount, ycount;
62
63 assert (GFC_DESCRIPTOR_RANK (a) == 2
64 || GFC_DESCRIPTOR_RANK (b) == 2);
65
66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
67
68 Either A or B (but not both) can be rank 1:
69
70 o One-dimensional argument A is implicitly treated as a row matrix
71 dimensioned [1,count], so xcount=1.
72
73 o One-dimensional argument B is implicitly treated as a column matrix
74 dimensioned [count, 1], so ycount=1.
75 */
76
77 if (retarray->base_addr == NULL)
78 {
79 if (GFC_DESCRIPTOR_RANK (a) == 1)
80 {
81 GFC_DIMENSION_SET(retarray->dim[0], 0,
82 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
83 }
84 else if (GFC_DESCRIPTOR_RANK (b) == 1)
85 {
86 GFC_DIMENSION_SET(retarray->dim[0], 0,
87 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
88 }
89 else
90 {
91 GFC_DIMENSION_SET(retarray->dim[0], 0,
92 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
93
94 GFC_DIMENSION_SET(retarray->dim[1], 0,
95 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
96 GFC_DESCRIPTOR_EXTENT(retarray,0));
97 }
98
99 retarray->base_addr
100 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
101 retarray->offset = 0;
102 }
103 else if (unlikely (compile_options.bounds_check))
104 {
105 index_type ret_extent, arg_extent;
106
107 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 {
109 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
110 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
111 if (arg_extent != ret_extent)
112 runtime_error ("Array bound mismatch for dimension 1 of "
113 "array (%ld/%ld) ",
114 (long int) ret_extent, (long int) arg_extent);
115 }
116 else if (GFC_DESCRIPTOR_RANK (b) == 1)
117 {
118 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
119 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
120 if (arg_extent != ret_extent)
121 runtime_error ("Array bound mismatch for dimension 1 of "
122 "array (%ld/%ld) ",
123 (long int) ret_extent, (long int) arg_extent);
124 }
125 else
126 {
127 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
128 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
129 if (arg_extent != ret_extent)
130 runtime_error ("Array bound mismatch for dimension 1 of "
131 "array (%ld/%ld) ",
132 (long int) ret_extent, (long int) arg_extent);
133
134 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
135 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
136 if (arg_extent != ret_extent)
137 runtime_error ("Array bound mismatch for dimension 2 of "
138 "array (%ld/%ld) ",
139 (long int) ret_extent, (long int) arg_extent);
140 }
141 }
142
143
144 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
145 {
146 /* One-dimensional result may be addressed in the code below
147 either as a row or a column matrix. We want both cases to
148 work. */
149 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
150 }
151 else
152 {
153 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
154 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
155 }
156
157
158 if (GFC_DESCRIPTOR_RANK (a) == 1)
159 {
160 /* Treat it as a a row matrix A[1,count]. */
161 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
162 aystride = 1;
163
164 xcount = 1;
165 count = GFC_DESCRIPTOR_EXTENT(a,0);
166 }
167 else
168 {
169 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
170 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
171
172 count = GFC_DESCRIPTOR_EXTENT(a,1);
173 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
174 }
175
176 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
177 {
178 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
179 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
180 "in dimension 1: is %ld, should be %ld",
181 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
182 }
183
184 if (GFC_DESCRIPTOR_RANK (b) == 1)
185 {
186 /* Treat it as a column matrix B[count,1] */
187 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
188
189 /* bystride should never be used for 1-dimensional b.
190 The value is only used for calculation of the
191 memory by the buffer. */
192 bystride = 256;
193 ycount = 1;
194 }
195 else
196 {
197 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
198 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
199 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
200 }
201
202 abase = a->base_addr;
203 bbase = b->base_addr;
204 dest = retarray->base_addr;
205
206 /* Now that everything is set up, we perform the multiplication
207 itself. */
208
209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
210 #define min(a,b) ((a) <= (b) ? (a) : (b))
211 #define max(a,b) ((a) >= (b) ? (a) : (b))
212
213 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
214 && (bxstride == 1 || bystride == 1)
215 && (((float) xcount) * ((float) ycount) * ((float) count)
216 > POW3(blas_limit)))
217 {
218 const int m = xcount, n = ycount, k = count, ldc = rystride;
219 const GFC_INTEGER_2 one = 1, zero = 0;
220 const int lda = (axstride == 1) ? aystride : axstride,
221 ldb = (bxstride == 1) ? bystride : bxstride;
222
223 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
224 {
225 assert (gemm != NULL);
226 const char *transa, *transb;
227 if (try_blas & 2)
228 transa = "C";
229 else
230 transa = axstride == 1 ? "N" : "T";
231
232 if (try_blas & 4)
233 transb = "C";
234 else
235 transb = bxstride == 1 ? "N" : "T";
236
237 gemm (transa, transb , &m,
238 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
239 &ldc, 1, 1);
240 return;
241 }
242 }
243
244 if (rxstride == 1 && axstride == 1 && bxstride == 1)
245 {
246 /* This block of code implements a tuned matmul, derived from
247 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
248
249 Bo Kagstrom and Per Ling
250 Department of Computing Science
251 Umea University
252 S-901 87 Umea, Sweden
253
254 from netlib.org, translated to C, and modified for matmul.m4. */
255
256 const GFC_INTEGER_2 *a, *b;
257 GFC_INTEGER_2 *c;
258 const index_type m = xcount, n = ycount, k = count;
259
260 /* System generated locals */
261 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
262 i1, i2, i3, i4, i5, i6;
263
264 /* Local variables */
265 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
266 f13, f14, f23, f24, f33, f34, f43, f44;
267 index_type i, j, l, ii, jj, ll;
268 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
269 GFC_INTEGER_2 *t1;
270
271 a = abase;
272 b = bbase;
273 c = retarray->base_addr;
274
275 /* Parameter adjustments */
276 c_dim1 = rystride;
277 c_offset = 1 + c_dim1;
278 c -= c_offset;
279 a_dim1 = aystride;
280 a_offset = 1 + a_dim1;
281 a -= a_offset;
282 b_dim1 = bystride;
283 b_offset = 1 + b_dim1;
284 b -= b_offset;
285
286 /* Empty c first. */
287 for (j=1; j<=n; j++)
288 for (i=1; i<=m; i++)
289 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
290
291 /* Early exit if possible */
292 if (m == 0 || n == 0 || k == 0)
293 return;
294
295 /* Adjust size of t1 to what is needed. */
296 index_type t1_dim, a_sz;
297 if (aystride == 1)
298 a_sz = rystride;
299 else
300 a_sz = a_dim1;
301
302 t1_dim = a_sz * 256 + b_dim1;
303 if (t1_dim > 65536)
304 t1_dim = 65536;
305
306 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
307
308 /* Start turning the crank. */
309 i1 = n;
310 for (jj = 1; jj <= i1; jj += 512)
311 {
312 /* Computing MIN */
313 i2 = 512;
314 i3 = n - jj + 1;
315 jsec = min(i2,i3);
316 ujsec = jsec - jsec % 4;
317 i2 = k;
318 for (ll = 1; ll <= i2; ll += 256)
319 {
320 /* Computing MIN */
321 i3 = 256;
322 i4 = k - ll + 1;
323 lsec = min(i3,i4);
324 ulsec = lsec - lsec % 2;
325
326 i3 = m;
327 for (ii = 1; ii <= i3; ii += 256)
328 {
329 /* Computing MIN */
330 i4 = 256;
331 i5 = m - ii + 1;
332 isec = min(i4,i5);
333 uisec = isec - isec % 2;
334 i4 = ll + ulsec - 1;
335 for (l = ll; l <= i4; l += 2)
336 {
337 i5 = ii + uisec - 1;
338 for (i = ii; i <= i5; i += 2)
339 {
340 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
341 a[i + l * a_dim1];
342 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
343 a[i + (l + 1) * a_dim1];
344 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
345 a[i + 1 + l * a_dim1];
346 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
347 a[i + 1 + (l + 1) * a_dim1];
348 }
349 if (uisec < isec)
350 {
351 t1[l - ll + 1 + (isec << 8) - 257] =
352 a[ii + isec - 1 + l * a_dim1];
353 t1[l - ll + 2 + (isec << 8) - 257] =
354 a[ii + isec - 1 + (l + 1) * a_dim1];
355 }
356 }
357 if (ulsec < lsec)
358 {
359 i4 = ii + isec - 1;
360 for (i = ii; i<= i4; ++i)
361 {
362 t1[lsec + ((i - ii + 1) << 8) - 257] =
363 a[i + (ll + lsec - 1) * a_dim1];
364 }
365 }
366
367 uisec = isec - isec % 4;
368 i4 = jj + ujsec - 1;
369 for (j = jj; j <= i4; j += 4)
370 {
371 i5 = ii + uisec - 1;
372 for (i = ii; i <= i5; i += 4)
373 {
374 f11 = c[i + j * c_dim1];
375 f21 = c[i + 1 + j * c_dim1];
376 f12 = c[i + (j + 1) * c_dim1];
377 f22 = c[i + 1 + (j + 1) * c_dim1];
378 f13 = c[i + (j + 2) * c_dim1];
379 f23 = c[i + 1 + (j + 2) * c_dim1];
380 f14 = c[i + (j + 3) * c_dim1];
381 f24 = c[i + 1 + (j + 3) * c_dim1];
382 f31 = c[i + 2 + j * c_dim1];
383 f41 = c[i + 3 + j * c_dim1];
384 f32 = c[i + 2 + (j + 1) * c_dim1];
385 f42 = c[i + 3 + (j + 1) * c_dim1];
386 f33 = c[i + 2 + (j + 2) * c_dim1];
387 f43 = c[i + 3 + (j + 2) * c_dim1];
388 f34 = c[i + 2 + (j + 3) * c_dim1];
389 f44 = c[i + 3 + (j + 3) * c_dim1];
390 i6 = ll + lsec - 1;
391 for (l = ll; l <= i6; ++l)
392 {
393 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
394 * b[l + j * b_dim1];
395 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
396 * b[l + j * b_dim1];
397 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
398 * b[l + (j + 1) * b_dim1];
399 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
400 * b[l + (j + 1) * b_dim1];
401 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
402 * b[l + (j + 2) * b_dim1];
403 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
404 * b[l + (j + 2) * b_dim1];
405 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
406 * b[l + (j + 3) * b_dim1];
407 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
408 * b[l + (j + 3) * b_dim1];
409 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
410 * b[l + j * b_dim1];
411 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
412 * b[l + j * b_dim1];
413 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
414 * b[l + (j + 1) * b_dim1];
415 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
416 * b[l + (j + 1) * b_dim1];
417 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
418 * b[l + (j + 2) * b_dim1];
419 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
420 * b[l + (j + 2) * b_dim1];
421 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
422 * b[l + (j + 3) * b_dim1];
423 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
424 * b[l + (j + 3) * b_dim1];
425 }
426 c[i + j * c_dim1] = f11;
427 c[i + 1 + j * c_dim1] = f21;
428 c[i + (j + 1) * c_dim1] = f12;
429 c[i + 1 + (j + 1) * c_dim1] = f22;
430 c[i + (j + 2) * c_dim1] = f13;
431 c[i + 1 + (j + 2) * c_dim1] = f23;
432 c[i + (j + 3) * c_dim1] = f14;
433 c[i + 1 + (j + 3) * c_dim1] = f24;
434 c[i + 2 + j * c_dim1] = f31;
435 c[i + 3 + j * c_dim1] = f41;
436 c[i + 2 + (j + 1) * c_dim1] = f32;
437 c[i + 3 + (j + 1) * c_dim1] = f42;
438 c[i + 2 + (j + 2) * c_dim1] = f33;
439 c[i + 3 + (j + 2) * c_dim1] = f43;
440 c[i + 2 + (j + 3) * c_dim1] = f34;
441 c[i + 3 + (j + 3) * c_dim1] = f44;
442 }
443 if (uisec < isec)
444 {
445 i5 = ii + isec - 1;
446 for (i = ii + uisec; i <= i5; ++i)
447 {
448 f11 = c[i + j * c_dim1];
449 f12 = c[i + (j + 1) * c_dim1];
450 f13 = c[i + (j + 2) * c_dim1];
451 f14 = c[i + (j + 3) * c_dim1];
452 i6 = ll + lsec - 1;
453 for (l = ll; l <= i6; ++l)
454 {
455 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
456 257] * b[l + j * b_dim1];
457 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
458 257] * b[l + (j + 1) * b_dim1];
459 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
460 257] * b[l + (j + 2) * b_dim1];
461 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
462 257] * b[l + (j + 3) * b_dim1];
463 }
464 c[i + j * c_dim1] = f11;
465 c[i + (j + 1) * c_dim1] = f12;
466 c[i + (j + 2) * c_dim1] = f13;
467 c[i + (j + 3) * c_dim1] = f14;
468 }
469 }
470 }
471 if (ujsec < jsec)
472 {
473 i4 = jj + jsec - 1;
474 for (j = jj + ujsec; j <= i4; ++j)
475 {
476 i5 = ii + uisec - 1;
477 for (i = ii; i <= i5; i += 4)
478 {
479 f11 = c[i + j * c_dim1];
480 f21 = c[i + 1 + j * c_dim1];
481 f31 = c[i + 2 + j * c_dim1];
482 f41 = c[i + 3 + j * c_dim1];
483 i6 = ll + lsec - 1;
484 for (l = ll; l <= i6; ++l)
485 {
486 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
487 257] * b[l + j * b_dim1];
488 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
489 257] * b[l + j * b_dim1];
490 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
491 257] * b[l + j * b_dim1];
492 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
493 257] * b[l + j * b_dim1];
494 }
495 c[i + j * c_dim1] = f11;
496 c[i + 1 + j * c_dim1] = f21;
497 c[i + 2 + j * c_dim1] = f31;
498 c[i + 3 + j * c_dim1] = f41;
499 }
500 i5 = ii + isec - 1;
501 for (i = ii + uisec; i <= i5; ++i)
502 {
503 f11 = c[i + j * c_dim1];
504 i6 = ll + lsec - 1;
505 for (l = ll; l <= i6; ++l)
506 {
507 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
508 257] * b[l + j * b_dim1];
509 }
510 c[i + j * c_dim1] = f11;
511 }
512 }
513 }
514 }
515 }
516 }
517 free(t1);
518 return;
519 }
520 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
521 {
522 if (GFC_DESCRIPTOR_RANK (a) != 1)
523 {
524 const GFC_INTEGER_2 *restrict abase_x;
525 const GFC_INTEGER_2 *restrict bbase_y;
526 GFC_INTEGER_2 *restrict dest_y;
527 GFC_INTEGER_2 s;
528
529 for (y = 0; y < ycount; y++)
530 {
531 bbase_y = &bbase[y*bystride];
532 dest_y = &dest[y*rystride];
533 for (x = 0; x < xcount; x++)
534 {
535 abase_x = &abase[x*axstride];
536 s = (GFC_INTEGER_2) 0;
537 for (n = 0; n < count; n++)
538 s += abase_x[n] * bbase_y[n];
539 dest_y[x] = s;
540 }
541 }
542 }
543 else
544 {
545 const GFC_INTEGER_2 *restrict bbase_y;
546 GFC_INTEGER_2 s;
547
548 for (y = 0; y < ycount; y++)
549 {
550 bbase_y = &bbase[y*bystride];
551 s = (GFC_INTEGER_2) 0;
552 for (n = 0; n < count; n++)
553 s += abase[n*axstride] * bbase_y[n];
554 dest[y*rystride] = s;
555 }
556 }
557 }
558 else if (axstride < aystride)
559 {
560 for (y = 0; y < ycount; y++)
561 for (x = 0; x < xcount; x++)
562 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
563
564 for (y = 0; y < ycount; y++)
565 for (n = 0; n < count; n++)
566 for (x = 0; x < xcount; x++)
567 /* dest[x,y] += a[x,n] * b[n,y] */
568 dest[x*rxstride + y*rystride] +=
569 abase[x*axstride + n*aystride] *
570 bbase[n*bxstride + y*bystride];
571 }
572 else if (GFC_DESCRIPTOR_RANK (a) == 1)
573 {
574 const GFC_INTEGER_2 *restrict bbase_y;
575 GFC_INTEGER_2 s;
576
577 for (y = 0; y < ycount; y++)
578 {
579 bbase_y = &bbase[y*bystride];
580 s = (GFC_INTEGER_2) 0;
581 for (n = 0; n < count; n++)
582 s += abase[n*axstride] * bbase_y[n*bxstride];
583 dest[y*rxstride] = s;
584 }
585 }
586 else
587 {
588 const GFC_INTEGER_2 *restrict abase_x;
589 const GFC_INTEGER_2 *restrict bbase_y;
590 GFC_INTEGER_2 *restrict dest_y;
591 GFC_INTEGER_2 s;
592
593 for (y = 0; y < ycount; y++)
594 {
595 bbase_y = &bbase[y*bystride];
596 dest_y = &dest[y*rystride];
597 for (x = 0; x < xcount; x++)
598 {
599 abase_x = &abase[x*axstride];
600 s = (GFC_INTEGER_2) 0;
601 for (n = 0; n < count; n++)
602 s += abase_x[n*aystride] * bbase_y[n*bxstride];
603 dest_y[x*rxstride] = s;
604 }
605 }
606 }
607 }
608 #undef POW3
609 #undef min
610 #undef max
611
612 #endif
613
614 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
615 void
616 matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
617 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
618 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
619 internal_proto(matmul_i2_avx128_fma4);
620 void
621 matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
622 gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
623 int blas_limit, blas_call gemm)
624 {
625 const GFC_INTEGER_2 * restrict abase;
626 const GFC_INTEGER_2 * restrict bbase;
627 GFC_INTEGER_2 * restrict dest;
628
629 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
630 index_type x, y, n, count, xcount, ycount;
631
632 assert (GFC_DESCRIPTOR_RANK (a) == 2
633 || GFC_DESCRIPTOR_RANK (b) == 2);
634
635 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
636
637 Either A or B (but not both) can be rank 1:
638
639 o One-dimensional argument A is implicitly treated as a row matrix
640 dimensioned [1,count], so xcount=1.
641
642 o One-dimensional argument B is implicitly treated as a column matrix
643 dimensioned [count, 1], so ycount=1.
644 */
645
646 if (retarray->base_addr == NULL)
647 {
648 if (GFC_DESCRIPTOR_RANK (a) == 1)
649 {
650 GFC_DIMENSION_SET(retarray->dim[0], 0,
651 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
652 }
653 else if (GFC_DESCRIPTOR_RANK (b) == 1)
654 {
655 GFC_DIMENSION_SET(retarray->dim[0], 0,
656 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
657 }
658 else
659 {
660 GFC_DIMENSION_SET(retarray->dim[0], 0,
661 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
662
663 GFC_DIMENSION_SET(retarray->dim[1], 0,
664 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
665 GFC_DESCRIPTOR_EXTENT(retarray,0));
666 }
667
668 retarray->base_addr
669 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
670 retarray->offset = 0;
671 }
672 else if (unlikely (compile_options.bounds_check))
673 {
674 index_type ret_extent, arg_extent;
675
676 if (GFC_DESCRIPTOR_RANK (a) == 1)
677 {
678 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
679 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
680 if (arg_extent != ret_extent)
681 runtime_error ("Array bound mismatch for dimension 1 of "
682 "array (%ld/%ld) ",
683 (long int) ret_extent, (long int) arg_extent);
684 }
685 else if (GFC_DESCRIPTOR_RANK (b) == 1)
686 {
687 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
688 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
689 if (arg_extent != ret_extent)
690 runtime_error ("Array bound mismatch for dimension 1 of "
691 "array (%ld/%ld) ",
692 (long int) ret_extent, (long int) arg_extent);
693 }
694 else
695 {
696 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
697 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
698 if (arg_extent != ret_extent)
699 runtime_error ("Array bound mismatch for dimension 1 of "
700 "array (%ld/%ld) ",
701 (long int) ret_extent, (long int) arg_extent);
702
703 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
704 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
705 if (arg_extent != ret_extent)
706 runtime_error ("Array bound mismatch for dimension 2 of "
707 "array (%ld/%ld) ",
708 (long int) ret_extent, (long int) arg_extent);
709 }
710 }
711
712
713 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
714 {
715 /* One-dimensional result may be addressed in the code below
716 either as a row or a column matrix. We want both cases to
717 work. */
718 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
719 }
720 else
721 {
722 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
723 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
724 }
725
726
727 if (GFC_DESCRIPTOR_RANK (a) == 1)
728 {
729 /* Treat it as a a row matrix A[1,count]. */
730 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
731 aystride = 1;
732
733 xcount = 1;
734 count = GFC_DESCRIPTOR_EXTENT(a,0);
735 }
736 else
737 {
738 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
739 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
740
741 count = GFC_DESCRIPTOR_EXTENT(a,1);
742 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
743 }
744
745 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
746 {
747 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
748 runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
749 "in dimension 1: is %ld, should be %ld",
750 (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
751 }
752
753 if (GFC_DESCRIPTOR_RANK (b) == 1)
754 {
755 /* Treat it as a column matrix B[count,1] */
756 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
757
758 /* bystride should never be used for 1-dimensional b.
759 The value is only used for calculation of the
760 memory by the buffer. */
761 bystride = 256;
762 ycount = 1;
763 }
764 else
765 {
766 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
767 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
768 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
769 }
770
771 abase = a->base_addr;
772 bbase = b->base_addr;
773 dest = retarray->base_addr;
774
775 /* Now that everything is set up, we perform the multiplication
776 itself. */
777
778 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
779 #define min(a,b) ((a) <= (b) ? (a) : (b))
780 #define max(a,b) ((a) >= (b) ? (a) : (b))
781
782 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
783 && (bxstride == 1 || bystride == 1)
784 && (((float) xcount) * ((float) ycount) * ((float) count)
785 > POW3(blas_limit)))
786 {
787 const int m = xcount, n = ycount, k = count, ldc = rystride;
788 const GFC_INTEGER_2 one = 1, zero = 0;
789 const int lda = (axstride == 1) ? aystride : axstride,
790 ldb = (bxstride == 1) ? bystride : bxstride;
791
792 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
793 {
794 assert (gemm != NULL);
795 const char *transa, *transb;
796 if (try_blas & 2)
797 transa = "C";
798 else
799 transa = axstride == 1 ? "N" : "T";
800
801 if (try_blas & 4)
802 transb = "C";
803 else
804 transb = bxstride == 1 ? "N" : "T";
805
806 gemm (transa, transb , &m,
807 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
808 &ldc, 1, 1);
809 return;
810 }
811 }
812
813 if (rxstride == 1 && axstride == 1 && bxstride == 1)
814 {
815 /* This block of code implements a tuned matmul, derived from
816 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
817
818 Bo Kagstrom and Per Ling
819 Department of Computing Science
820 Umea University
821 S-901 87 Umea, Sweden
822
823 from netlib.org, translated to C, and modified for matmul.m4. */
824
825 const GFC_INTEGER_2 *a, *b;
826 GFC_INTEGER_2 *c;
827 const index_type m = xcount, n = ycount, k = count;
828
829 /* System generated locals */
830 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
831 i1, i2, i3, i4, i5, i6;
832
833 /* Local variables */
834 GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
835 f13, f14, f23, f24, f33, f34, f43, f44;
836 index_type i, j, l, ii, jj, ll;
837 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
838 GFC_INTEGER_2 *t1;
839
840 a = abase;
841 b = bbase;
842 c = retarray->base_addr;
843
844 /* Parameter adjustments */
845 c_dim1 = rystride;
846 c_offset = 1 + c_dim1;
847 c -= c_offset;
848 a_dim1 = aystride;
849 a_offset = 1 + a_dim1;
850 a -= a_offset;
851 b_dim1 = bystride;
852 b_offset = 1 + b_dim1;
853 b -= b_offset;
854
855 /* Empty c first. */
856 for (j=1; j<=n; j++)
857 for (i=1; i<=m; i++)
858 c[i + j * c_dim1] = (GFC_INTEGER_2)0;
859
860 /* Early exit if possible */
861 if (m == 0 || n == 0 || k == 0)
862 return;
863
864 /* Adjust size of t1 to what is needed. */
865 index_type t1_dim, a_sz;
866 if (aystride == 1)
867 a_sz = rystride;
868 else
869 a_sz = a_dim1;
870
871 t1_dim = a_sz * 256 + b_dim1;
872 if (t1_dim > 65536)
873 t1_dim = 65536;
874
875 t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
876
877 /* Start turning the crank. */
878 i1 = n;
879 for (jj = 1; jj <= i1; jj += 512)
880 {
881 /* Computing MIN */
882 i2 = 512;
883 i3 = n - jj + 1;
884 jsec = min(i2,i3);
885 ujsec = jsec - jsec % 4;
886 i2 = k;
887 for (ll = 1; ll <= i2; ll += 256)
888 {
889 /* Computing MIN */
890 i3 = 256;
891 i4 = k - ll + 1;
892 lsec = min(i3,i4);
893 ulsec = lsec - lsec % 2;
894
895 i3 = m;
896 for (ii = 1; ii <= i3; ii += 256)
897 {
898 /* Computing MIN */
899 i4 = 256;
900 i5 = m - ii + 1;
901 isec = min(i4,i5);
902 uisec = isec - isec % 2;
903 i4 = ll + ulsec - 1;
904 for (l = ll; l <= i4; l += 2)
905 {
906 i5 = ii + uisec - 1;
907 for (i = ii; i <= i5; i += 2)
908 {
909 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
910 a[i + l * a_dim1];
911 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
912 a[i + (l + 1) * a_dim1];
913 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
914 a[i + 1 + l * a_dim1];
915 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
916 a[i + 1 + (l + 1) * a_dim1];
917 }
918 if (uisec < isec)
919 {
920 t1[l - ll + 1 + (isec << 8) - 257] =
921 a[ii + isec - 1 + l * a_dim1];
922 t1[l - ll + 2 + (isec << 8) - 257] =
923 a[ii + isec - 1 + (l + 1) * a_dim1];
924 }
925 }
926 if (ulsec < lsec)
927 {
928 i4 = ii + isec - 1;
929 for (i = ii; i<= i4; ++i)
930 {
931 t1[lsec + ((i - ii + 1) << 8) - 257] =
932 a[i + (ll + lsec - 1) * a_dim1];
933 }
934 }
935
936 uisec = isec - isec % 4;
937 i4 = jj + ujsec - 1;
938 for (j = jj; j <= i4; j += 4)
939 {
940 i5 = ii + uisec - 1;
941 for (i = ii; i <= i5; i += 4)
942 {
943 f11 = c[i + j * c_dim1];
944 f21 = c[i + 1 + j * c_dim1];
945 f12 = c[i + (j + 1) * c_dim1];
946 f22 = c[i + 1 + (j + 1) * c_dim1];
947 f13 = c[i + (j + 2) * c_dim1];
948 f23 = c[i + 1 + (j + 2) * c_dim1];
949 f14 = c[i + (j + 3) * c_dim1];
950 f24 = c[i + 1 + (j + 3) * c_dim1];
951 f31 = c[i + 2 + j * c_dim1];
952 f41 = c[i + 3 + j * c_dim1];
953 f32 = c[i + 2 + (j + 1) * c_dim1];
954 f42 = c[i + 3 + (j + 1) * c_dim1];
955 f33 = c[i + 2 + (j + 2) * c_dim1];
956 f43 = c[i + 3 + (j + 2) * c_dim1];
957 f34 = c[i + 2 + (j + 3) * c_dim1];
958 f44 = c[i + 3 + (j + 3) * c_dim1];
959 i6 = ll + lsec - 1;
960 for (l = ll; l <= i6; ++l)
961 {
962 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
963 * b[l + j * b_dim1];
964 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
965 * b[l + j * b_dim1];
966 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
967 * b[l + (j + 1) * b_dim1];
968 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
969 * b[l + (j + 1) * b_dim1];
970 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
971 * b[l + (j + 2) * b_dim1];
972 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
973 * b[l + (j + 2) * b_dim1];
974 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + (j + 3) * b_dim1];
976 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + (j + 3) * b_dim1];
978 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
979 * b[l + j * b_dim1];
980 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
981 * b[l + j * b_dim1];
982 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
983 * b[l + (j + 1) * b_dim1];
984 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
985 * b[l + (j + 1) * b_dim1];
986 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
987 * b[l + (j + 2) * b_dim1];
988 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
989 * b[l + (j + 2) * b_dim1];
990 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + (j + 3) * b_dim1];
992 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + (j + 3) * b_dim1];
994 }
995 c[i + j * c_dim1] = f11;
996 c[i + 1 + j * c_dim1] = f21;
997 c[i + (j + 1) * c_dim1] = f12;
998 c[i + 1 + (j + 1) * c_dim1] = f22;
999 c[i + (j + 2) * c_dim1] = f13;
1000 c[i + 1 + (j + 2) * c_dim1] = f23;
1001 c[i + (j + 3) * c_dim1] = f14;
1002 c[i + 1 + (j + 3) * c_dim1] = f24;
1003 c[i + 2 + j * c_dim1] = f31;
1004 c[i + 3 + j * c_dim1] = f41;
1005 c[i + 2 + (j + 1) * c_dim1] = f32;
1006 c[i + 3 + (j + 1) * c_dim1] = f42;
1007 c[i + 2 + (j + 2) * c_dim1] = f33;
1008 c[i + 3 + (j + 2) * c_dim1] = f43;
1009 c[i + 2 + (j + 3) * c_dim1] = f34;
1010 c[i + 3 + (j + 3) * c_dim1] = f44;
1011 }
1012 if (uisec < isec)
1013 {
1014 i5 = ii + isec - 1;
1015 for (i = ii + uisec; i <= i5; ++i)
1016 {
1017 f11 = c[i + j * c_dim1];
1018 f12 = c[i + (j + 1) * c_dim1];
1019 f13 = c[i + (j + 2) * c_dim1];
1020 f14 = c[i + (j + 3) * c_dim1];
1021 i6 = ll + lsec - 1;
1022 for (l = ll; l <= i6; ++l)
1023 {
1024 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1025 257] * b[l + j * b_dim1];
1026 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027 257] * b[l + (j + 1) * b_dim1];
1028 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029 257] * b[l + (j + 2) * b_dim1];
1030 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031 257] * b[l + (j + 3) * b_dim1];
1032 }
1033 c[i + j * c_dim1] = f11;
1034 c[i + (j + 1) * c_dim1] = f12;
1035 c[i + (j + 2) * c_dim1] = f13;
1036 c[i + (j + 3) * c_dim1] = f14;
1037 }
1038 }
1039 }
1040 if (ujsec < jsec)
1041 {
1042 i4 = jj + jsec - 1;
1043 for (j = jj + ujsec; j <= i4; ++j)
1044 {
1045 i5 = ii + uisec - 1;
1046 for (i = ii; i <= i5; i += 4)
1047 {
1048 f11 = c[i + j * c_dim1];
1049 f21 = c[i + 1 + j * c_dim1];
1050 f31 = c[i + 2 + j * c_dim1];
1051 f41 = c[i + 3 + j * c_dim1];
1052 i6 = ll + lsec - 1;
1053 for (l = ll; l <= i6; ++l)
1054 {
1055 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1056 257] * b[l + j * b_dim1];
1057 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1058 257] * b[l + j * b_dim1];
1059 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1060 257] * b[l + j * b_dim1];
1061 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1062 257] * b[l + j * b_dim1];
1063 }
1064 c[i + j * c_dim1] = f11;
1065 c[i + 1 + j * c_dim1] = f21;
1066 c[i + 2 + j * c_dim1] = f31;
1067 c[i + 3 + j * c_dim1] = f41;
1068 }
1069 i5 = ii + isec - 1;
1070 for (i = ii + uisec; i <= i5; ++i)
1071 {
1072 f11 = c[i + j * c_dim1];
1073 i6 = ll + lsec - 1;
1074 for (l = ll; l <= i6; ++l)
1075 {
1076 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1077 257] * b[l + j * b_dim1];
1078 }
1079 c[i + j * c_dim1] = f11;
1080 }
1081 }
1082 }
1083 }
1084 }
1085 }
1086 free(t1);
1087 return;
1088 }
1089 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1090 {
1091 if (GFC_DESCRIPTOR_RANK (a) != 1)
1092 {
1093 const GFC_INTEGER_2 *restrict abase_x;
1094 const GFC_INTEGER_2 *restrict bbase_y;
1095 GFC_INTEGER_2 *restrict dest_y;
1096 GFC_INTEGER_2 s;
1097
1098 for (y = 0; y < ycount; y++)
1099 {
1100 bbase_y = &bbase[y*bystride];
1101 dest_y = &dest[y*rystride];
1102 for (x = 0; x < xcount; x++)
1103 {
1104 abase_x = &abase[x*axstride];
1105 s = (GFC_INTEGER_2) 0;
1106 for (n = 0; n < count; n++)
1107 s += abase_x[n] * bbase_y[n];
1108 dest_y[x] = s;
1109 }
1110 }
1111 }
1112 else
1113 {
1114 const GFC_INTEGER_2 *restrict bbase_y;
1115 GFC_INTEGER_2 s;
1116
1117 for (y = 0; y < ycount; y++)
1118 {
1119 bbase_y = &bbase[y*bystride];
1120 s = (GFC_INTEGER_2) 0;
1121 for (n = 0; n < count; n++)
1122 s += abase[n*axstride] * bbase_y[n];
1123 dest[y*rystride] = s;
1124 }
1125 }
1126 }
1127 else if (axstride < aystride)
1128 {
1129 for (y = 0; y < ycount; y++)
1130 for (x = 0; x < xcount; x++)
1131 dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
1132
1133 for (y = 0; y < ycount; y++)
1134 for (n = 0; n < count; n++)
1135 for (x = 0; x < xcount; x++)
1136 /* dest[x,y] += a[x,n] * b[n,y] */
1137 dest[x*rxstride + y*rystride] +=
1138 abase[x*axstride + n*aystride] *
1139 bbase[n*bxstride + y*bystride];
1140 }
1141 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1142 {
1143 const GFC_INTEGER_2 *restrict bbase_y;
1144 GFC_INTEGER_2 s;
1145
1146 for (y = 0; y < ycount; y++)
1147 {
1148 bbase_y = &bbase[y*bystride];
1149 s = (GFC_INTEGER_2) 0;
1150 for (n = 0; n < count; n++)
1151 s += abase[n*axstride] * bbase_y[n*bxstride];
1152 dest[y*rxstride] = s;
1153 }
1154 }
1155 else
1156 {
1157 const GFC_INTEGER_2 *restrict abase_x;
1158 const GFC_INTEGER_2 *restrict bbase_y;
1159 GFC_INTEGER_2 *restrict dest_y;
1160 GFC_INTEGER_2 s;
1161
1162 for (y = 0; y < ycount; y++)
1163 {
1164 bbase_y = &bbase[y*bystride];
1165 dest_y = &dest[y*rystride];
1166 for (x = 0; x < xcount; x++)
1167 {
1168 abase_x = &abase[x*axstride];
1169 s = (GFC_INTEGER_2) 0;
1170 for (n = 0; n < count; n++)
1171 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1172 dest_y[x*rxstride] = s;
1173 }
1174 }
1175 }
1176 }
1177 #undef POW3
1178 #undef min
1179 #undef max
1180
1181 #endif
1182
1183 #endif
1184