From: Thomas Koenig Date: Tue, 24 Sep 2024 19:51:42 +0000 (+0200) Subject: Implement MATMUL and DOT_PRODUCT for unsigned. X-Git-Tag: basepoints/gcc-16~5712 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5d98fe096b5d17021875806ffc32ba41ea0e87b0;p=thirdparty%2Fgcc.git Implement MATMUL and DOT_PRODUCT for unsigned. gcc/fortran/ChangeLog: * arith.cc (gfc_arith_uminus): Fix warning. (gfc_arith_minus): Correctly truncate unsigneds. * check.cc (gfc_check_dot_product): Handle unsigned arguments. (gfc_check_matmul): Likewise. * expr.cc (gfc_get_unsigned_expr): New function. * gfortran.h (gfc_get_unsigned_expr): Add prototype. * iresolve.cc (gfc_resolve_matmul): If using UNSIGNED, use the signed integer version. * gfortran.texi: Document MATMUL and DOT_PRODUCT for unsigned. * simplify.cc (compute_dot_product): Handle unsigneds. libgfortran/ChangeLog: * m4/iparm.m4: Add UNSIGNED if type is m. * m4/matmul.m4: If type is GFC_INTEGER, use GFC_UINTEGER instead. Whitespace fixes. * m4/matmul_internal.m4: Whitespace fixes. * generated/matmul_c10.c: Regenerated. * generated/matmul_c16.c: Regenerated. * generated/matmul_c17.c: Regenerated. * generated/matmul_c4.c: Regenerated. * generated/matmul_c8.c: Regenerated. * generated/matmul_i1.c: Regenerated. * generated/matmul_i16.c: Regenerated. * generated/matmul_i2.c: Regenerated. * generated/matmul_i4.c: Regenerated. * generated/matmul_i8.c: Regenerated. * generated/matmul_r10.c: Regenerated. * generated/matmul_r16.c: Regenerated. * generated/matmul_r17.c: Regenerated. * generated/matmul_r4.c: Regenerated. * generated/matmul_r8.c: Regenerated. * libgfortran.h: Add array types for unsigned. gcc/testsuite/ChangeLog: * gfortran.dg/unsigned_25.f90: New test. * gfortran.dg/unsigned_26.f90: New test. --- diff --git a/gcc/fortran/arith.cc b/gcc/fortran/arith.cc index 66a3635404a..a214b8bc1b3 100644 --- a/gcc/fortran/arith.cc +++ b/gcc/fortran/arith.cc @@ -711,17 +711,9 @@ gfc_arith_uminus (gfc_expr *op1, gfc_expr **resultp) case BT_UNSIGNED: { if (pedantic) - return ARITH_UNSIGNED_NEGATIVE; + return check_result (ARITH_UNSIGNED_NEGATIVE, op1, result, resultp); - arith neg_rc; mpz_neg (result->value.integer, op1->value.integer); - neg_rc = gfc_range_check (result); - if (neg_rc != ARITH_OK) - gfc_warning (0, gfc_arith_error (neg_rc), &result->where); - - gfc_reduce_unsigned (result); - if (pedantic) - rc = neg_rc; } break; @@ -738,7 +730,15 @@ gfc_arith_uminus (gfc_expr *op1, gfc_expr **resultp) } rc = gfc_range_check (result); - + if (op1->ts.type == BT_UNSIGNED) + { + if (rc != ARITH_OK) + { + gfc_warning (0, gfc_arith_error (rc), &op1->where); + rc = ARITH_OK; + } + gfc_reduce_unsigned (result); + } return check_result (rc, op1, result, resultp); } @@ -799,8 +799,12 @@ gfc_arith_minus (gfc_expr *op1, gfc_expr *op2, gfc_expr **resultp) switch (op1->ts.type) { case BT_INTEGER: + mpz_sub (result->value.integer, op1->value.integer, op2->value.integer); + break; + case BT_UNSIGNED: mpz_sub (result->value.integer, op1->value.integer, op2->value.integer); + gfc_reduce_unsigned (result); break; case BT_REAL: diff --git a/gcc/fortran/check.cc b/gcc/fortran/check.cc index cfafdb7974f..7c630dd73f4 100644 --- a/gcc/fortran/check.cc +++ b/gcc/fortran/check.cc @@ -2804,6 +2804,10 @@ gfc_check_dot_product (gfc_expr *vector_a, gfc_expr *vector_b) return false; break; + case BT_UNSIGNED: + /* Check comes later. 
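+	 The invalid combination of UNSIGNED with another type is
+	 rejected below by gfc_invalid_unsigned_ops, once both
+	 argument types are known. 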
*/ + break; + + default: gfc_error ("%qs argument of %qs intrinsic at %L must be numeric " "or LOGICAL", gfc_current_intrinsic_arg[0]->name, @@ -2811,6 +2815,14 @@ gfc_check_dot_product (gfc_expr *vector_a, gfc_expr *vector_b) return false; } + if (gfc_invalid_unsigned_ops (vector_a, vector_b)) + { + gfc_error ("Argument types of %qs intrinsic at %L must match (%s/%s)", + gfc_current_intrinsic, &vector_a->where, + gfc_typename(&vector_a->ts), gfc_typename(&vector_b->ts)); + return false; + } + if (!rank_check (vector_a, 0, 1)) return false; @@ -4092,7 +4104,8 @@ gfc_check_matmul (gfc_expr *matrix_a, gfc_expr *matrix_b) } if ((matrix_a->ts.type == BT_LOGICAL && gfc_numeric_ts (&matrix_b->ts)) - || (gfc_numeric_ts (&matrix_a->ts) && matrix_b->ts.type == BT_LOGICAL)) + || (gfc_numeric_ts (&matrix_a->ts) && matrix_b->ts.type == BT_LOGICAL) + || gfc_invalid_unsigned_ops (matrix_a, matrix_b)) { gfc_error ("Argument types of %qs intrinsic at %L must match (%s/%s)", gfc_current_intrinsic, &matrix_a->where, diff --git a/gcc/fortran/expr.cc b/gcc/fortran/expr.cc index 81c641e2322..36baa9bb4c8 100644 --- a/gcc/fortran/expr.cc +++ b/gcc/fortran/expr.cc @@ -224,6 +224,19 @@ gfc_get_int_expr (int kind, locus *where, HOST_WIDE_INT value) return p; } +/* Get a new expression node that is an unsigned constant. */ + +gfc_expr * +gfc_get_unsigned_expr (int kind, locus *where, HOST_WIDE_INT value) +{ + gfc_expr *p; + p = gfc_get_constant_expr (BT_UNSIGNED, kind, + where ? where : &gfc_current_locus); + const wide_int w = wi::shwi (value, kind * BITS_PER_UNIT); + wi::to_mpz (w, p->value.integer, UNSIGNED); + + return p; +} /* Get a new expression node that is a logical constant. */ diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h index 66c9736122a..917866a7ef0 100644 --- a/gcc/fortran/gfortran.h +++ b/gcc/fortran/gfortran.h @@ -3794,6 +3794,7 @@ gfc_expr *gfc_get_structure_constructor_expr (bt, int, locus *); gfc_expr *gfc_get_constant_expr (bt, int, locus *); gfc_expr *gfc_get_character_expr (int, locus *, const char *, gfc_charlen_t len); gfc_expr *gfc_get_int_expr (int, locus *, HOST_WIDE_INT); +gfc_expr *gfc_get_unsigned_expr (int, locus *, HOST_WIDE_INT); gfc_expr *gfc_get_logical_expr (int, locus *, bool); gfc_expr *gfc_get_iokind_expr (locus *, io_kind); diff --git a/gcc/fortran/gfortran.texi b/gcc/fortran/gfortran.texi index 60c93d7fe54..829ab00c665 100644 --- a/gcc/fortran/gfortran.texi +++ b/gcc/fortran/gfortran.texi @@ -2788,6 +2788,7 @@ As of now, the following intrinsics take unsigned arguments: @item @code{MVBITS} @item @code{RANGE} @item @code{TRANSFER} +@item @code{MATMUL} and @code{DOT_PRODUCT} @end itemize This list will grow in the near future. @c --------------------------------------------------------------------- diff --git a/gcc/fortran/iresolve.cc b/gcc/fortran/iresolve.cc index 4f1fa977f6a..32b31432e58 100644 --- a/gcc/fortran/iresolve.cc +++ b/gcc/fortran/iresolve.cc @@ -1600,6 +1600,7 @@ void gfc_resolve_matmul (gfc_expr *f, gfc_expr *a, gfc_expr *b) { gfc_expr temp; + bt type; if (a->ts.type == BT_LOGICAL && b->ts.type == BT_LOGICAL) { @@ -1648,8 +1649,16 @@ gfc_resolve_matmul (gfc_expr *f, gfc_expr *a, gfc_expr *b) } } + /* We use the same library version of matmul for INTEGER and UNSIGNED, + which we call the INTEGER version. 
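+     Because the library routines do their arithmetic in the
+     corresponding GFC_UINTEGER_* type internally (see m4/matmul.m4),
+     wraparound is well defined and the same code produces correct
+     results for both signed and unsigned arguments. 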
*/ + + if (f->ts.type == BT_UNSIGNED) + type = BT_INTEGER; + else + type = f->ts.type; + f->value.function.name - = gfc_get_string (PREFIX ("matmul_%c%d"), gfc_type_letter (f->ts.type), + = gfc_get_string (PREFIX ("matmul_%c%d"), gfc_type_letter (type), gfc_type_abi_kind (&f->ts)); } diff --git a/gcc/fortran/simplify.cc b/gcc/fortran/simplify.cc index febf60e4d31..83d0fdc9ea9 100644 --- a/gcc/fortran/simplify.cc +++ b/gcc/fortran/simplify.cc @@ -420,13 +420,20 @@ compute_dot_product (gfc_expr *matrix_a, int stride_a, int offset_a, { gfc_expr *result, *a, *b, *c; - /* Set result to an INTEGER(1) 0 for numeric types and .false. for + /* Set result to an UNSIGNED of correct kind for unsigned, + INTEGER(1) 0 for other numeric types, and .false. for LOGICAL. Mixed-mode math in the loop will promote result to the correct type and kind. */ if (matrix_a->ts.type == BT_LOGICAL) result = gfc_get_logical_expr (gfc_default_logical_kind, NULL, false); + else if (matrix_a->ts.type == BT_UNSIGNED) + { + int kind = MAX (matrix_a->ts.kind, matrix_b->ts.kind); + result = gfc_get_unsigned_expr (kind, NULL, 0); + } else result = gfc_get_int_expr (1, NULL, 0); + result->where = matrix_a->where; a = gfc_constructor_lookup_expr (matrix_a->value.constructor, offset_a); @@ -446,6 +453,7 @@ compute_dot_product (gfc_expr *matrix_a, int stride_a, int offset_a, case BT_INTEGER: case BT_REAL: case BT_COMPLEX: + case BT_UNSIGNED: if (conj_a && a->ts.type == BT_COMPLEX) c = gfc_simplify_conjg (a); else diff --git a/gcc/testsuite/gfortran.dg/unsigned_25.f90 b/gcc/testsuite/gfortran.dg/unsigned_25.f90 new file mode 100644 index 00000000000..f6144988d82 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/unsigned_25.f90 @@ -0,0 +1,35 @@ +! { dg-do run } +! { dg-options "-funsigned" } +! Test matrix multiplication +program memain + implicit none + call test1 + call test2 +contains + subroutine test1 + integer, parameter :: n = 10, m = 28 + unsigned, dimension(n,n) :: u, v, w + integer(kind=8), dimension(n,n) :: i, j, k + real(8), dimension(n,n) :: a, b + + call random_number(a) + call random_number(b) + u = uint(a*2.0**m) + v = uint(b*2.0**m) + i = int(a*2.0**m,8) + j = int(b*2.0**m,8) + w = matmul(u,v) + k = mod(matmul(i,j),2_8**32) + if (any(uint(k) /= w)) error stop 1 + end subroutine test1 + subroutine test2 + unsigned, parameter :: u(3,3) = reshape ([1u, uint(-2), 3u, uint(-4), & + 5u, uint(-6), 7u, uint(-8), 9u],[3,3]) + unsigned, parameter :: v(3,3) = 1u - u + unsigned, parameter :: w(3,3) = matmul(u,v) + integer(kind=16), dimension(3,3), parameter :: & + i = int(u,16), j = int(v,16) + integer(kind=16), dimension(3,3) :: k = matmul(i,j) + if (any(uint(k) /= w)) error stop 2 + end subroutine test2 +end program memain diff --git a/gcc/testsuite/gfortran.dg/unsigned_26.f90 b/gcc/testsuite/gfortran.dg/unsigned_26.f90 new file mode 100644 index 00000000000..b8bad9dcd32 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/unsigned_26.f90 @@ -0,0 +1,40 @@ +! { dg-do run } +! { dg-options "-funsigned" } +! 
Test dot_product both at run time and at compile time +program memain + call test1 + call test2 +contains + subroutine test1 + integer, parameter :: n = 10 + real(8), dimension(n) :: a, b + unsigned, dimension(n) :: u, v + integer(8), dimension(n) :: i, j + unsigned :: res_u + integer(8) :: res_i + integer :: k + + do k=1,10 + call random_number(a) + call random_number(b) + u = uint(a*2**32) + v = uint(b*2**32) + i = int(u,8) + j = int(v,8) + res_u = dot_product(u,v) + res_i = dot_product(i,j) + if (res_u /= uint(res_i)) error stop 1 + end do + end subroutine test1 + + subroutine test2 + integer, parameter :: n = 5 + unsigned, parameter, dimension(n) :: & + u = [1149221887u, 214388752u, 724301838u, 1618160523u, 1783282425u], & + v = [1428464973u, 1887264271u, 1830319906u, 983537781u, 13514400u] + integer(8), parameter, dimension(n) :: i = int(u,8), j=int(v,8) + unsigned, parameter :: res_1 = dot_product(u,v) + integer(8), parameter :: res_2 = dot_product(i,j) + if (res_1 /= uint(res_2)) error stop 2 + end subroutine test2 +end program diff --git a/libgfortran/generated/matmul_c10.c b/libgfortran/generated/matmul_c10.c index c3dbb6d7b0f..54a8364436c 100644 --- a/libgfortran/generated/matmul_c10.c +++ b/libgfortran/generated/matmul_c10.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *, - const int *, const GFC_COMPLEX_10 *, const int *, - const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *, - int, int); + const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *, + const int *, const GFC_COMPLEX_10 *, const int *, + const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_c10 (gfc_array_c10 * const restrict retarray, +extern void matmul_c10 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_c10); @@ -80,11 +80,11 @@ export_proto(matmul_c10); #ifdef HAVE_AVX static void -matmul_c10_avx (gfc_array_c10 * const restrict retarray, +matmul_c10_avx (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_c10_avx (gfc_array_c10 * const restrict retarray, +matmul_c10_avx (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_c10_avx (gfc_array_c10 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, +matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, +matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, +matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, +matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray, +matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_c10_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_c10_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray, +matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_c10_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_c10_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. 
*/ static void -matmul_c10_vanilla (gfc_array_c10 * const restrict retarray, +matmul_c10_vanilla (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_c10_vanilla (gfc_array_c10 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_c10 (gfc_array_c10 * const restrict retarray, +void matmul_c10 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_c10 * const restrict retarray, + static void (*matmul_p) (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_c10 * const restrict retarray, + void (*matmul_fn) (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_c10 (gfc_array_c10 * const restrict retarray, +matmul_c10 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_c16.c b/libgfortran/generated/matmul_c16.c index 230f17d6e3f..fce4ce295f5 100644 --- a/libgfortran/generated/matmul_c16.c +++ b/libgfortran/generated/matmul_c16.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *, - const int *, const GFC_COMPLEX_16 *, const int *, - const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *, - int, int); + const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *, + const int *, const GFC_COMPLEX_16 *, const int *, + const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_c16 (gfc_array_c16 * const restrict retarray, +extern void matmul_c16 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_c16); @@ -80,11 +80,11 @@ export_proto(matmul_c16); #ifdef HAVE_AVX static void -matmul_c16_avx (gfc_array_c16 * const restrict retarray, +matmul_c16_avx (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_c16_avx (gfc_array_c16 * const restrict retarray, +matmul_c16_avx (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_c16_avx (gfc_array_c16 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, +matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, +matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, +matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, +matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray, +matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_c16_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_c16_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray, +matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_c16_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_c16_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. 
*/ static void -matmul_c16_vanilla (gfc_array_c16 * const restrict retarray, +matmul_c16_vanilla (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_c16_vanilla (gfc_array_c16 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_c16 (gfc_array_c16 * const restrict retarray, +void matmul_c16 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_c16 * const restrict retarray, + static void (*matmul_p) (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_c16 * const restrict retarray, + void (*matmul_fn) (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_c16 (gfc_array_c16 * const restrict retarray, +matmul_c16 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_c17.c b/libgfortran/generated/matmul_c17.c index cbfd25d2725..aee0d5a15f9 100644 --- a/libgfortran/generated/matmul_c17.c +++ b/libgfortran/generated/matmul_c17.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_COMPLEX_17 *, const GFC_COMPLEX_17 *, - const int *, const GFC_COMPLEX_17 *, const int *, - const GFC_COMPLEX_17 *, GFC_COMPLEX_17 *, const int *, - int, int); + const int *, const GFC_COMPLEX_17 *, const GFC_COMPLEX_17 *, + const int *, const GFC_COMPLEX_17 *, const int *, + const GFC_COMPLEX_17 *, GFC_COMPLEX_17 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_c17 (gfc_array_c17 * const restrict retarray, +extern void matmul_c17 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_c17); @@ -80,11 +80,11 @@ export_proto(matmul_c17); #ifdef HAVE_AVX static void -matmul_c17_avx (gfc_array_c17 * const restrict retarray, +matmul_c17_avx (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_c17_avx (gfc_array_c17 * const restrict retarray, +matmul_c17_avx (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_c17_avx (gfc_array_c17 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_c17_avx2 (gfc_array_c17 * const restrict retarray, +matmul_c17_avx2 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_c17_avx2 (gfc_array_c17 * const restrict retarray, +matmul_c17_avx2 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_c17_avx2 (gfc_array_c17 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_c17_avx512f (gfc_array_c17 * const restrict retarray, +matmul_c17_avx512f (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_c17_avx512f (gfc_array_c17 * const restrict retarray, +matmul_c17_avx512f (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_c17_avx512f (gfc_array_c17 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_c17_avx128_fma3 (gfc_array_c17 * const restrict retarray, +matmul_c17_avx128_fma3 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_c17_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_c17_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_c17_avx128_fma4 (gfc_array_c17 * const restrict retarray, +matmul_c17_avx128_fma4 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_c17_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_c17_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. 
*/ static void -matmul_c17_vanilla (gfc_array_c17 * const restrict retarray, +matmul_c17_vanilla (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_c17_vanilla (gfc_array_c17 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_c17 (gfc_array_c17 * const restrict retarray, +void matmul_c17 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_c17 * const restrict retarray, + static void (*matmul_p) (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_c17 * const restrict retarray, + void (*matmul_fn) (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_c17 (gfc_array_c17 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_c17 (gfc_array_c17 * const restrict retarray, +matmul_c17 (gfc_array_c17 * const restrict retarray, gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c index c8f4550b1b8..2ab8a6f317a 100644 --- a/libgfortran/generated/matmul_c4.c +++ b/libgfortran/generated/matmul_c4.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *, - const int *, const GFC_COMPLEX_4 *, const int *, - const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *, - int, int); + const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *, + const int *, const GFC_COMPLEX_4 *, const int *, + const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_c4 (gfc_array_c4 * const restrict retarray, +extern void matmul_c4 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_c4); @@ -80,11 +80,11 @@ export_proto(matmul_c4); #ifdef HAVE_AVX static void -matmul_c4_avx (gfc_array_c4 * const restrict retarray, +matmul_c4_avx (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_c4_avx (gfc_array_c4 * const restrict retarray, +matmul_c4_avx (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_c4_avx (gfc_array_c4 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, +matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, +matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, +matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, +matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray, +matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_c4_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_c4_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray, +matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_c4_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_c4_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_c4_vanilla (gfc_array_c4 * const restrict retarray, +matmul_c4_vanilla (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_c4_vanilla (gfc_array_c4 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_c4 (gfc_array_c4 * const restrict retarray, +void matmul_c4 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_c4 * const restrict retarray, + static void (*matmul_p) (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_c4 * const restrict retarray, + void (*matmul_fn) (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_c4 (gfc_array_c4 * const restrict retarray, +matmul_c4 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c index 5c5928d824a..fb5246ec78f 100644 --- a/libgfortran/generated/matmul_c8.c +++ b/libgfortran/generated/matmul_c8.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *, - const int *, const GFC_COMPLEX_8 *, const int *, - const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *, - int, int); + const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *, + const int *, const GFC_COMPLEX_8 *, const int *, + const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_c8 (gfc_array_c8 * const restrict retarray, +extern void matmul_c8 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_c8); @@ -80,11 +80,11 @@ export_proto(matmul_c8); #ifdef HAVE_AVX static void -matmul_c8_avx (gfc_array_c8 * const restrict retarray, +matmul_c8_avx (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_c8_avx (gfc_array_c8 * const restrict retarray, +matmul_c8_avx (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_c8_avx (gfc_array_c8 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, +matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, +matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, +matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, +matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray, +matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_c8_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_c8_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray, +matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_c8_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_c8_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_c8_vanilla (gfc_array_c8 * const restrict retarray, +matmul_c8_vanilla (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_c8_vanilla (gfc_array_c8 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_c8 (gfc_array_c8 * const restrict retarray, +void matmul_c8 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_c8 * const restrict retarray, + static void (*matmul_p) (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_c8 * const restrict retarray, + void (*matmul_fn) (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_c8 (gfc_array_c8 * const restrict retarray, +matmul_c8 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_i1.c b/libgfortran/generated/matmul_i1.c index 7a30ad818a2..51e020afb5c 100644 --- a/libgfortran/generated/matmul_i1.c +++ b/libgfortran/generated/matmul_i1.c @@ -28,17 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include -#if defined (HAVE_GFC_INTEGER_1) +#if defined (HAVE_GFC_UINTEGER_1) /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be passed to us by the front-end, in which case we call it for large matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *, - const int *, const GFC_INTEGER_1 *, const int *, - const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *, - int, int); + const int *, const GFC_UINTEGER_1 *, const GFC_UINTEGER_1 *, + const int *, const GFC_UINTEGER_1 *, const int *, + const GFC_UINTEGER_1 *, GFC_UINTEGER_1 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,8 +69,8 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_i1 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +extern void matmul_i1 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_i1); @@ -80,17 +80,17 @@ export_proto(matmul_i1); #ifdef HAVE_AVX static void -matmul_i1_avx (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_i1_avx (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_1 * restrict abase; - const GFC_INTEGER_1 * restrict bbase; - GFC_INTEGER_1 * restrict dest; + const GFC_UINTEGER_1 * restrict abase; + const GFC_UINTEGER_1 * restrict bbase; + GFC_UINTEGER_1 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -132,7 +132,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -251,7 +251,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_1 one = 1, zero = 0; + const GFC_UINTEGER_1 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -289,8 +289,8 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_1 *a, *b; - GFC_INTEGER_1 *c; + const GFC_UINTEGER_1 *a, *b; + GFC_UINTEGER_1 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -298,11 +298,11 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_1 *t1; + GFC_UINTEGER_1 *t1; a = abase; b = bbase; @@ -322,7 +322,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_1)0; + c[i + j * c_dim1] = (GFC_UINTEGER_1)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -339,7 +339,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1)); /* Start turning the crank. 
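      The blocked loops below copy tiles of a into the t1 buffer and
      accumulate the result in 4x4 blocks (f11 ... f44), following the
      GEMM-based approach from netlib.org noted above. 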
*/ i1 = n; @@ -557,10 +557,10 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -569,7 +569,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -578,13 +578,13 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -593,13 +593,13 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -609,7 +609,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -621,10 +621,10 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -633,7 +633,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -649,17 +649,17 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx2 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx2 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_1 * restrict abase; - const GFC_INTEGER_1 * restrict bbase; - GFC_INTEGER_1 * restrict dest; + const GFC_UINTEGER_1 * restrict abase; + const GFC_UINTEGER_1 * restrict bbase; + GFC_UINTEGER_1 * restrict dest; index_type rxstride, 
rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -701,7 +701,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -820,7 +820,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_1 one = 1, zero = 0; + const GFC_UINTEGER_1 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -858,8 +858,8 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_1 *a, *b; - GFC_INTEGER_1 *c; + const GFC_UINTEGER_1 *a, *b; + GFC_UINTEGER_1 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -867,11 +867,11 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_1 *t1; + GFC_UINTEGER_1 *t1; a = abase; b = bbase; @@ -891,7 +891,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_1)0; + c[i + j * c_dim1] = (GFC_UINTEGER_1)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -908,7 +908,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1)); /* Start turning the crank. 
*/ i1 = n; @@ -1126,10 +1126,10 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -1138,7 +1138,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1147,13 +1147,13 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1162,13 +1162,13 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1178,7 +1178,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1190,10 +1190,10 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -1202,7 +1202,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1218,17 +1218,17 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx512f (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx512f (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_1 * restrict abase; - const GFC_INTEGER_1 * restrict bbase; - GFC_INTEGER_1 * restrict dest; + const GFC_UINTEGER_1 * restrict abase; + const GFC_UINTEGER_1 * restrict bbase; + GFC_UINTEGER_1 
* restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1270,7 +1270,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1389,7 +1389,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_1 one = 1, zero = 0; + const GFC_UINTEGER_1 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -1427,8 +1427,8 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_1 *a, *b; - GFC_INTEGER_1 *c; + const GFC_UINTEGER_1 *a, *b; + GFC_UINTEGER_1 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -1436,11 +1436,11 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_1 *t1; + GFC_UINTEGER_1 *t1; a = abase; b = bbase; @@ -1460,7 +1460,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_1)0; + c[i + j * c_dim1] = (GFC_UINTEGER_1)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -1477,7 +1477,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1)); /* Start turning the crank. 
*/ i1 = n; @@ -1695,10 +1695,10 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -1707,7 +1707,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1716,13 +1716,13 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1731,13 +1731,13 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1747,7 +1747,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1759,10 +1759,10 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -1771,7 +1771,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1789,29 +1789,29 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx128_fma3 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_i1_avx128_fma3); #endif #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_avx128_fma4 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) 
__attribute__((__target__("avx,fma4"))); internal_proto(matmul_i1_avx128_fma4); #endif /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1_vanilla (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_1 * restrict abase; - const GFC_INTEGER_1 * restrict bbase; - GFC_INTEGER_1 * restrict dest; + const GFC_UINTEGER_1 * restrict abase; + const GFC_UINTEGER_1 * restrict bbase; + GFC_UINTEGER_1 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1853,7 +1853,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1972,7 +1972,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_1 one = 1, zero = 0; + const GFC_UINTEGER_1 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2010,8 +2010,8 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_1 *a, *b; - GFC_INTEGER_1 *c; + const GFC_UINTEGER_1 *a, *b; + GFC_UINTEGER_1 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2019,11 +2019,11 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_1 *t1; + GFC_UINTEGER_1 *t1; a = abase; b = bbase; @@ -2043,7 +2043,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_1)0; + c[i + j * c_dim1] = (GFC_UINTEGER_1)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2060,7 +2060,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1)); /* Start turning the crank. 
*/ i1 = n; @@ -2278,10 +2278,10 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -2290,7 +2290,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2299,13 +2299,13 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2314,13 +2314,13 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2330,7 +2330,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2342,10 +2342,10 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -2354,7 +2354,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -2371,16 +2371,16 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
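
The change is purely a type substitution because the arithmetic already does
the right thing: accumulating in a C unsigned type wraps modulo 2**n, which
matches the semantics Fortran's UNSIGNED requires.  A minimal standalone
analogue of the inner loop above (plain C arrays instead of gfortran
descriptors; the data values are invented):

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    uint8_t a[4] = { 200, 200, 200, 200 };  /* plays the role of abase_x */
    uint8_t b[4] = { 2, 2, 2, 2 };          /* plays the role of bbase_y */
    uint8_t s = (uint8_t) 0;

    /* Same shape as the kernel's reduction: s += a[n] * b[n].  */
    for (int n = 0; n < 4; n++)
      s += a[n] * b[n];

    /* 1600 mod 256 == 64; the sum wraps instead of overflowing.  */
    printf ("%u\n", (unsigned) s);
    return 0;
  }
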
*/ -void matmul_i1 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +void matmul_i1 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + static void (*matmul_p) (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + void (*matmul_fn) (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm); matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); @@ -2447,13 +2447,13 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_i1 (gfc_array_i1 * const restrict retarray, - gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, +matmul_i1 (gfc_array_m1 * const restrict retarray, + gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_1 * restrict abase; - const GFC_INTEGER_1 * restrict bbase; - GFC_INTEGER_1 * restrict dest; + const GFC_UINTEGER_1 * restrict abase; + const GFC_UINTEGER_1 * restrict bbase; + GFC_UINTEGER_1 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -2495,7 +2495,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -2614,7 +2614,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_1 one = 1, zero = 0; + const GFC_UINTEGER_1 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2652,8 +2652,8 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_1 *a, *b; - GFC_INTEGER_1 *c; + const GFC_UINTEGER_1 *a, *b; + GFC_UINTEGER_1 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2661,11 +2661,11 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_1 *t1; + GFC_UINTEGER_1 *t1; a = abase; b = bbase; @@ -2685,7 +2685,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, /* Empty c first. 
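
The matmul_i1 entry point itself only selects and caches a kernel.  Stripped
of the matmul-specific signatures, this is the usual initialize-once dispatch
through a static function pointer; the probe below is a hypothetical stand-in
for the runtime CPU-feature checks:

  typedef void (*kernel_fn) (void);

  static void kernel_vanilla (void) { }
  static void kernel_avx512f (void) { }

  static int have_avx512f (void) { return 0; }  /* hypothetical probe */

  static kernel_fn kernel_p;  /* NULL until the first call */

  void
  dispatch (void)
  {
    kernel_fn fn = __atomic_load_n (&kernel_p, __ATOMIC_RELAXED);
    if (fn == NULL)
      {
        fn = have_avx512f () ? kernel_avx512f : kernel_vanilla;
        /* Racing threads may store the same value; that is harmless.  */
        __atomic_store_n (&kernel_p, fn, __ATOMIC_RELAXED);
      }
    fn ();
  }

  int
  main (void)
  {
    dispatch ();
    return 0;
  }
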
*/ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_1)0; + c[i + j * c_dim1] = (GFC_UINTEGER_1)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2702,7 +2702,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1)); /* Start turning the crank. */ i1 = n; @@ -2920,10 +2920,10 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -2932,7 +2932,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2941,13 +2941,13 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2956,13 +2956,13 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2972,7 +2972,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2984,10 +2984,10 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, } else { - const GFC_INTEGER_1 *restrict abase_x; - const GFC_INTEGER_1 *restrict bbase_y; - GFC_INTEGER_1 *restrict dest_y; - GFC_INTEGER_1 s; + const GFC_UINTEGER_1 *restrict abase_x; + const GFC_UINTEGER_1 *restrict bbase_y; + GFC_UINTEGER_1 *restrict dest_y; + GFC_UINTEGER_1 s; for (y = 0; y < ycount; y++) { @@ -2996,7 +2996,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_1) 0; + s = (GFC_UINTEGER_1) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; diff --git a/libgfortran/generated/matmul_i16.c b/libgfortran/generated/matmul_i16.c index cf72f656511..9a7eee4ddc9 100644 --- a/libgfortran/generated/matmul_i16.c +++ b/libgfortran/generated/matmul_i16.c @@ -28,17 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include -#if defined (HAVE_GFC_INTEGER_16) +#if defined (HAVE_GFC_UINTEGER_16) /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be passed to us by the front-end, in which case we call it for large matrices. 
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *, - const int *, const GFC_INTEGER_16 *, const int *, - const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *, - int, int); + const int *, const GFC_UINTEGER_16 *, const GFC_UINTEGER_16 *, + const int *, const GFC_UINTEGER_16 *, const int *, + const GFC_UINTEGER_16 *, GFC_UINTEGER_16 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,8 +69,8 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_i16 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +extern void matmul_i16 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_i16); @@ -80,17 +80,17 @@ export_proto(matmul_i16); #ifdef HAVE_AVX static void -matmul_i16_avx (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_i16_avx (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_16 * restrict abase; - const GFC_INTEGER_16 * restrict bbase; - GFC_INTEGER_16 * restrict dest; + const GFC_UINTEGER_16 * restrict abase; + const GFC_UINTEGER_16 * restrict bbase; + GFC_UINTEGER_16 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -132,7 +132,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -251,7 +251,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_16 one = 1, zero = 0; + const GFC_UINTEGER_16 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -289,8 +289,8 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. 
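
The blas_call typedef mirrors a Fortran gemm binding: every numeric argument
is passed by reference, and the two trailing ints carry the hidden lengths of
the character arguments.  A self-contained sketch of a call through such a
pointer (the stub gemm and the unsigned short element type are illustrative
only; a real build would pass an actual gemm implementation):

  #include <stdio.h>

  typedef unsigned short T;  /* stands in for the GFC_UINTEGER_* type */

  typedef void (*blas_call) (const char *, const char *, const int *,
                             const int *, const int *, const T *, const T *,
                             const int *, const T *, const int *, const T *,
                             T *, const int *, int, int);

  /* Stub that only reports its arguments.  */
  static void
  stub_gemm (const char *transa, const char *transb, const int *m,
             const int *n, const int *k, const T *alpha, const T *a,
             const int *lda, const T *b, const int *ldb, const T *beta,
             T *c, const int *ldc, int ltransa, int ltransb)
  {
    (void) alpha; (void) a; (void) lda; (void) b; (void) ldb;
    (void) beta; (void) c; (void) ldc; (void) ltransa; (void) ltransb;
    printf ("gemm (%c, %c): %d x %d x %d\n", *transa, *transb, *m, *n, *k);
  }

  int
  main (void)
  {
    blas_call gemm = stub_gemm;
    const int m = 8, n = 8, k = 8, lda = 8, ldb = 8, ldc = 8;
    const T one = 1, zero = 0;
    T a[64] = { 0 }, b[64] = { 0 }, c[64];

    gemm ("N", "N", &m, &n, &k, &one, a, &lda, b, &ldb, &zero, c, &ldc,
          1, 1);
    return 0;
  }
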
*/ - const GFC_INTEGER_16 *a, *b; - GFC_INTEGER_16 *c; + const GFC_UINTEGER_16 *a, *b; + GFC_UINTEGER_16 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -298,11 +298,11 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_16 *t1; + GFC_UINTEGER_16 *t1; a = abase; b = bbase; @@ -322,7 +322,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_16)0; + c[i + j * c_dim1] = (GFC_UINTEGER_16)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -339,7 +339,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16)); /* Start turning the crank. */ i1 = n; @@ -557,10 +557,10 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -569,7 +569,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -578,13 +578,13 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -593,13 +593,13 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -609,7 +609,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -621,10 +621,10 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -633,7 +633,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = 
&abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -649,17 +649,17 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx2 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx2 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_16 * restrict abase; - const GFC_INTEGER_16 * restrict bbase; - GFC_INTEGER_16 * restrict dest; + const GFC_UINTEGER_16 * restrict abase; + const GFC_UINTEGER_16 * restrict bbase; + GFC_UINTEGER_16 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -701,7 +701,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -820,7 +820,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_16 one = 1, zero = 0; + const GFC_UINTEGER_16 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -858,8 +858,8 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_16 *a, *b; - GFC_INTEGER_16 *c; + const GFC_UINTEGER_16 *a, *b; + GFC_UINTEGER_16 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -867,11 +867,11 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_16 *t1; + GFC_UINTEGER_16 *t1; a = abase; b = bbase; @@ -891,7 +891,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_16)0; + c[i + j * c_dim1] = (GFC_UINTEGER_16)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -908,7 +908,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16)); /* Start turning the crank. 
*/ i1 = n; @@ -1126,10 +1126,10 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -1138,7 +1138,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1147,13 +1147,13 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1162,13 +1162,13 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1178,7 +1178,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1190,10 +1190,10 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -1202,7 +1202,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1218,17 +1218,17 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx512f (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx512f (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_16 * restrict abase; - const GFC_INTEGER_16 * restrict bbase; - GFC_INTEGER_16 * restrict dest; + const GFC_UINTEGER_16 * 
restrict abase; + const GFC_UINTEGER_16 * restrict bbase; + GFC_UINTEGER_16 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1270,7 +1270,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1389,7 +1389,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_16 one = 1, zero = 0; + const GFC_UINTEGER_16 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -1427,8 +1427,8 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_16 *a, *b; - GFC_INTEGER_16 *c; + const GFC_UINTEGER_16 *a, *b; + GFC_UINTEGER_16 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -1436,11 +1436,11 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_16 *t1; + GFC_UINTEGER_16 *t1; a = abase; b = bbase; @@ -1460,7 +1460,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_16)0; + c[i + j * c_dim1] = (GFC_UINTEGER_16)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -1477,7 +1477,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16)); /* Start turning the crank. 
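
When the result descriptor comes in unallocated, the kernel sizes the
allocation by the element type, which the template now maps to the unsigned
variant.  xmallocarray is the runtime's checked allocator; a rough standalone
analogue (simplified, with ad-hoc error handling; the real routine reports
errors through the libgfortran runtime) looks like:

  #include <stdio.h>
  #include <stdlib.h>

  static void *
  checked_mallocarray (size_t nelem, size_t elsize)
  {
    /* Refuse products that would overflow size_t.  */
    if (nelem != 0 && elsize > (size_t) -1 / nelem)
      {
        fputs ("integer overflow in allocation\n", stderr);
        exit (2);
      }

    size_t bytes = nelem * elsize;
    void *p = malloc (bytes ? bytes : 1);  /* keep zero-sized results valid */
    if (p == NULL)
      {
        fputs ("out of memory\n", stderr);
        exit (2);
      }
    return p;
  }

  int
  main (void)
  {
    unsigned short *dest = checked_mallocarray (16, sizeof (unsigned short));
    free (dest);
    return 0;
  }
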
*/ i1 = n; @@ -1695,10 +1695,10 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -1707,7 +1707,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1716,13 +1716,13 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1731,13 +1731,13 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1747,7 +1747,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1759,10 +1759,10 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -1771,7 +1771,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1789,29 +1789,29 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx128_fma3 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_i16_avx128_fma3); #endif #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_avx128_fma4 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int 
try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_i16_avx128_fma4); #endif /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16_vanilla (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_16 * restrict abase; - const GFC_INTEGER_16 * restrict bbase; - GFC_INTEGER_16 * restrict dest; + const GFC_UINTEGER_16 * restrict abase; + const GFC_UINTEGER_16 * restrict bbase; + GFC_UINTEGER_16 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1853,7 +1853,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1972,7 +1972,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_16 one = 1, zero = 0; + const GFC_UINTEGER_16 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2010,8 +2010,8 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_16 *a, *b; - GFC_INTEGER_16 *c; + const GFC_UINTEGER_16 *a, *b; + GFC_UINTEGER_16 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2019,11 +2019,11 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_16 *t1; + GFC_UINTEGER_16 *t1; a = abase; b = bbase; @@ -2043,7 +2043,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_16)0; + c[i + j * c_dim1] = (GFC_UINTEGER_16)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2060,7 +2060,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16)); /* Start turning the crank. 
*/ i1 = n; @@ -2278,10 +2278,10 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -2290,7 +2290,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2299,13 +2299,13 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2314,13 +2314,13 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2330,7 +2330,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2342,10 +2342,10 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -2354,7 +2354,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -2371,16 +2371,16 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_i16 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +void matmul_i16 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + static void (*matmul_p) (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + void (*matmul_fn) (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); @@ -2447,13 +2447,13 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_i16 (gfc_array_i16 * const restrict retarray, - gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, +matmul_i16 (gfc_array_m16 * const restrict retarray, + gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_16 * restrict abase; - const GFC_INTEGER_16 * restrict bbase; - GFC_INTEGER_16 * restrict dest; + const GFC_UINTEGER_16 * restrict abase; + const GFC_UINTEGER_16 * restrict bbase; + GFC_UINTEGER_16 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -2495,7 +2495,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -2614,7 +2614,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_16 one = 1, zero = 0; + const GFC_UINTEGER_16 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2652,8 +2652,8 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_16 *a, *b; - GFC_INTEGER_16 *c; + const GFC_UINTEGER_16 *a, *b; + GFC_UINTEGER_16 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2661,11 +2661,11 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_16 *t1; + GFC_UINTEGER_16 *t1; a = abase; b = bbase; @@ -2685,7 +2685,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, /* Empty c first. 
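
try_blas alone does not route a product to gemm; each kernel also applies the
size heuristic visible above, comparing the m*n*k work against
POW3(blas_limit).  In outline (threshold and extent values invented):

  #include <stdio.h>

  #define POW3(x) ((x) * (x) * (x))

  /* Only worth calling an external gemm when the m*n*k work outweighs
     the call overhead; below the cutoff the inlined loops win.  */
  static int
  worth_blas (int try_blas, double m, double n, double k, int blas_limit)
  {
    return try_blas && m * n * k > POW3 ((double) blas_limit);
  }

  int
  main (void)
  {
    printf ("%d\n", worth_blas (1, 100, 100, 100, 30));  /* 1: 1e6 > 27000 */
    printf ("%d\n", worth_blas (1, 10, 10, 10, 30));     /* 0: 1000 < 27000 */
    return 0;
  }
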
*/ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_16)0; + c[i + j * c_dim1] = (GFC_UINTEGER_16)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2702,7 +2702,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16)); /* Start turning the crank. */ i1 = n; @@ -2920,10 +2920,10 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -2932,7 +2932,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2941,13 +2941,13 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2956,13 +2956,13 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2972,7 +2972,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2984,10 +2984,10 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, } else { - const GFC_INTEGER_16 *restrict abase_x; - const GFC_INTEGER_16 *restrict bbase_y; - GFC_INTEGER_16 *restrict dest_y; - GFC_INTEGER_16 s; + const GFC_UINTEGER_16 *restrict abase_x; + const GFC_UINTEGER_16 *restrict bbase_y; + GFC_UINTEGER_16 *restrict dest_y; + GFC_UINTEGER_16 s; for (y = 0; y < ycount; y++) { @@ -2996,7 +2996,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_16) 0; + s = (GFC_UINTEGER_16) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; diff --git a/libgfortran/generated/matmul_i2.c b/libgfortran/generated/matmul_i2.c index 1b727e46588..89e326e6be5 100644 --- a/libgfortran/generated/matmul_i2.c +++ b/libgfortran/generated/matmul_i2.c @@ -28,17 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include -#if defined (HAVE_GFC_INTEGER_2) +#if defined (HAVE_GFC_UINTEGER_2) /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be passed to us by the front-end, in which case we call it for large matrices. 
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *, - const int *, const GFC_INTEGER_2 *, const int *, - const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *, - int, int); + const int *, const GFC_UINTEGER_2 *, const GFC_UINTEGER_2 *, + const int *, const GFC_UINTEGER_2 *, const int *, + const GFC_UINTEGER_2 *, GFC_UINTEGER_2 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,8 +69,8 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_i2 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +extern void matmul_i2 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_i2); @@ -80,17 +80,17 @@ export_proto(matmul_i2); #ifdef HAVE_AVX static void -matmul_i2_avx (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_i2_avx (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_2 * restrict abase; - const GFC_INTEGER_2 * restrict bbase; - GFC_INTEGER_2 * restrict dest; + const GFC_UINTEGER_2 * restrict abase; + const GFC_UINTEGER_2 * restrict bbase; + GFC_UINTEGER_2 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -132,7 +132,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -251,7 +251,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_2 one = 1, zero = 0; + const GFC_UINTEGER_2 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -289,8 +289,8 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. 
*/ - const GFC_INTEGER_2 *a, *b; - GFC_INTEGER_2 *c; + const GFC_UINTEGER_2 *a, *b; + GFC_UINTEGER_2 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -298,11 +298,11 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_2 *t1; + GFC_UINTEGER_2 *t1; a = abase; b = bbase; @@ -322,7 +322,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_2)0; + c[i + j * c_dim1] = (GFC_UINTEGER_2)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -339,7 +339,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2)); /* Start turning the crank. */ i1 = n; @@ -557,10 +557,10 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -569,7 +569,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -578,13 +578,13 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -593,13 +593,13 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -609,7 +609,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -621,10 +621,10 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -633,7 +633,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for 
(n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -649,17 +649,17 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx2 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx2 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_2 * restrict abase; - const GFC_INTEGER_2 * restrict bbase; - GFC_INTEGER_2 * restrict dest; + const GFC_UINTEGER_2 * restrict abase; + const GFC_UINTEGER_2 * restrict bbase; + GFC_UINTEGER_2 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -701,7 +701,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -820,7 +820,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_2 one = 1, zero = 0; + const GFC_UINTEGER_2 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -858,8 +858,8 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_2 *a, *b; - GFC_INTEGER_2 *c; + const GFC_UINTEGER_2 *a, *b; + GFC_UINTEGER_2 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -867,11 +867,11 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_2 *t1; + GFC_UINTEGER_2 *t1; a = abase; b = bbase; @@ -891,7 +891,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_2)0; + c[i + j * c_dim1] = (GFC_UINTEGER_2)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -908,7 +908,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2)); /* Start turning the crank. 
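
Each matmul_i2_* variant above is one function body compiled several times
under different target attributes, so a single object file carries scalar and
SIMD copies side by side for the dispatcher to choose among.  A reduced
example of the technique (x86 GCC/Clang; function names invented):

  int
  sum4 (const int *v)
  {
    return v[0] + v[1] + v[2] + v[3];
  }

  __attribute__((__target__("avx2,fma")))
  int
  sum4_avx2 (const int *v)
  {
    /* Identical source; the compiler may now use AVX2/FMA encodings.  */
    return v[0] + v[1] + v[2] + v[3];
  }

  int
  main (void)
  {
    int v[4] = { 1, 2, 3, 4 };
    return sum4 (v) - 10;  /* 0; sum4_avx2 would be picked at dispatch */
  }
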
*/ i1 = n; @@ -1126,10 +1126,10 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -1138,7 +1138,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1147,13 +1147,13 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1162,13 +1162,13 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1178,7 +1178,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1190,10 +1190,10 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -1202,7 +1202,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1218,17 +1218,17 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx512f (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx512f (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_2 * restrict abase; - const GFC_INTEGER_2 * restrict bbase; - GFC_INTEGER_2 * restrict dest; + const GFC_UINTEGER_2 * restrict abase; + const GFC_UINTEGER_2 * restrict bbase; + GFC_UINTEGER_2 
* restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1270,7 +1270,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1389,7 +1389,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_2 one = 1, zero = 0; + const GFC_UINTEGER_2 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -1427,8 +1427,8 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_2 *a, *b; - GFC_INTEGER_2 *c; + const GFC_UINTEGER_2 *a, *b; + GFC_UINTEGER_2 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -1436,11 +1436,11 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_2 *t1; + GFC_UINTEGER_2 *t1; a = abase; b = bbase; @@ -1460,7 +1460,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_2)0; + c[i + j * c_dim1] = (GFC_UINTEGER_2)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -1477,7 +1477,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2)); /* Start turning the crank. 
*/ i1 = n; @@ -1695,10 +1695,10 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -1707,7 +1707,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1716,13 +1716,13 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1731,13 +1731,13 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1747,7 +1747,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1759,10 +1759,10 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -1771,7 +1771,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1789,29 +1789,29 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx128_fma3 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_i2_avx128_fma3); #endif #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_avx128_fma4 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) 
__attribute__((__target__("avx,fma4"))); internal_proto(matmul_i2_avx128_fma4); #endif /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2_vanilla (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_2 * restrict abase; - const GFC_INTEGER_2 * restrict bbase; - GFC_INTEGER_2 * restrict dest; + const GFC_UINTEGER_2 * restrict abase; + const GFC_UINTEGER_2 * restrict bbase; + GFC_UINTEGER_2 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1853,7 +1853,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1972,7 +1972,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_2 one = 1, zero = 0; + const GFC_UINTEGER_2 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2010,8 +2010,8 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_2 *a, *b; - GFC_INTEGER_2 *c; + const GFC_UINTEGER_2 *a, *b; + GFC_UINTEGER_2 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2019,11 +2019,11 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_2 *t1; + GFC_UINTEGER_2 *t1; a = abase; b = bbase; @@ -2043,7 +2043,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_2)0; + c[i + j * c_dim1] = (GFC_UINTEGER_2)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2060,7 +2060,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2)); /* Start turning the crank. 
*/ i1 = n; @@ -2278,10 +2278,10 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -2290,7 +2290,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2299,13 +2299,13 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2314,13 +2314,13 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2330,7 +2330,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2342,10 +2342,10 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -2354,7 +2354,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -2371,16 +2371,16 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
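The dispatch wrapper below selects a processor-specific kernel the first
time it is called and caches the choice.  As a minimal sketch of the
pattern, assuming hypothetical kernel names (the real code chooses among
the vanilla, AVX, AVX2, AVX512F and AVX128/FMA variants defined above):

  static void kernel_vanilla (void) { /* portable fallback */ }
  static void kernel_avx2 (void)    { /* AVX2-specific variant */ }

  static void (*matmul_p) (void);

  static void
  dispatch_sketch (void)
  {
    void (*fn) (void) = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
    if (fn == NULL)
      {
        fn = kernel_vanilla;
        if (__builtin_cpu_supports ("avx2"))
          fn = kernel_avx2;
        /* Relaxed ordering is enough: racing threads all compute the
           same pointer value, so any winner is correct.  */
        __atomic_store_n (&matmul_p, fn, __ATOMIC_RELAXED);
      }
    fn ();
  }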
*/ -void matmul_i2 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +void matmul_i2 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + static void (*matmul_p) (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + void (*matmul_fn) (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm); matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); @@ -2447,13 +2447,13 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_i2 (gfc_array_i2 * const restrict retarray, - gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, +matmul_i2 (gfc_array_m2 * const restrict retarray, + gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_2 * restrict abase; - const GFC_INTEGER_2 * restrict bbase; - GFC_INTEGER_2 * restrict dest; + const GFC_UINTEGER_2 * restrict abase; + const GFC_UINTEGER_2 * restrict bbase; + GFC_UINTEGER_2 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -2495,7 +2495,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -2614,7 +2614,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_2 one = 1, zero = 0; + const GFC_UINTEGER_2 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2652,8 +2652,8 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_2 *a, *b; - GFC_INTEGER_2 *c; + const GFC_UINTEGER_2 *a, *b; + GFC_UINTEGER_2 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2661,11 +2661,11 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_2 *t1; + GFC_UINTEGER_2 *t1; a = abase; b = bbase; @@ -2685,7 +2685,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, /* Empty c first. 
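Why a pure type swap is sufficient here: C unsigned arithmetic is
modular, so accumulating into a GFC_UINTEGER_N variable gives exactly
the wrap-around semantics the front end defines for UNSIGNED, and the
loop structure of the signed integer kernels can be reused unchanged.
A standalone 16-bit dot product illustrating the wrap-around (the
values are made up for the example):

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    uint16_t a[3] = { 40000, 1, 2 };
    uint16_t b[3] = { 3, 4, 5 };
    uint16_t s = 0;
    for (int n = 0; n < 3; n++)
      /* Products and sums wrap modulo 2**16 when truncated back.  */
      s = (uint16_t) (s + (uint16_t) (a[n] * b[n]));
    printf ("%u\n", s);  /* (120000 + 4 + 10) mod 65536 == 54478 */
    return 0;
  }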
*/ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_2)0; + c[i + j * c_dim1] = (GFC_UINTEGER_2)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2702,7 +2702,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2)); /* Start turning the crank. */ i1 = n; @@ -2920,10 +2920,10 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -2932,7 +2932,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2941,13 +2941,13 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2956,13 +2956,13 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2972,7 +2972,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2984,10 +2984,10 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, } else { - const GFC_INTEGER_2 *restrict abase_x; - const GFC_INTEGER_2 *restrict bbase_y; - GFC_INTEGER_2 *restrict dest_y; - GFC_INTEGER_2 s; + const GFC_UINTEGER_2 *restrict abase_x; + const GFC_UINTEGER_2 *restrict bbase_y; + GFC_UINTEGER_2 *restrict dest_y; + GFC_UINTEGER_2 s; for (y = 0; y < ycount; y++) { @@ -2996,7 +2996,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_2) 0; + s = (GFC_UINTEGER_2) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c index ba421d72c35..2601f6453b7 100644 --- a/libgfortran/generated/matmul_i4.c +++ b/libgfortran/generated/matmul_i4.c @@ -28,17 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include -#if defined (HAVE_GFC_INTEGER_4) +#if defined (HAVE_GFC_UINTEGER_4) /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be passed to us by the front-end, in which case we call it for large matrices. 
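For context on the gemm hook: when the caller sets try_blas and the
matrices are large enough, the library forwards to the BLAS routine
through the blas_call pointer typedef'd just below.  A minimal sketch
of that handoff, reusing the blas_call and GFC_UINTEGER_4 types from
this file (the wrapper name is hypothetical, and for integer and
unsigned kinds this path is presumably never taken, since no standard
BLAS gemm exists for them; it survives because every variant is
generated from the same matmul.m4 template):

  static void
  blas_handoff_sketch (blas_call gemm,
                       const GFC_UINTEGER_4 *a, int lda,
                       const GFC_UINTEGER_4 *b, int ldb,
                       GFC_UINTEGER_4 *c, int ldc,
                       int m, int n, int k)
  {
    const GFC_UINTEGER_4 one = 1, zero = 0;
    /* "N", "N": neither operand is transposed; the trailing 1, 1 are
       the hidden character-length arguments of the Fortran ABI.  */
    gemm ("N", "N", &m, &n, &k, &one, a, &lda, b, &ldb,
          &zero, c, &ldc, 1, 1);
  }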
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *, - const int *, const GFC_INTEGER_4 *, const int *, - const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *, - int, int); + const int *, const GFC_UINTEGER_4 *, const GFC_UINTEGER_4 *, + const int *, const GFC_UINTEGER_4 *, const int *, + const GFC_UINTEGER_4 *, GFC_UINTEGER_4 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,8 +69,8 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_i4 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +extern void matmul_i4 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_i4); @@ -80,17 +80,17 @@ export_proto(matmul_i4); #ifdef HAVE_AVX static void -matmul_i4_avx (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_i4_avx (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_4 * restrict abase; - const GFC_INTEGER_4 * restrict bbase; - GFC_INTEGER_4 * restrict dest; + const GFC_UINTEGER_4 * restrict abase; + const GFC_UINTEGER_4 * restrict bbase; + GFC_UINTEGER_4 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -132,7 +132,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -251,7 +251,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_4 one = 1, zero = 0; + const GFC_UINTEGER_4 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -289,8 +289,8 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. 
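For readers unfamiliar with the netlib-derived kernel referenced above:
it is a cache-blocked (tiled) matrix multiplication that additionally
stages blocks of A in the t1 scratch buffer (capped at 65536 elements).
A minimal tiled sketch in that spirit, without the staging buffer and
with a placeholder block size:

  enum { BS = 64 };  /* placeholder block size */

  /* Column-major C = C + A*B for m x k times k x n operands.
     Assumes c[] is zeroed on entry, as the library code ensures
     ("Empty c first").  */
  static void
  tiled_matmul (int m, int n, int k,
                const unsigned *a, const unsigned *b, unsigned *c)
  {
    for (int jj = 0; jj < n; jj += BS)
      for (int ll = 0; ll < k; ll += BS)
        for (int ii = 0; ii < m; ii += BS)
          for (int j = jj; j < n && j < jj + BS; j++)
            for (int l = ll; l < k && l < ll + BS; l++)
              for (int i = ii; i < m && i < ii + BS; i++)
                c[i + j * m] += a[i + l * m] * b[l + j * k];
  }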
*/ - const GFC_INTEGER_4 *a, *b; - GFC_INTEGER_4 *c; + const GFC_UINTEGER_4 *a, *b; + GFC_UINTEGER_4 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -298,11 +298,11 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_4 *t1; + GFC_UINTEGER_4 *t1; a = abase; b = bbase; @@ -322,7 +322,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_4)0; + c[i + j * c_dim1] = (GFC_UINTEGER_4)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -339,7 +339,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4)); /* Start turning the crank. */ i1 = n; @@ -557,10 +557,10 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -569,7 +569,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -578,13 +578,13 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -593,13 +593,13 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -609,7 +609,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -621,10 +621,10 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -633,7 +633,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for 
(n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -649,17 +649,17 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx2 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx2 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_4 * restrict abase; - const GFC_INTEGER_4 * restrict bbase; - GFC_INTEGER_4 * restrict dest; + const GFC_UINTEGER_4 * restrict abase; + const GFC_UINTEGER_4 * restrict bbase; + GFC_UINTEGER_4 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -701,7 +701,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -820,7 +820,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_4 one = 1, zero = 0; + const GFC_UINTEGER_4 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -858,8 +858,8 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_4 *a, *b; - GFC_INTEGER_4 *c; + const GFC_UINTEGER_4 *a, *b; + GFC_UINTEGER_4 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -867,11 +867,11 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_4 *t1; + GFC_UINTEGER_4 *t1; a = abase; b = bbase; @@ -891,7 +891,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_4)0; + c[i + j * c_dim1] = (GFC_UINTEGER_4)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -908,7 +908,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4)); /* Start turning the crank. 
*/ i1 = n; @@ -1126,10 +1126,10 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -1138,7 +1138,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1147,13 +1147,13 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1162,13 +1162,13 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1178,7 +1178,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1190,10 +1190,10 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -1202,7 +1202,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1218,17 +1218,17 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx512f (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx512f (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_4 * restrict abase; - const GFC_INTEGER_4 * restrict bbase; - GFC_INTEGER_4 * restrict dest; + const GFC_UINTEGER_4 * restrict abase; + const GFC_UINTEGER_4 * restrict bbase; + GFC_UINTEGER_4 
* restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1270,7 +1270,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1389,7 +1389,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_4 one = 1, zero = 0; + const GFC_UINTEGER_4 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -1427,8 +1427,8 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_4 *a, *b; - GFC_INTEGER_4 *c; + const GFC_UINTEGER_4 *a, *b; + GFC_UINTEGER_4 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -1436,11 +1436,11 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_4 *t1; + GFC_UINTEGER_4 *t1; a = abase; b = bbase; @@ -1460,7 +1460,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_4)0; + c[i + j * c_dim1] = (GFC_UINTEGER_4)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -1477,7 +1477,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4)); /* Start turning the crank. 
*/ i1 = n; @@ -1695,10 +1695,10 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -1707,7 +1707,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1716,13 +1716,13 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1731,13 +1731,13 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1747,7 +1747,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1759,10 +1759,10 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -1771,7 +1771,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1789,29 +1789,29 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx128_fma3 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_i4_avx128_fma3); #endif #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_avx128_fma4 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) 
__attribute__((__target__("avx,fma4"))); internal_proto(matmul_i4_avx128_fma4); #endif /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4_vanilla (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_4 * restrict abase; - const GFC_INTEGER_4 * restrict bbase; - GFC_INTEGER_4 * restrict dest; + const GFC_UINTEGER_4 * restrict abase; + const GFC_UINTEGER_4 * restrict bbase; + GFC_UINTEGER_4 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1853,7 +1853,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1972,7 +1972,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_4 one = 1, zero = 0; + const GFC_UINTEGER_4 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2010,8 +2010,8 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_4 *a, *b; - GFC_INTEGER_4 *c; + const GFC_UINTEGER_4 *a, *b; + GFC_UINTEGER_4 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2019,11 +2019,11 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_4 *t1; + GFC_UINTEGER_4 *t1; a = abase; b = bbase; @@ -2043,7 +2043,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_4)0; + c[i + j * c_dim1] = (GFC_UINTEGER_4)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2060,7 +2060,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4)); /* Start turning the crank. 
*/ i1 = n; @@ -2278,10 +2278,10 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -2290,7 +2290,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2299,13 +2299,13 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2314,13 +2314,13 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2330,7 +2330,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2342,10 +2342,10 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -2354,7 +2354,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -2371,16 +2371,16 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_i4 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +void matmul_i4 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + static void (*matmul_p) (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + void (*matmul_fn) (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); @@ -2447,13 +2447,13 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_i4 (gfc_array_i4 * const restrict retarray, - gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, +matmul_i4 (gfc_array_m4 * const restrict retarray, + gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_4 * restrict abase; - const GFC_INTEGER_4 * restrict bbase; - GFC_INTEGER_4 * restrict dest; + const GFC_UINTEGER_4 * restrict abase; + const GFC_UINTEGER_4 * restrict bbase; + GFC_UINTEGER_4 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -2495,7 +2495,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -2614,7 +2614,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_4 one = 1, zero = 0; + const GFC_UINTEGER_4 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2652,8 +2652,8 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_4 *a, *b; - GFC_INTEGER_4 *c; + const GFC_UINTEGER_4 *a, *b; + GFC_UINTEGER_4 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2661,11 +2661,11 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_4 *t1; + GFC_UINTEGER_4 *t1; a = abase; b = bbase; @@ -2685,7 +2685,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, /* Empty c first. 
*/ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_4)0; + c[i + j * c_dim1] = (GFC_UINTEGER_4)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2702,7 +2702,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4)); /* Start turning the crank. */ i1 = n; @@ -2920,10 +2920,10 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -2932,7 +2932,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2941,13 +2941,13 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2956,13 +2956,13 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2972,7 +2972,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2984,10 +2984,10 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, } else { - const GFC_INTEGER_4 *restrict abase_x; - const GFC_INTEGER_4 *restrict bbase_y; - GFC_INTEGER_4 *restrict dest_y; - GFC_INTEGER_4 s; + const GFC_UINTEGER_4 *restrict abase_x; + const GFC_UINTEGER_4 *restrict bbase_y; + GFC_UINTEGER_4 *restrict dest_y; + GFC_UINTEGER_4 s; for (y = 0; y < ycount; y++) { @@ -2996,7 +2996,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_4) 0; + s = (GFC_UINTEGER_4) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c index 9405abc23b8..96ef7e69456 100644 --- a/libgfortran/generated/matmul_i8.c +++ b/libgfortran/generated/matmul_i8.c @@ -28,17 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include -#if defined (HAVE_GFC_INTEGER_8) +#if defined (HAVE_GFC_UINTEGER_8) /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be passed to us by the front-end, in which case we call it for large matrices. 
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *, - const int *, const GFC_INTEGER_8 *, const int *, - const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *, - int, int); + const int *, const GFC_UINTEGER_8 *, const GFC_UINTEGER_8 *, + const int *, const GFC_UINTEGER_8 *, const int *, + const GFC_UINTEGER_8 *, GFC_UINTEGER_8 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,8 +69,8 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_i8 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +extern void matmul_i8 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_i8); @@ -80,17 +80,17 @@ export_proto(matmul_i8); #ifdef HAVE_AVX static void -matmul_i8_avx (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_i8_avx (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_8 * restrict abase; - const GFC_INTEGER_8 * restrict bbase; - GFC_INTEGER_8 * restrict dest; + const GFC_UINTEGER_8 * restrict abase; + const GFC_UINTEGER_8 * restrict bbase; + GFC_UINTEGER_8 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -132,7 +132,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -251,7 +251,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_8 one = 1, zero = 0; + const GFC_UINTEGER_8 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -289,8 +289,8 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. 
*/ - const GFC_INTEGER_8 *a, *b; - GFC_INTEGER_8 *c; + const GFC_UINTEGER_8 *a, *b; + GFC_UINTEGER_8 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -298,11 +298,11 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_8 *t1; + GFC_UINTEGER_8 *t1; a = abase; b = bbase; @@ -322,7 +322,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_8)0; + c[i + j * c_dim1] = (GFC_UINTEGER_8)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -339,7 +339,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8)); /* Start turning the crank. */ i1 = n; @@ -557,10 +557,10 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -569,7 +569,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -578,13 +578,13 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -593,13 +593,13 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -609,7 +609,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -621,10 +621,10 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -633,7 +633,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for 
(n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -649,17 +649,17 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx2 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx2 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_8 * restrict abase; - const GFC_INTEGER_8 * restrict bbase; - GFC_INTEGER_8 * restrict dest; + const GFC_UINTEGER_8 * restrict abase; + const GFC_UINTEGER_8 * restrict bbase; + GFC_UINTEGER_8 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -701,7 +701,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -820,7 +820,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_8 one = 1, zero = 0; + const GFC_UINTEGER_8 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -858,8 +858,8 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_8 *a, *b; - GFC_INTEGER_8 *c; + const GFC_UINTEGER_8 *a, *b; + GFC_UINTEGER_8 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -867,11 +867,11 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_8 *t1; + GFC_UINTEGER_8 *t1; a = abase; b = bbase; @@ -891,7 +891,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_8)0; + c[i + j * c_dim1] = (GFC_UINTEGER_8)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -908,7 +908,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8)); /* Start turning the crank. 
*/ i1 = n; @@ -1126,10 +1126,10 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -1138,7 +1138,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1147,13 +1147,13 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1162,13 +1162,13 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1178,7 +1178,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1190,10 +1190,10 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -1202,7 +1202,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1218,17 +1218,17 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx512f (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx512f (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_8 * restrict abase; - const GFC_INTEGER_8 * restrict bbase; - GFC_INTEGER_8 * restrict dest; + const GFC_UINTEGER_8 * restrict abase; + const GFC_UINTEGER_8 * restrict bbase; + GFC_UINTEGER_8 
* restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1270,7 +1270,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1389,7 +1389,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_8 one = 1, zero = 0; + const GFC_UINTEGER_8 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -1427,8 +1427,8 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_8 *a, *b; - GFC_INTEGER_8 *c; + const GFC_UINTEGER_8 *a, *b; + GFC_UINTEGER_8 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -1436,11 +1436,11 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_8 *t1; + GFC_UINTEGER_8 *t1; a = abase; b = bbase; @@ -1460,7 +1460,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_8)0; + c[i + j * c_dim1] = (GFC_UINTEGER_8)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -1477,7 +1477,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8)); /* Start turning the crank. 
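
   A note on the scratch buffer visible in the hunk just above: t1 is
   capped at 65536 elements, i.e. 65536 * 8 bytes = 512 KiB for kind 8,
   presumably so the blocked working set stays cache-sized; this patch
   changes only the element type, not the blocking.  A hedged,
   self-contained sketch of the allocation pattern (the real t1_dim
   formula lives in the generated file):

     #include <stdlib.h>
     #include <stdint.h>

     int main (void)
     {
       size_t t1_dim = 1 << 20;        // pretend the blocking formula asked for more
       if (t1_dim > 65536)
         t1_dim = 65536;               // cap: at most 512 KiB of uint64_t scratch
       uint64_t *t1 = malloc (t1_dim * sizeof *t1);
       free (t1);
       return 0;
     }
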
*/ i1 = n; @@ -1695,10 +1695,10 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -1707,7 +1707,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -1716,13 +1716,13 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -1731,13 +1731,13 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -1747,7 +1747,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -1759,10 +1759,10 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -1771,7 +1771,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -1789,29 +1789,29 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx128_fma3 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_i8_avx128_fma3); #endif #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_avx128_fma4 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) 
__attribute__((__target__("avx,fma4"))); internal_proto(matmul_i8_avx128_fma4); #endif /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8_vanilla (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_8 * restrict abase; - const GFC_INTEGER_8 * restrict bbase; - GFC_INTEGER_8 * restrict dest; + const GFC_UINTEGER_8 * restrict abase; + const GFC_UINTEGER_8 * restrict bbase; + GFC_UINTEGER_8 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -1853,7 +1853,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -1972,7 +1972,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_8 one = 1, zero = 0; + const GFC_UINTEGER_8 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2010,8 +2010,8 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_8 *a, *b; - GFC_INTEGER_8 *c; + const GFC_UINTEGER_8 *a, *b; + GFC_UINTEGER_8 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2019,11 +2019,11 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_8 *t1; + GFC_UINTEGER_8 *t1; a = abase; b = bbase; @@ -2043,7 +2043,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, /* Empty c first. */ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_8)0; + c[i + j * c_dim1] = (GFC_UINTEGER_8)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2060,7 +2060,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8)); /* Start turning the crank. 
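
   On the try_blas path seen in these hunks: when gfortran compiles with
   -fexternal-blas it passes a gemm function pointer, with
   -fblas-matmul-limit arriving as blas_limit, and sufficiently large
   contiguous cases are forwarded to it; the constants one and zero
   above are gemm's alpha and beta.  BLAS has no integer or unsigned
   gemm, so for this file the branch is presumably never exercised.  The
   call shape, reconstructed as a fragment from the blas_call typedef at
   the top of each generated file (the two trailing ints are hidden
   Fortran character lengths; the exact call is elided from this
   excerpt):

     // hedged reconstruction, not the literal call in the file
     gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T",
           &m, &n, &k, &one, abase, &lda, bbase, &ldb,
           &zero, dest, &ldc, 1, 1);
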
*/ i1 = n; @@ -2278,10 +2278,10 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -2290,7 +2290,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2299,13 +2299,13 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2314,13 +2314,13 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2330,7 +2330,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2342,10 +2342,10 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -2354,7 +2354,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; @@ -2371,16 +2371,16 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
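
   The dispatch hunk that follows only retypes the descriptor arguments;
   the mechanism is untouched.  matmul_i8 caches the selected kernel in
   the static pointer matmul_p and reloads it with a relaxed atomic on
   every call.  The selection body itself is elided from this excerpt; a
   plausible sketch of the pattern, with the specific CPU tests being an
   assumption, is:

     matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
     if (matmul_fn == NULL)
       {
         matmul_fn = matmul_i8_vanilla;          // always-available fallback
         if (__builtin_cpu_supports ("avx512f"))
           matmul_fn = matmul_i8_avx512f;
         else if (__builtin_cpu_supports ("avx2"))
           matmul_fn = matmul_i8_avx2;
         else if (__builtin_cpu_supports ("avx"))
           matmul_fn = matmul_i8_avx;
         __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
       }
     (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
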
*/ -void matmul_i8 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +void matmul_i8 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + static void (*matmul_p) (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + void (*matmul_fn) (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED); @@ -2447,13 +2447,13 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_i8 (gfc_array_i8 * const restrict retarray, - gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, +matmul_i8 (gfc_array_m8 * const restrict retarray, + gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - const GFC_INTEGER_8 * restrict abase; - const GFC_INTEGER_8 * restrict bbase; - GFC_INTEGER_8 * restrict dest; + const GFC_UINTEGER_8 * restrict abase; + const GFC_UINTEGER_8 * restrict bbase; + GFC_UINTEGER_8 * restrict dest; index_type rxstride, rystride, axstride, aystride, bxstride, bystride; index_type x, y, n, count, xcount, ycount; @@ -2495,7 +2495,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, } retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8)); retarray->offset = 0; } else if (unlikely (compile_options.bounds_check)) @@ -2614,7 +2614,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, > POW3(blas_limit))) { const int m = xcount, n = ycount, k = count, ldc = rystride; - const GFC_INTEGER_8 one = 1, zero = 0; + const GFC_UINTEGER_8 one = 1, zero = 0; const int lda = (axstride == 1) ? aystride : axstride, ldb = (bxstride == 1) ? bystride : bxstride; @@ -2652,8 +2652,8 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, from netlib.org, translated to C, and modified for matmul.m4. */ - const GFC_INTEGER_8 *a, *b; - GFC_INTEGER_8 *c; + const GFC_UINTEGER_8 *a, *b; + GFC_UINTEGER_8 *c; const index_type m = xcount, n = ycount, k = count; /* System generated locals */ @@ -2661,11 +2661,11 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, i1, i2, i3, i4, i5, i6; /* Local variables */ - GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, + GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42, f13, f14, f23, f24, f33, f34, f43, f44; index_type i, j, l, ii, jj, ll; index_type isec, jsec, lsec, uisec, ujsec, ulsec; - GFC_INTEGER_8 *t1; + GFC_UINTEGER_8 *t1; a = abase; b = bbase; @@ -2685,7 +2685,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, /* Empty c first. 
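
   On the zeroing loop that follows: j (the column index) is the outer
   loop and i the inner one because Fortran arrays are column-major, so
   the inner loop writes contiguous memory; c_dim1 is the leading
   dimension.  A minimal stand-alone illustration of the same 1-based
   addressing, with hypothetical sizes:

     #include <stdint.h>

     int main (void)
     {
       enum { m = 2, n = 3, c_dim1 = m };
       uint64_t c[m + n * c_dim1 + 1];   // room for 1-based (i,j) indexing
       for (int j = 1; j <= n; j++)      // columns outer ...
         for (int i = 1; i <= m; i++)    // ... rows inner: contiguous writes
           c[i + j * c_dim1] = 0;
       return (int) c[m + n * c_dim1];   // last element, provably zero
     }
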
*/ for (j=1; j<=n; j++) for (i=1; i<=m; i++) - c[i + j * c_dim1] = (GFC_INTEGER_8)0; + c[i + j * c_dim1] = (GFC_UINTEGER_8)0; /* Early exit if possible */ if (m == 0 || n == 0 || k == 0) @@ -2702,7 +2702,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, if (t1_dim > 65536) t1_dim = 65536; - t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8)); + t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8)); /* Start turning the crank. */ i1 = n; @@ -2920,10 +2920,10 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, { if (GFC_DESCRIPTOR_RANK (a) != 1) { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -2932,7 +2932,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n] * bbase_y[n]; dest_y[x] = s; @@ -2941,13 +2941,13 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n]; dest[y*rystride] = s; @@ -2956,13 +2956,13 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, } else if (GFC_DESCRIPTOR_RANK (a) == 1) { - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { bbase_y = &bbase[y*bystride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase[n*axstride] * bbase_y[n*bxstride]; dest[y*rxstride] = s; @@ -2972,7 +2972,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, { for (y = 0; y < ycount; y++) for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0; for (y = 0; y < ycount; y++) for (n = 0; n < count; n++) @@ -2984,10 +2984,10 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, } else { - const GFC_INTEGER_8 *restrict abase_x; - const GFC_INTEGER_8 *restrict bbase_y; - GFC_INTEGER_8 *restrict dest_y; - GFC_INTEGER_8 s; + const GFC_UINTEGER_8 *restrict abase_x; + const GFC_UINTEGER_8 *restrict bbase_y; + GFC_UINTEGER_8 *restrict dest_y; + GFC_UINTEGER_8 s; for (y = 0; y < ycount; y++) { @@ -2996,7 +2996,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, for (x = 0; x < xcount; x++) { abase_x = &abase[x*axstride]; - s = (GFC_INTEGER_8) 0; + s = (GFC_UINTEGER_8) 0; for (n = 0; n < count; n++) s += abase_x[n*aystride] * bbase_y[n*bxstride]; dest_y[x*rxstride] = s; diff --git a/libgfortran/generated/matmul_r10.c b/libgfortran/generated/matmul_r10.c index c3434c2543f..9d28bf3a131 100644 --- a/libgfortran/generated/matmul_r10.c +++ b/libgfortran/generated/matmul_r10.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. 
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_REAL_10 *, const GFC_REAL_10 *, - const int *, const GFC_REAL_10 *, const int *, - const GFC_REAL_10 *, GFC_REAL_10 *, const int *, - int, int); + const int *, const GFC_REAL_10 *, const GFC_REAL_10 *, + const int *, const GFC_REAL_10 *, const int *, + const GFC_REAL_10 *, GFC_REAL_10 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_r10 (gfc_array_r10 * const restrict retarray, +extern void matmul_r10 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_r10); @@ -80,11 +80,11 @@ export_proto(matmul_r10); #ifdef HAVE_AVX static void -matmul_r10_avx (gfc_array_r10 * const restrict retarray, +matmul_r10_avx (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_r10_avx (gfc_array_r10 * const restrict retarray, +matmul_r10_avx (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_r10_avx (gfc_array_r10 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, +matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, +matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, +matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, +matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, +matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_r10_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_r10_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict 
retarray, +matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_r10_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_r10_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_r10_vanilla (gfc_array_r10 * const restrict retarray, +matmul_r10_vanilla (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_r10_vanilla (gfc_array_r10 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_r10 (gfc_array_r10 * const restrict retarray, +void matmul_r10 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_r10 * const restrict retarray, + static void (*matmul_p) (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_r10 * const restrict retarray, + void (*matmul_fn) (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_r10 (gfc_array_r10 * const restrict retarray, +matmul_r10 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_r16.c b/libgfortran/generated/matmul_r16.c index 2fe50d21667..889280cb4ca 100644 --- a/libgfortran/generated/matmul_r16.c +++ b/libgfortran/generated/matmul_r16.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_REAL_16 *, const GFC_REAL_16 *, - const int *, const GFC_REAL_16 *, const int *, - const GFC_REAL_16 *, GFC_REAL_16 *, const int *, - int, int); + const int *, const GFC_REAL_16 *, const GFC_REAL_16 *, + const int *, const GFC_REAL_16 *, const int *, + const GFC_REAL_16 *, GFC_REAL_16 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_r16 (gfc_array_r16 * const restrict retarray, +extern void matmul_r16 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_r16); @@ -80,11 +80,11 @@ export_proto(matmul_r16); #ifdef HAVE_AVX static void -matmul_r16_avx (gfc_array_r16 * const restrict retarray, +matmul_r16_avx (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_r16_avx (gfc_array_r16 * const restrict retarray, +matmul_r16_avx (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_r16_avx (gfc_array_r16 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, +matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, +matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, +matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, +matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray, +matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_r16_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_r16_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray, +matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_r16_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_r16_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. 
*/ static void -matmul_r16_vanilla (gfc_array_r16 * const restrict retarray, +matmul_r16_vanilla (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_r16_vanilla (gfc_array_r16 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_r16 (gfc_array_r16 * const restrict retarray, +void matmul_r16 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_r16 * const restrict retarray, + static void (*matmul_p) (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_r16 * const restrict retarray, + void (*matmul_fn) (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_r16 (gfc_array_r16 * const restrict retarray, +matmul_r16 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_r17.c b/libgfortran/generated/matmul_r17.c index 67ff8e601e2..7ab9f2ff3dc 100644 --- a/libgfortran/generated/matmul_r17.c +++ b/libgfortran/generated/matmul_r17.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_REAL_17 *, const GFC_REAL_17 *, - const int *, const GFC_REAL_17 *, const int *, - const GFC_REAL_17 *, GFC_REAL_17 *, const int *, - int, int); + const int *, const GFC_REAL_17 *, const GFC_REAL_17 *, + const int *, const GFC_REAL_17 *, const int *, + const GFC_REAL_17 *, GFC_REAL_17 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_r17 (gfc_array_r17 * const restrict retarray, +extern void matmul_r17 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_r17); @@ -80,11 +80,11 @@ export_proto(matmul_r17); #ifdef HAVE_AVX static void -matmul_r17_avx (gfc_array_r17 * const restrict retarray, +matmul_r17_avx (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_r17_avx (gfc_array_r17 * const restrict retarray, +matmul_r17_avx (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_r17_avx (gfc_array_r17 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_r17_avx2 (gfc_array_r17 * const restrict retarray, +matmul_r17_avx2 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_r17_avx2 (gfc_array_r17 * const restrict retarray, +matmul_r17_avx2 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_r17_avx2 (gfc_array_r17 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_r17_avx512f (gfc_array_r17 * const restrict retarray, +matmul_r17_avx512f (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_r17_avx512f (gfc_array_r17 * const restrict retarray, +matmul_r17_avx512f (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_r17_avx512f (gfc_array_r17 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray, +matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_r17_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_r17_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray, +matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_r17_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_r17_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. 
*/ static void -matmul_r17_vanilla (gfc_array_r17 * const restrict retarray, +matmul_r17_vanilla (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_r17_vanilla (gfc_array_r17 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. */ -void matmul_r17 (gfc_array_r17 * const restrict retarray, +void matmul_r17 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_r17 * const restrict retarray, + static void (*matmul_p) (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_r17 * const restrict retarray, + void (*matmul_fn) (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_r17 (gfc_array_r17 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_r17 (gfc_array_r17 * const restrict retarray, +matmul_r17 (gfc_array_r17 * const restrict retarray, gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c index f1df57749c2..8117af34edd 100644 --- a/libgfortran/generated/matmul_r4.c +++ b/libgfortran/generated/matmul_r4.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_REAL_4 *, const GFC_REAL_4 *, - const int *, const GFC_REAL_4 *, const int *, - const GFC_REAL_4 *, GFC_REAL_4 *, const int *, - int, int); + const int *, const GFC_REAL_4 *, const GFC_REAL_4 *, + const int *, const GFC_REAL_4 *, const int *, + const GFC_REAL_4 *, GFC_REAL_4 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_r4 (gfc_array_r4 * const restrict retarray, +extern void matmul_r4 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_r4); @@ -80,11 +80,11 @@ export_proto(matmul_r4); #ifdef HAVE_AVX static void -matmul_r4_avx (gfc_array_r4 * const restrict retarray, +matmul_r4_avx (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_r4_avx (gfc_array_r4 * const restrict retarray, +matmul_r4_avx (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_r4_avx (gfc_array_r4 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, +matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, +matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, +matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, +matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray, +matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_r4_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_r4_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray, +matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_r4_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_r4_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_r4_vanilla (gfc_array_r4 * const restrict retarray, +matmul_r4_vanilla (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_r4_vanilla (gfc_array_r4 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_r4 (gfc_array_r4 * const restrict retarray, +void matmul_r4 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_r4 * const restrict retarray, + static void (*matmul_p) (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_r4 * const restrict retarray, + void (*matmul_fn) (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_r4 (gfc_array_r4 * const restrict retarray, +matmul_r4 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c index ddfe0a72f77..d05dede27b2 100644 --- a/libgfortran/generated/matmul_r8.c +++ b/libgfortran/generated/matmul_r8.c @@ -35,10 +35,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see matrices. */ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const GFC_REAL_8 *, const GFC_REAL_8 *, - const int *, const GFC_REAL_8 *, const int *, - const GFC_REAL_8 *, GFC_REAL_8 *, const int *, - int, int); + const int *, const GFC_REAL_8 *, const GFC_REAL_8 *, + const int *, const GFC_REAL_8 *, const int *, + const GFC_REAL_8 *, GFC_REAL_8 *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -69,7 +69,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. 
*/ -extern void matmul_r8 (gfc_array_r8 * const restrict retarray, +extern void matmul_r8 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_r8); @@ -80,11 +80,11 @@ export_proto(matmul_r8); #ifdef HAVE_AVX static void -matmul_r8_avx (gfc_array_r8 * const restrict retarray, +matmul_r8_avx (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static void -matmul_r8_avx (gfc_array_r8 * const restrict retarray, +matmul_r8_avx (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -649,11 +649,11 @@ matmul_r8_avx (gfc_array_r8 * const restrict retarray, #ifdef HAVE_AVX2 static void -matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, +matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void -matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, +matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1218,11 +1218,11 @@ matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, #ifdef HAVE_AVX512F static void -matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, +matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static void -matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, +matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -1789,7 +1789,7 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) void -matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray, +matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto(matmul_r8_avx128_fma3); @@ -1797,7 +1797,7 @@ internal_proto(matmul_r8_avx128_fma3); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) void -matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray, +matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto(matmul_r8_avx128_fma4); @@ -1805,7 +1805,7 @@ internal_proto(matmul_r8_avx128_fma4); /* Function to fall back to if there is no special processor-specific version. */ static void -matmul_r8_vanilla (gfc_array_r8 * const restrict retarray, +matmul_r8_vanilla (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { @@ -2371,15 +2371,15 @@ matmul_r8_vanilla (gfc_array_r8 * const restrict retarray, /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_r8 (gfc_array_r8 * const restrict retarray, +void matmul_r8 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) (gfc_array_r8 * const restrict retarray, + static void (*matmul_p) (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) (gfc_array_r8 * const restrict retarray, + void (*matmul_fn) (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm); @@ -2447,7 +2447,7 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray, #else /* Just the vanilla function. */ void -matmul_r8 (gfc_array_r8 * const restrict retarray, +matmul_r8 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, int blas_limit, blas_call gemm) { diff --git a/libgfortran/libgfortran.h b/libgfortran/libgfortran.h index faf57a33358..aaa9222c43b 100644 --- a/libgfortran/libgfortran.h +++ b/libgfortran/libgfortran.h @@ -403,6 +403,13 @@ typedef GFC_ARRAY_DESCRIPTOR (index_type) gfc_array_index_type; #ifdef HAVE_GFC_INTEGER_16 typedef GFC_ARRAY_DESCRIPTOR (GFC_INTEGER_16) gfc_array_i16; #endif +typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_1) gfc_array_m1; +typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_2) gfc_array_m2; +typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_4) gfc_array_m4; +typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_8) gfc_array_m8; +#ifdef HAVE_GFC_UINTEGER_16 +typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_16) gfc_array_m16; +#endif typedef GFC_ARRAY_DESCRIPTOR (GFC_REAL_4) gfc_array_r4; typedef GFC_ARRAY_DESCRIPTOR (GFC_REAL_8) gfc_array_r8; #ifdef HAVE_GFC_REAL_10 diff --git a/libgfortran/m4/iparm.m4 b/libgfortran/m4/iparm.m4 index b474620424b..0c4c76c2428 100644 --- a/libgfortran/m4/iparm.m4 +++ b/libgfortran/m4/iparm.m4 @@ -4,7 +4,7 @@ dnl This file is part of the GNU Fortran 95 Runtime Library (libgfortran) dnl Distributed under the GNU GPL with exception. See COPYING for details. dnl M4 macro file to get type names from filenames define(get_typename2, `GFC_$1_$2')dnl -define(get_typename, `get_typename2(ifelse($1,i,INTEGER,ifelse($1,r,REAL,ifelse($1,l,LOGICAL,ifelse($1,c,COMPLEX,ifelse($1,s,UINTEGER,unknown))))),`$2')')dnl +define(get_typename, `get_typename2(ifelse($1,i,INTEGER,ifelse($1,r,REAL,ifelse($1,l,LOGICAL,ifelse($1,c,COMPLEX,ifelse($1,m,UINTEGER,ifelse($1,s,UINTEGER,unknown)))))),`$2')')dnl define(get_arraytype, `gfc_array_$1$2')dnl define(define_type, `dnl ifelse(regexp($2,`^[0-9]'),-1,`dnl diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4 index 7fc1f5fa75f..cd804e8be06 100644 --- a/libgfortran/m4/matmul.m4 +++ b/libgfortran/m4/matmul.m4 @@ -28,6 +28,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include ' include(iparm.m4)dnl +ifelse(index(rtype_name,`GFC_INTEGER'),`0',dnl +define(`rtype_name',patsubst(rtype_name,`GFC_INTEGER',`GFC_UINTEGER'))dnl +define(`rtype',patsubst(rtype,`gfc_array_i',`gfc_array_m')))dnl `#if defined (HAVE_'rtype_name`) @@ -36,10 +39,10 @@ include(iparm.m4)dnl matrices. 
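
   Tying the three build-system hunks above together: iparm.m4 now maps
   the type letter m (alongside the pre-existing s) to UINTEGER, and
   matmul.m4 rewrites rtype_name from GFC_INTEGER to GFC_UINTEGER and
   rtype from gfc_array_i to gfc_array_m before expanding the template,
   using the gfc_array_m* descriptor typedefs newly added to
   libgfortran.h.  The net effect, visible in the regenerated
   matmul_i8.c hunks, is that the entry point keeps its INTEGER name and
   ABI slot but computes in unsigned arithmetic:

     // what the regenerated file now declares (taken from the hunks above)
     extern void matmul_i8 (gfc_array_m8 * const restrict retarray,
            gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b,
            int try_blas, int blas_limit, blas_call gemm);
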
*/ typedef void (*blas_call)(const char *, const char *, const int *, const int *, - const int *, const 'rtype_name` *, const 'rtype_name` *, - const int *, const 'rtype_name` *, const int *, - const 'rtype_name` *, 'rtype_name` *, const int *, - int, int); + const int *, const 'rtype_name` *, const 'rtype_name` *, + const int *, const 'rtype_name` *, const int *, + const 'rtype_name` *, 'rtype_name` *, const int *, + int, int); /* The order of loops is different in the case of plain matrix multiplication C=MATMUL(A,B), and in the frequent special case where @@ -70,7 +73,7 @@ typedef void (*blas_call)(const char *, const char *, const int *, const int *, see if there is a way to perform the matrix multiplication by a call to the BLAS gemm function. */ -extern void matmul_'rtype_code` ('rtype` * const restrict retarray, +extern void matmul_'rtype_code` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm); export_proto(matmul_'rtype_code`); @@ -82,7 +85,7 @@ export_proto(matmul_'rtype_code`); #ifdef HAVE_AVX 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl `static void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); static' include(matmul_internal.m4)dnl @@ -91,7 +94,7 @@ static' include(matmul_internal.m4)dnl #ifdef HAVE_AVX2 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl `static void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static' include(matmul_internal.m4)dnl @@ -100,7 +103,7 @@ static' include(matmul_internal.m4)dnl #ifdef HAVE_AVX512F 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl `static void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); static' include(matmul_internal.m4)dnl @@ -111,7 +114,7 @@ static' include(matmul_internal.m4)dnl #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128) 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl `void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma"))); internal_proto('matmul_name`); @@ -120,7 +123,7 @@ internal_proto('matmul_name`); #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128) 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl `void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4"))); internal_proto('matmul_name`); @@ -134,15 +137,15 @@ internal_proto('matmul_name`); /* Currently, this is i386 only. Adjust for other architectures. 
*/ -void matmul_'rtype_code` ('rtype` * const restrict retarray, +void matmul_'rtype_code` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) { - static void (*matmul_p) ('rtype` * const restrict retarray, + static void (*matmul_p) ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm); - void (*matmul_fn) ('rtype` * const restrict retarray, + void (*matmul_fn) ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm); diff --git a/libgfortran/m4/matmul_internal.m4 b/libgfortran/m4/matmul_internal.m4 index 0e96207a0fc..20b1a486a4a 100644 --- a/libgfortran/m4/matmul_internal.m4 +++ b/libgfortran/m4/matmul_internal.m4 @@ -1,5 +1,5 @@ `void -'matmul_name` ('rtype` * const restrict retarray, +'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, int blas_limit, blas_call gemm) {
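
   matmul_internal.m4 supplies the shared kernel body: each ISA variant
   in matmul.m4 defines matmul_name and then includes this template, so
   for rtype_code i8 the header above expands to exactly what the
   regenerated matmul_i8.c shows, e.g. for the AVX2 clone:

     // expansion of the template header with rtype = gfc_array_m8
     static void
     matmul_i8_avx2 (gfc_array_m8 * const restrict retarray,
            gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b,
            int try_blas, int blas_limit, blas_call gemm)
     {
       // ... blocked kernel body as in the generated file ...
     }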