case BT_UNSIGNED:
{
if (pedantic)
- return ARITH_UNSIGNED_NEGATIVE;
+ return check_result (ARITH_UNSIGNED_NEGATIVE, op1, result, resultp);
- arith neg_rc;
mpz_neg (result->value.integer, op1->value.integer);
- neg_rc = gfc_range_check (result);
- if (neg_rc != ARITH_OK)
- gfc_warning (0, gfc_arith_error (neg_rc), &result->where);
-
- gfc_reduce_unsigned (result);
- if (pedantic)
- rc = neg_rc;
}
break;
}
rc = gfc_range_check (result);
-
+ if (op1->ts.type == BT_UNSIGNED)
+ {
+ if (rc != ARITH_OK)
+ {
+ gfc_warning (0, gfc_arith_error (rc), &op1->where);
+ rc = ARITH_OK;
+ }
+ gfc_reduce_unsigned (result);
+ }
return check_result (rc, op1, result, resultp);
}
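
Note on the semantics above: for UNSIGNED, an out-of-range result only draws
a warning and is then reduced modulo 2**(8*kind) by gfc_reduce_unsigned, so
unsigned arithmetic wraps around.  A minimal sketch (assumes -funsigned; not
part of the patch):

    program wrap
      unsigned :: u
      u = 0u - 1u      ! wraps modulo 2**32
      print *, u       ! prints 4294967295
    end program wrap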
switch (op1->ts.type)
{
case BT_INTEGER:
+ mpz_sub (result->value.integer, op1->value.integer, op2->value.integer);
+ break;
+
case BT_UNSIGNED:
mpz_sub (result->value.integer, op1->value.integer, op2->value.integer);
+ gfc_reduce_unsigned (result);
break;
case BT_REAL:
return false;
break;
+ case BT_UNSIGNED:
+ /* Check comes later. */
+ break;
+
default:
gfc_error ("%qs argument of %qs intrinsic at %L must be numeric "
"or LOGICAL", gfc_current_intrinsic_arg[0]->name,
return false;
}
+ if (gfc_invalid_unsigned_ops (vector_a, vector_b))
+ {
+ gfc_error ("Argument types of %qs intrinsic at %L must match (%s/%s)",
+ gfc_current_intrinsic, &vector_a->where,
+ gfc_typename (&vector_a->ts), gfc_typename (&vector_b->ts));
+ return false;
+ }
+
if (!rank_check (vector_a, 0, 1))
return false;
}
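
Since gfc_invalid_unsigned_ops rejects mixing UNSIGNED with other numeric
types, a mixed-type DOT_PRODUCT (and, in the next hunk, MATMUL) is expected
to be rejected with the "Argument types ... must match" error.  Illustrative
sketch (assumes -funsigned; not part of the patch):

    program mixed
      unsigned :: u(2) = [1u, 2u]
      integer  :: i(2) = [3, 4]
      print *, dot_product(u, i)   ! rejected: argument types must match
    end program mixed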
if ((matrix_a->ts.type == BT_LOGICAL && gfc_numeric_ts (&matrix_b->ts))
- || (gfc_numeric_ts (&matrix_a->ts) && matrix_b->ts.type == BT_LOGICAL))
+ || (gfc_numeric_ts (&matrix_a->ts) && matrix_b->ts.type == BT_LOGICAL)
+ || gfc_invalid_unsigned_ops (matrix_a, matrix_b))
{
gfc_error ("Argument types of %qs intrinsic at %L must match (%s/%s)",
gfc_current_intrinsic, &matrix_a->where,
return p;
}
+/* Get a new expression node that is an unsigned constant. */
+
+gfc_expr *
+gfc_get_unsigned_expr (int kind, locus *where, HOST_WIDE_INT value)
+{
+ gfc_expr *p;
+ p = gfc_get_constant_expr (BT_UNSIGNED, kind,
+ where ? where : &gfc_current_locus);
+ const wide_int w = wi::shwi (value, kind * BITS_PER_UNIT);
+ wi::to_mpz (w, p->value.integer, UNSIGNED);
+
+ return p;
+}
/* Get a new expression node that is a logical constant. */
gfc_expr *gfc_get_constant_expr (bt, int, locus *);
gfc_expr *gfc_get_character_expr (int, locus *, const char *, gfc_charlen_t len);
gfc_expr *gfc_get_int_expr (int, locus *, HOST_WIDE_INT);
+gfc_expr *gfc_get_unsigned_expr (int, locus *, HOST_WIDE_INT);
gfc_expr *gfc_get_logical_expr (int, locus *, bool);
gfc_expr *gfc_get_iokind_expr (locus *, io_kind);
@item @code{MVBITS}
@item @code{RANGE}
@item @code{TRANSFER}
+@item @code{MATMUL} and @code{DOT_PRODUCT}
@end itemize
This list will grow in the near future.
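
An illustrative example (compiled with @option{-funsigned}):

@smallexample
program demo
  unsigned :: u(2), v(2)
  u = [1u, 2u]
  v = [3u, 4u]
  print *, dot_product(u, v)   ! prints 11
end program demo
@end smallexample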
@c ---------------------------------------------------------------------
gfc_resolve_matmul (gfc_expr *f, gfc_expr *a, gfc_expr *b)
{
gfc_expr temp;
+ bt type;
if (a->ts.type == BT_LOGICAL && b->ts.type == BT_LOGICAL)
{
}
}
+ /* We use the same library version of matmul for INTEGER and UNSIGNED;
+ the UNSIGNED case is invoked under its INTEGER name. */
+
+ if (f->ts.type == BT_UNSIGNED)
+ type = BT_INTEGER;
+ else
+ type = f->ts.type;
+
f->value.function.name
- = gfc_get_string (PREFIX ("matmul_%c%d"), gfc_type_letter (f->ts.type),
+ = gfc_get_string (PREFIX ("matmul_%c%d"), gfc_type_letter (type),
gfc_type_abi_kind (&f->ts));
}
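
As resolved above, MATMUL on UNSIGNED arguments reuses the INTEGER library
routine of the same kind, so e.g. a matmul of default-kind unsigned arrays
maps to the matmul_i4 entry point (modulo PREFIX).  Sketch (assumes
-funsigned; not part of the patch):

    program resolve_demo
      unsigned :: a(2,2) = reshape([1u, 2u, 3u, 4u], [2,2])
      unsigned :: c(2,2)
      c = matmul(a, a)   ! resolved to the INTEGER library matmul
    end program resolve_demo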
{
gfc_expr *result, *a, *b, *c;
- /* Set result to an INTEGER(1) 0 for numeric types and .false. for
+ /* Set result to an UNSIGNED 0 of the correct kind for unsigned,
+ an INTEGER(1) 0 for other numeric types, and .false. for
LOGICAL. Mixed-mode math in the loop will promote result to the
correct type and kind. */
if (matrix_a->ts.type == BT_LOGICAL)
result = gfc_get_logical_expr (gfc_default_logical_kind, NULL, false);
+ else if (matrix_a->ts.type == BT_UNSIGNED)
+ {
+ int kind = MAX (matrix_a->ts.kind, matrix_b->ts.kind);
+ result = gfc_get_unsigned_expr (kind, NULL, 0);
+ }
else
result = gfc_get_int_expr (1, NULL, 0);
+
result->where = matrix_a->where;
a = gfc_constructor_lookup_expr (matrix_a->value.constructor, offset_a);
case BT_INTEGER:
case BT_REAL:
case BT_COMPLEX:
+ case BT_UNSIGNED:
if (conj_a && a->ts.type == BT_COMPLEX)
c = gfc_simplify_conjg (a);
else
--- /dev/null
+! { dg-do run }
+! { dg-options "-funsigned" }
+! Test unsigned matrix multiplication at run time and at compile time
+program memain
+ implicit none
+ call test1
+ call test2
+contains
+ subroutine test1
+ integer, parameter :: n = 10, m = 28
+ unsigned, dimension(n,n) :: u, v, w
+ integer(kind=8), dimension(n,n) :: i, j, k
+ real(8), dimension(n,n) :: a, b
+
+ call random_number(a)
+ call random_number(b)
+ u = uint(a*2.0**m)
+ v = uint(b*2.0**m)
+ i = int(a*2.0**m,8)
+ j = int(b*2.0**m,8)
+ w = matmul(u,v)
+ k = mod(matmul(i,j),2_8**32)
+ if (any(uint(k) /= w)) error stop 1
+ end subroutine test1
+ subroutine test2
+ unsigned, parameter :: u(3,3) = reshape ([1u, uint(-2), 3u, uint(-4), &
+ 5u, uint(-6), 7u, uint(-8), 9u],[3,3])
+ unsigned, parameter :: v(3,3) = 1u - u
+ unsigned, parameter :: w(3,3) = matmul(u,v)
+ integer(kind=16), dimension(3,3), parameter :: &
+ i = int(u,16), j = int(v,16)
+ integer(kind=16), dimension(3,3) :: k = matmul(i,j)
+ if (any(uint(k) /= w)) error stop 2
+ end subroutine test2
+end program memain
--- /dev/null
+! { dg-do run }
+! { dg-options "-funsigned" }
+! Test unsigned DOT_PRODUCT both at run time and at compile time
+program memain
+ call test1
+ call test2
+contains
+ subroutine test1
+ integer, parameter :: n = 10
+ real(8), dimension(n) :: a, b
+ unsigned, dimension(n) :: u, v
+ integer(8), dimension(n) :: i, j
+ unsigned :: res_u
+ integer(8) :: res_i
+ integer :: k
+
+ do k=1,10
+ call random_number(a)
+ call random_number(b)
+    u = uint(a*2.0d0**29) ! scaled so products and sums fit in integer(8)
+    v = uint(b*2.0d0**29)
+ i = int(u,8)
+ j = int(v,8)
+ res_u = dot_product(u,v)
+ res_i = dot_product(i,j)
+ if (res_u /= uint(res_i)) error stop 1
+ end do
+ end subroutine test1
+
+ subroutine test2
+ integer, parameter :: n = 5
+ unsigned, parameter, dimension(n) :: &
+ u = [1149221887u, 214388752u, 724301838u, 1618160523u, 1783282425u], &
+ v = [1428464973u, 1887264271u, 1830319906u, 983537781u, 13514400u]
+ integer(8), parameter, dimension(n) :: i = int(u,8), j=int(v,8)
+ unsigned, parameter :: res_1 = dot_product(u,v)
+ integer(8), parameter :: res_2 = dot_product(i,j)
+ if (res_1 /= uint(res_2)) error stop 2
+ end subroutine test2
+end program
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
- const int *, const GFC_COMPLEX_10 *, const int *,
- const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
- int, int);
+ const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
+ const int *, const GFC_COMPLEX_10 *, const int *,
+ const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
+extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
#ifdef HAVE_AVX
static void
-matmul_c10_avx (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_c10_avx (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c10_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
+matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c10_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
+matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_c10 (gfc_array_c10 * const restrict retarray,
+void matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_c10 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_c10 (gfc_array_c10 * const restrict retarray,
+matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
- const int *, const GFC_COMPLEX_16 *, const int *,
- const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
- int, int);
+ const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
+ const int *, const GFC_COMPLEX_16 *, const int *,
+ const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
+extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
#ifdef HAVE_AVX
static void
-matmul_c16_avx (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_c16_avx (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c16_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
+matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c16_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
+matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_c16 (gfc_array_c16 * const restrict retarray,
+void matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_c16 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_c16 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_c16 (gfc_array_c16 * const restrict retarray,
+matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_COMPLEX_17 *, const GFC_COMPLEX_17 *,
- const int *, const GFC_COMPLEX_17 *, const int *,
- const GFC_COMPLEX_17 *, GFC_COMPLEX_17 *, const int *,
- int, int);
+ const int *, const GFC_COMPLEX_17 *, const GFC_COMPLEX_17 *,
+ const int *, const GFC_COMPLEX_17 *, const int *,
+ const GFC_COMPLEX_17 *, GFC_COMPLEX_17 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_c17 (gfc_array_c17 * const restrict retarray,
+extern void matmul_c17 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c17);
#ifdef HAVE_AVX
static void
-matmul_c17_avx (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_c17_avx (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_c17_avx2 (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx2 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_c17_avx2 (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx2 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_c17_avx512f (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx512f (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_c17_avx512f (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx512f (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_c17_avx128_fma3 (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx128_fma3 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c17_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_c17_avx128_fma4 (gfc_array_c17 * const restrict retarray,
+matmul_c17_avx128_fma4 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c17_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_c17_vanilla (gfc_array_c17 * const restrict retarray,
+matmul_c17_vanilla (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_c17 (gfc_array_c17 * const restrict retarray,
+void matmul_c17 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_c17 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_c17 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_c17 (gfc_array_c17 * const restrict retarray,
+matmul_c17 (gfc_array_c17 * const restrict retarray,
gfc_array_c17 * const restrict a, gfc_array_c17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
- const int *, const GFC_COMPLEX_4 *, const int *,
- const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
- int, int);
+ const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
+ const int *, const GFC_COMPLEX_4 *, const int *,
+ const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
+extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
#ifdef HAVE_AVX
static void
-matmul_c4_avx (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_c4_avx (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c4_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
+matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c4_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
+matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_c4 (gfc_array_c4 * const restrict retarray,
+void matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_c4 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_c4 (gfc_array_c4 * const restrict retarray,
+matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
- const int *, const GFC_COMPLEX_8 *, const int *,
- const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
- int, int);
+ const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
+ const int *, const GFC_COMPLEX_8 *, const int *,
+ const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
+extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
#ifdef HAVE_AVX
static void
-matmul_c8_avx (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_c8_avx (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c8_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
+matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c8_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
+matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_c8 (gfc_array_c8 * const restrict retarray,
+void matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_c8 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_c8 (gfc_array_c8 * const restrict retarray,
+matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#include <assert.h>
-#if defined (HAVE_GFC_INTEGER_1)
+#if defined (HAVE_GFC_UINTEGER_1)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_INTEGER_1 *, const GFC_INTEGER_1 *,
- const int *, const GFC_INTEGER_1 *, const int *,
- const GFC_INTEGER_1 *, GFC_INTEGER_1 *, const int *,
- int, int);
+ const int *, const GFC_UINTEGER_1 *, const GFC_UINTEGER_1 *,
+ const int *, const GFC_UINTEGER_1 *, const int *,
+ const GFC_UINTEGER_1 *, GFC_UINTEGER_1 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_i1 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+extern void matmul_i1 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i1);
#ifdef HAVE_AVX
static void
-matmul_i1_avx (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_i1_avx (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_1 * restrict abase;
- const GFC_INTEGER_1 * restrict bbase;
- GFC_INTEGER_1 * restrict dest;
+ const GFC_UINTEGER_1 * restrict abase;
+ const GFC_UINTEGER_1 * restrict bbase;
+ GFC_UINTEGER_1 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_1 one = 1, zero = 0;
+ const GFC_UINTEGER_1 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_1 *a, *b;
- GFC_INTEGER_1 *c;
+ const GFC_UINTEGER_1 *a, *b;
+ GFC_UINTEGER_1 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_1 *t1;
+ GFC_UINTEGER_1 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_1)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX2
static void
-matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx2 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx2 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_1 * restrict abase;
- const GFC_INTEGER_1 * restrict bbase;
- GFC_INTEGER_1 * restrict dest;
+ const GFC_UINTEGER_1 * restrict abase;
+ const GFC_UINTEGER_1 * restrict bbase;
+ GFC_UINTEGER_1 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_1 one = 1, zero = 0;
+ const GFC_UINTEGER_1 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_1 *a, *b;
- GFC_INTEGER_1 *c;
+ const GFC_UINTEGER_1 *a, *b;
+ GFC_UINTEGER_1 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_1 *t1;
+ GFC_UINTEGER_1 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_1)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX512F
static void
-matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx512f (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx512f (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_1 * restrict abase;
- const GFC_INTEGER_1 * restrict bbase;
- GFC_INTEGER_1 * restrict dest;
+ const GFC_UINTEGER_1 * restrict abase;
+ const GFC_UINTEGER_1 * restrict bbase;
+ GFC_UINTEGER_1 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_1 one = 1, zero = 0;
+ const GFC_UINTEGER_1 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_1 *a, *b;
- GFC_INTEGER_1 *c;
+ const GFC_UINTEGER_1 *a, *b;
+ GFC_UINTEGER_1 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_1 *t1;
+ GFC_UINTEGER_1 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_1)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx128_fma3 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_avx128_fma4 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1_vanilla (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_1 * restrict abase;
- const GFC_INTEGER_1 * restrict bbase;
- GFC_INTEGER_1 * restrict dest;
+ const GFC_UINTEGER_1 * restrict abase;
+ const GFC_UINTEGER_1 * restrict bbase;
+ GFC_UINTEGER_1 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_1 one = 1, zero = 0;
+ const GFC_UINTEGER_1 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_1 *a, *b;
- GFC_INTEGER_1 *c;
+ const GFC_UINTEGER_1 *a, *b;
+ GFC_UINTEGER_1 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_1 *t1;
+ GFC_UINTEGER_1 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_1)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_i1 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+void matmul_i1 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ static void (*matmul_p) (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ void (*matmul_fn) (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
#else /* Just the vanilla function. */
void
-matmul_i1 (gfc_array_i1 * const restrict retarray,
- gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+matmul_i1 (gfc_array_m1 * const restrict retarray,
+ gfc_array_m1 * const restrict a, gfc_array_m1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_1 * restrict abase;
- const GFC_INTEGER_1 * restrict bbase;
- GFC_INTEGER_1 * restrict dest;
+ const GFC_UINTEGER_1 * restrict abase;
+ const GFC_UINTEGER_1 * restrict bbase;
+ GFC_UINTEGER_1 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_1));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_1 one = 1, zero = 0;
+ const GFC_UINTEGER_1 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_1 *a, *b;
- GFC_INTEGER_1 *c;
+ const GFC_UINTEGER_1 *a, *b;
+ GFC_UINTEGER_1 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_1 *t1;
+ GFC_UINTEGER_1 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_1)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_1)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_1));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_1));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_1)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_1 *restrict abase_x;
- const GFC_INTEGER_1 *restrict bbase_y;
- GFC_INTEGER_1 *restrict dest_y;
- GFC_INTEGER_1 s;
+ const GFC_UINTEGER_1 *restrict abase_x;
+ const GFC_UINTEGER_1 *restrict bbase_y;
+ GFC_UINTEGER_1 *restrict dest_y;
+ GFC_UINTEGER_1 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_1) 0;
+ s = (GFC_UINTEGER_1) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#include <assert.h>
-#if defined (HAVE_GFC_INTEGER_16)
+#if defined (HAVE_GFC_UINTEGER_16)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
- const int *, const GFC_INTEGER_16 *, const int *,
- const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
- int, int);
+ const int *, const GFC_UINTEGER_16 *, const GFC_UINTEGER_16 *,
+ const int *, const GFC_UINTEGER_16 *, const int *,
+ const GFC_UINTEGER_16 *, GFC_UINTEGER_16 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+extern void matmul_i16 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
#ifdef HAVE_AVX
static void
-matmul_i16_avx (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_i16_avx (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_16 * restrict abase;
- const GFC_INTEGER_16 * restrict bbase;
- GFC_INTEGER_16 * restrict dest;
+ const GFC_UINTEGER_16 * restrict abase;
+ const GFC_UINTEGER_16 * restrict bbase;
+ GFC_UINTEGER_16 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_16 one = 1, zero = 0;
+ const GFC_UINTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_16 *a, *b;
- GFC_INTEGER_16 *c;
+ const GFC_UINTEGER_16 *a, *b;
+ GFC_UINTEGER_16 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_16 *t1;
+ GFC_UINTEGER_16 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_16)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16));
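
The t1 scratch buffer, capped at 65536 elements above, holds copied blocks of A so the innermost loops of the netlib-derived kernel run out of cache. As an illustrative sketch of the underlying blocking idea only (hypothetical helper; the real kernel additionally copies through t1 and unrolls by four):

  #define BS 64                 /* illustrative block size */

  static void
  blocked_matmul (unsigned *c, const unsigned *a, const unsigned *b,
                  int m, int n, int k)
  {
    /* C (m x n) += A (m x k) * B (k x n), row-major.  Working on
       BS x BS tiles keeps each tile resident in cache while it is
       reused across the loop over l.  */
    for (int ii = 0; ii < m; ii += BS)
      for (int ll = 0; ll < k; ll += BS)
        for (int jj = 0; jj < n; jj += BS)
          for (int i = ii; i < m && i < ii + BS; i++)
            for (int l = ll; l < k && l < ll + BS; l++)
              for (int j = jj; j < n && j < jj + BS; j++)
                c[i * n + j] += a[i * k + l] * b[l * n + j];
  }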
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX2
static void
-matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx2 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx2 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_16 * restrict abase;
- const GFC_INTEGER_16 * restrict bbase;
- GFC_INTEGER_16 * restrict dest;
+ const GFC_UINTEGER_16 * restrict abase;
+ const GFC_UINTEGER_16 * restrict bbase;
+ GFC_UINTEGER_16 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_16 one = 1, zero = 0;
+ const GFC_UINTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_16 *a, *b;
- GFC_INTEGER_16 *c;
+ const GFC_UINTEGER_16 *a, *b;
+ GFC_UINTEGER_16 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_16 *t1;
+ GFC_UINTEGER_16 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_16)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX512F
static void
-matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx512f (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx512f (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_16 * restrict abase;
- const GFC_INTEGER_16 * restrict bbase;
- GFC_INTEGER_16 * restrict dest;
+ const GFC_UINTEGER_16 * restrict abase;
+ const GFC_UINTEGER_16 * restrict bbase;
+ GFC_UINTEGER_16 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_16 one = 1, zero = 0;
+ const GFC_UINTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_16 *a, *b;
- GFC_INTEGER_16 *c;
+ const GFC_UINTEGER_16 *a, *b;
+ GFC_UINTEGER_16 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_16 *t1;
+ GFC_UINTEGER_16 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_16)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx128_fma3 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_avx128_fma4 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i16_avx128_fma4);
#endif
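
The avx128 variants above are only declared here (internal_proto); their definitions presumably live in a separately compiled unit so each can be built with its own target options. The pattern rests on GCC's target attribute, which enables an ISA extension for a single function at a time. A minimal sketch with hypothetical names:

  /* The attribute lets the compiler use AVX2/FMA instructions inside
     this one function; the rest of the file is compiled for the
     baseline ISA.  */
  __attribute__((__target__("avx2,fma")))
  static unsigned
  dot_avx2 (const unsigned *a, const unsigned *b, int n)
  {
    unsigned s = 0;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }

  static unsigned
  dot_vanilla (const unsigned *a, const unsigned *b, int n)
  {
    unsigned s = 0;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }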
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16_vanilla (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_16 * restrict abase;
- const GFC_INTEGER_16 * restrict bbase;
- GFC_INTEGER_16 * restrict dest;
+ const GFC_UINTEGER_16 * restrict abase;
+ const GFC_UINTEGER_16 * restrict bbase;
+ GFC_UINTEGER_16 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_16 one = 1, zero = 0;
+ const GFC_UINTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_16 *a, *b;
- GFC_INTEGER_16 *c;
+ const GFC_UINTEGER_16 *a, *b;
+ GFC_UINTEGER_16 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_16 *t1;
+ GFC_UINTEGER_16 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_16)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_i16 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+void matmul_i16 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ static void (*matmul_p) (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ void (*matmul_fn) (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
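
The static matmul_p pointer caches the result of the CPU probe so selection effectively runs once per process; relaxed atomics are enough because a thread that races the first call merely repeats the idempotent probe. The idiom, sketched with hypothetical names:

  static void dot_avx2_impl (void)    { /* ISA-specific variant */ }
  static void dot_vanilla_impl (void) { /* fallback variant */ }

  static void (*dot_p) (void);

  static void
  dot_dispatch (void)
  {
    void (*fn) (void) = __atomic_load_n (&dot_p, __ATOMIC_RELAXED);
    if (fn == NULL)
      {
        /* __builtin_cpu_supports is available with GCC on x86.  */
        if (__builtin_cpu_supports ("avx2"))
          fn = dot_avx2_impl;
        else
          fn = dot_vanilla_impl;
        __atomic_store_n (&dot_p, fn, __ATOMIC_RELAXED);
      }
    fn ();
  }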
#else /* Just the vanilla function. */
void
-matmul_i16 (gfc_array_i16 * const restrict retarray,
- gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+matmul_i16 (gfc_array_m16 * const restrict retarray,
+ gfc_array_m16 * const restrict a, gfc_array_m16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_16 * restrict abase;
- const GFC_INTEGER_16 * restrict bbase;
- GFC_INTEGER_16 * restrict dest;
+ const GFC_UINTEGER_16 * restrict abase;
+ const GFC_UINTEGER_16 * restrict bbase;
+ GFC_UINTEGER_16 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_16));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_16 one = 1, zero = 0;
+ const GFC_UINTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_16 *a, *b;
- GFC_INTEGER_16 *c;
+ const GFC_UINTEGER_16 *a, *b;
+ GFC_UINTEGER_16 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_16 *t1;
+ GFC_UINTEGER_16 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_16)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_16)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_16));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_16));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_16)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_16 *restrict abase_x;
- const GFC_INTEGER_16 *restrict bbase_y;
- GFC_INTEGER_16 *restrict dest_y;
- GFC_INTEGER_16 s;
+ const GFC_UINTEGER_16 *restrict abase_x;
+ const GFC_UINTEGER_16 *restrict bbase_y;
+ GFC_UINTEGER_16 *restrict dest_y;
+ GFC_UINTEGER_16 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_16) 0;
+ s = (GFC_UINTEGER_16) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
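
Everything from here down repeats the same mechanical substitution for the kind-2 and then kind-4 files; only the type and kind suffix change. As a quick standalone reference for the arithmetic the unsigned kernels must produce (hypothetical check, not part of the patch; column-major like the library, with products widened before the modulo-2**16 accumulation):

  #include <stdio.h>
  #include <stdint.h>

  int
  main (void)
  {
    uint16_t a[4] = { 60000, 2, 3, 4 };  /* 2x2, column-major */
    uint16_t b[4] = { 5, 6, 7, 8 };
    uint16_t c[4];
    for (int y = 0; y < 2; y++)          /* column of C */
      for (int x = 0; x < 2; x++)        /* row of C */
        {
          uint16_t s = 0;
          for (int n = 0; n < 2; n++)
            /* Widen first so the product cannot overflow int; the
               assignment back to s reduces modulo 2**16.  */
            s += (uint32_t) a[x + 2*n] * b[n + 2*y];
          c[x + 2*y] = s;
        }
    printf ("%u %u %u %u\n",
            (unsigned) c[0], (unsigned) c[1],
            (unsigned) c[2], (unsigned) c[3]);
    return 0;
  }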
#include <assert.h>
-#if defined (HAVE_GFC_INTEGER_2)
+#if defined (HAVE_GFC_UINTEGER_2)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_INTEGER_2 *, const GFC_INTEGER_2 *,
- const int *, const GFC_INTEGER_2 *, const int *,
- const GFC_INTEGER_2 *, GFC_INTEGER_2 *, const int *,
- int, int);
+ const int *, const GFC_UINTEGER_2 *, const GFC_UINTEGER_2 *,
+ const int *, const GFC_UINTEGER_2 *, const int *,
+ const GFC_UINTEGER_2 *, GFC_UINTEGER_2 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+extern void matmul_i2 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i2);
#ifdef HAVE_AVX
static void
-matmul_i2_avx (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_i2_avx (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_2 * restrict abase;
- const GFC_INTEGER_2 * restrict bbase;
- GFC_INTEGER_2 * restrict dest;
+ const GFC_UINTEGER_2 * restrict abase;
+ const GFC_UINTEGER_2 * restrict bbase;
+ GFC_UINTEGER_2 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_2 one = 1, zero = 0;
+ const GFC_UINTEGER_2 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_2 *a, *b;
- GFC_INTEGER_2 *c;
+ const GFC_UINTEGER_2 *a, *b;
+ GFC_UINTEGER_2 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_2 *t1;
+ GFC_UINTEGER_2 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_2)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX2
static void
-matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx2 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx2 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_2 * restrict abase;
- const GFC_INTEGER_2 * restrict bbase;
- GFC_INTEGER_2 * restrict dest;
+ const GFC_UINTEGER_2 * restrict abase;
+ const GFC_UINTEGER_2 * restrict bbase;
+ GFC_UINTEGER_2 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_2 one = 1, zero = 0;
+ const GFC_UINTEGER_2 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_2 *a, *b;
- GFC_INTEGER_2 *c;
+ const GFC_UINTEGER_2 *a, *b;
+ GFC_UINTEGER_2 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_2 *t1;
+ GFC_UINTEGER_2 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_2)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX512F
static void
-matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx512f (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx512f (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_2 * restrict abase;
- const GFC_INTEGER_2 * restrict bbase;
- GFC_INTEGER_2 * restrict dest;
+ const GFC_UINTEGER_2 * restrict abase;
+ const GFC_UINTEGER_2 * restrict bbase;
+ GFC_UINTEGER_2 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_2 one = 1, zero = 0;
+ const GFC_UINTEGER_2 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_2 *a, *b;
- GFC_INTEGER_2 *c;
+ const GFC_UINTEGER_2 *a, *b;
+ GFC_UINTEGER_2 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_2 *t1;
+ GFC_UINTEGER_2 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_2)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx128_fma3 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i2_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_avx128_fma4 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i2_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2_vanilla (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_2 * restrict abase;
- const GFC_INTEGER_2 * restrict bbase;
- GFC_INTEGER_2 * restrict dest;
+ const GFC_UINTEGER_2 * restrict abase;
+ const GFC_UINTEGER_2 * restrict bbase;
+ GFC_UINTEGER_2 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_2 one = 1, zero = 0;
+ const GFC_UINTEGER_2 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_2 *a, *b;
- GFC_INTEGER_2 *c;
+ const GFC_UINTEGER_2 *a, *b;
+ GFC_UINTEGER_2 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_2 *t1;
+ GFC_UINTEGER_2 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_2)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_i2 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+void matmul_i2 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ static void (*matmul_p) (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ void (*matmul_fn) (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
#else /* Just the vanilla function. */
void
-matmul_i2 (gfc_array_i2 * const restrict retarray,
- gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+matmul_i2 (gfc_array_m2 * const restrict retarray,
+ gfc_array_m2 * const restrict a, gfc_array_m2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_2 * restrict abase;
- const GFC_INTEGER_2 * restrict bbase;
- GFC_INTEGER_2 * restrict dest;
+ const GFC_UINTEGER_2 * restrict abase;
+ const GFC_UINTEGER_2 * restrict bbase;
+ GFC_UINTEGER_2 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_2));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_2 one = 1, zero = 0;
+ const GFC_UINTEGER_2 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_2 *a, *b;
- GFC_INTEGER_2 *c;
+ const GFC_UINTEGER_2 *a, *b;
+ GFC_UINTEGER_2 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_2 *t1;
+ GFC_UINTEGER_2 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_2)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_2)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_2));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_2));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_2)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_2 *restrict abase_x;
- const GFC_INTEGER_2 *restrict bbase_y;
- GFC_INTEGER_2 *restrict dest_y;
- GFC_INTEGER_2 s;
+ const GFC_UINTEGER_2 *restrict abase_x;
+ const GFC_UINTEGER_2 *restrict bbase_y;
+ GFC_UINTEGER_2 *restrict dest_y;
+ GFC_UINTEGER_2 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_2) 0;
+ s = (GFC_UINTEGER_2) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#include <assert.h>
-#if defined (HAVE_GFC_INTEGER_4)
+#if defined (HAVE_GFC_UINTEGER_4)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
- const int *, const GFC_INTEGER_4 *, const int *,
- const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
- int, int);
+ const int *, const GFC_UINTEGER_4 *, const GFC_UINTEGER_4 *,
+ const int *, const GFC_UINTEGER_4 *, const int *,
+ const GFC_UINTEGER_4 *, GFC_UINTEGER_4 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+extern void matmul_i4 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
#ifdef HAVE_AVX
static void
-matmul_i4_avx (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_i4_avx (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_4 * restrict abase;
- const GFC_INTEGER_4 * restrict bbase;
- GFC_INTEGER_4 * restrict dest;
+ const GFC_UINTEGER_4 * restrict abase;
+ const GFC_UINTEGER_4 * restrict bbase;
+ GFC_UINTEGER_4 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_4 one = 1, zero = 0;
+ const GFC_UINTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_4 *a, *b;
- GFC_INTEGER_4 *c;
+ const GFC_UINTEGER_4 *a, *b;
+ GFC_UINTEGER_4 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_4 *t1;
+ GFC_UINTEGER_4 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_4)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX2
static void
-matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx2 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx2 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_4 * restrict abase;
- const GFC_INTEGER_4 * restrict bbase;
- GFC_INTEGER_4 * restrict dest;
+ const GFC_UINTEGER_4 * restrict abase;
+ const GFC_UINTEGER_4 * restrict bbase;
+ GFC_UINTEGER_4 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_4 one = 1, zero = 0;
+ const GFC_UINTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_4 *a, *b;
- GFC_INTEGER_4 *c;
+ const GFC_UINTEGER_4 *a, *b;
+ GFC_UINTEGER_4 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_4 *t1;
+ GFC_UINTEGER_4 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_4)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4));
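(A quick size check on the cap: sizeof (GFC_UINTEGER_4) is 4 by definition of the kind, so the packed temporary is bounded as worked out below; the cache remark is a general observation, not a claim from the patch.)

  /* With the cap applied:
     65536 * sizeof (GFC_UINTEGER_4) = 65536 * 4 = 262144 bytes (256 KiB),
     small enough to stay resident in a typical L2 cache.  */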
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX512F
static void
-matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx512f (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx512f (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_4 * restrict abase;
- const GFC_INTEGER_4 * restrict bbase;
- GFC_INTEGER_4 * restrict dest;
+ const GFC_UINTEGER_4 * restrict abase;
+ const GFC_UINTEGER_4 * restrict bbase;
+ GFC_UINTEGER_4 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_4 one = 1, zero = 0;
+ const GFC_UINTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_4 *a, *b;
- GFC_INTEGER_4 *c;
+ const GFC_UINTEGER_4 *a, *b;
+ GFC_UINTEGER_4 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_4 *t1;
+ GFC_UINTEGER_4 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_4)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx128_fma3 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_avx128_fma4 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4_vanilla (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_4 * restrict abase;
- const GFC_INTEGER_4 * restrict bbase;
- GFC_INTEGER_4 * restrict dest;
+ const GFC_UINTEGER_4 * restrict abase;
+ const GFC_UINTEGER_4 * restrict bbase;
+ GFC_UINTEGER_4 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_4 one = 1, zero = 0;
+ const GFC_UINTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_4 *a, *b;
- GFC_INTEGER_4 *c;
+ const GFC_UINTEGER_4 *a, *b;
+ GFC_UINTEGER_4 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_4 *t1;
+ GFC_UINTEGER_4 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_4)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_i4 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+void matmul_i4 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ static void (*matmul_p) (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ void (*matmul_fn) (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
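(The relaxed atomic load above is the fast path of a once-only dispatcher: the first call probes the CPU and caches the best variant in matmul_p, so later calls pay only this load. A condensed sketch of the slow path, assuming the usual __builtin_cpu_supports probes; the generated file walks the variants in order of preference, which this compresses into weakest-first assignments.)

  if (matmul_fn == NULL)
    {
      /* Weakest first, so the strongest supported variant wins.  */
      matmul_fn = matmul_i4_vanilla;
#ifdef HAVE_AVX
      if (__builtin_cpu_supports ("avx"))
	matmul_fn = matmul_i4_avx;
#endif
#ifdef HAVE_AVX2
      if (__builtin_cpu_supports ("avx2")
	  && __builtin_cpu_supports ("fma"))
	matmul_fn = matmul_i4_avx2;
#endif
#ifdef HAVE_AVX512F
      if (__builtin_cpu_supports ("avx512f"))
	matmul_fn = matmul_i4_avx512f;
#endif
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }
  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);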
#else /* Just the vanilla function. */
void
-matmul_i4 (gfc_array_i4 * const restrict retarray,
- gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+matmul_i4 (gfc_array_m4 * const restrict retarray,
+ gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_4 * restrict abase;
- const GFC_INTEGER_4 * restrict bbase;
- GFC_INTEGER_4 * restrict dest;
+ const GFC_UINTEGER_4 * restrict abase;
+ const GFC_UINTEGER_4 * restrict bbase;
+ GFC_UINTEGER_4 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_4));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_4 one = 1, zero = 0;
+ const GFC_UINTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_4 *a, *b;
- GFC_INTEGER_4 *c;
+ const GFC_UINTEGER_4 *a, *b;
+ GFC_UINTEGER_4 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_4 *t1;
+ GFC_UINTEGER_4 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_4)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_4)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_4));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_4));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_4)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_4 *restrict abase_x;
- const GFC_INTEGER_4 *restrict bbase_y;
- GFC_INTEGER_4 *restrict dest_y;
- GFC_INTEGER_4 s;
+ const GFC_UINTEGER_4 *restrict abase_x;
+ const GFC_UINTEGER_4 *restrict bbase_y;
+ GFC_UINTEGER_4 *restrict dest_y;
+ GFC_UINTEGER_4 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_4) 0;
+ s = (GFC_UINTEGER_4) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#include <assert.h>
-#if defined (HAVE_GFC_INTEGER_8)
+#if defined (HAVE_GFC_UINTEGER_8)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
- const int *, const GFC_INTEGER_8 *, const int *,
- const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
- int, int);
+ const int *, const GFC_UINTEGER_8 *, const GFC_UINTEGER_8 *,
+ const int *, const GFC_UINTEGER_8 *, const int *,
+ const GFC_UINTEGER_8 *, GFC_UINTEGER_8 *, const int *,
+ int, int);
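(For reference, a hypothetical callback satisfying this typedef; ugemm8_naive is an illustrative name, not part of the patch or of any real BLAS. It sketches only the no-transpose case, with the column-major layout the Fortran BLAS convention implies; the two trailing ints receive the lengths of the TRANSA/TRANSB strings.)

static void
ugemm8_naive (const char *transa, const char *transb,
	      const int *m, const int *n, const int *k,
	      const GFC_UINTEGER_8 *alpha, const GFC_UINTEGER_8 *a,
	      const int *lda, const GFC_UINTEGER_8 *b, const int *ldb,
	      const GFC_UINTEGER_8 *beta, GFC_UINTEGER_8 *c,
	      const int *ldc, int transa_len, int transb_len)
{
  /* C = alpha*A*B + beta*C, column-major, "N"/"N" case only.  */
  (void) transa; (void) transb; (void) transa_len; (void) transb_len;
  for (int j = 0; j < *n; j++)
    for (int i = 0; i < *m; i++)
      {
	GFC_UINTEGER_8 s = 0;
	for (int l = 0; l < *k; l++)
	  s += a[i + l * *lda] * b[l + j * *ldb];
	c[i + j * *ldc] = *alpha * s + *beta * c[i + j * *ldc];
      }
}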
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
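(The decision itself is a size heuristic: the callback is only worth trying for contiguous operands doing enough work to amortize the call. A sketch of the guard, assuming POW3(x) expands to (x)*(x)*(x), as the stray "> POW3(blas_limit)" context lines above suggest.)

  /* Sketch of the guard in front of the gemm hand-off; the float
     casts keep the triple product from overflowing.  */
  if (try_blas && rxstride == 1
      && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
	  > POW3 (blas_limit)))
    {
      /* ... set up m, n, k, lda, ldb, ldc and call gemm ... */
    }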
-extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+extern void matmul_i8 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
#ifdef HAVE_AVX
static void
-matmul_i8_avx (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_i8_avx (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_8 * restrict abase;
- const GFC_INTEGER_8 * restrict bbase;
- GFC_INTEGER_8 * restrict dest;
+ const GFC_UINTEGER_8 * restrict abase;
+ const GFC_UINTEGER_8 * restrict bbase;
+ GFC_UINTEGER_8 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_8 one = 1, zero = 0;
+ const GFC_UINTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_8 *a, *b;
- GFC_INTEGER_8 *c;
+ const GFC_UINTEGER_8 *a, *b;
+ GFC_UINTEGER_8 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_8 *t1;
+ GFC_UINTEGER_8 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_8)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX2
static void
-matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx2 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx2 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_8 * restrict abase;
- const GFC_INTEGER_8 * restrict bbase;
- GFC_INTEGER_8 * restrict dest;
+ const GFC_UINTEGER_8 * restrict abase;
+ const GFC_UINTEGER_8 * restrict bbase;
+ GFC_UINTEGER_8 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_8 one = 1, zero = 0;
+ const GFC_UINTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_8 *a, *b;
- GFC_INTEGER_8 *c;
+ const GFC_UINTEGER_8 *a, *b;
+ GFC_UINTEGER_8 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_8 *t1;
+ GFC_UINTEGER_8 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_8)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#ifdef HAVE_AVX512F
static void
-matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx512f (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx512f (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_8 * restrict abase;
- const GFC_INTEGER_8 * restrict bbase;
- GFC_INTEGER_8 * restrict dest;
+ const GFC_UINTEGER_8 * restrict abase;
+ const GFC_UINTEGER_8 * restrict bbase;
+ GFC_UINTEGER_8 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_8 one = 1, zero = 0;
+ const GFC_UINTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_8 *a, *b;
- GFC_INTEGER_8 *c;
+ const GFC_UINTEGER_8 *a, *b;
+ GFC_UINTEGER_8 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_8 *t1;
+ GFC_UINTEGER_8 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_8)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx128_fma3 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_avx128_fma4 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8_vanilla (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_8 * restrict abase;
- const GFC_INTEGER_8 * restrict bbase;
- GFC_INTEGER_8 * restrict dest;
+ const GFC_UINTEGER_8 * restrict abase;
+ const GFC_UINTEGER_8 * restrict bbase;
+ GFC_UINTEGER_8 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_8 one = 1, zero = 0;
+ const GFC_UINTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_8 *a, *b;
- GFC_INTEGER_8 *c;
+ const GFC_UINTEGER_8 *a, *b;
+ GFC_UINTEGER_8 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_8 *t1;
+ GFC_UINTEGER_8 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_8)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_i8 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+void matmul_i8 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ static void (*matmul_p) (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ void (*matmul_fn) (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
#else /* Just the vanilla function. */
void
-matmul_i8 (gfc_array_i8 * const restrict retarray,
- gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+matmul_i8 (gfc_array_m8 * const restrict retarray,
+ gfc_array_m8 * const restrict a, gfc_array_m8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- const GFC_INTEGER_8 * restrict abase;
- const GFC_INTEGER_8 * restrict bbase;
- GFC_INTEGER_8 * restrict dest;
+ const GFC_UINTEGER_8 * restrict abase;
+ const GFC_UINTEGER_8 * restrict bbase;
+ GFC_UINTEGER_8 * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
}
retarray->base_addr
- = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8));
+ = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_UINTEGER_8));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
- const GFC_INTEGER_8 one = 1, zero = 0;
+ const GFC_UINTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
from netlib.org, translated to C, and modified for matmul.m4. */
- const GFC_INTEGER_8 *a, *b;
- GFC_INTEGER_8 *c;
+ const GFC_UINTEGER_8 *a, *b;
+ GFC_UINTEGER_8 *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
i1, i2, i3, i4, i5, i6;
/* Local variables */
- GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
+ GFC_UINTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
- GFC_INTEGER_8 *t1;
+ GFC_UINTEGER_8 *t1;
a = abase;
b = bbase;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
- c[i + j * c_dim1] = (GFC_INTEGER_8)0;
+ c[i + j * c_dim1] = (GFC_UINTEGER_8)0;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
if (t1_dim > 65536)
t1_dim = 65536;
- t1 = malloc (t1_dim * sizeof(GFC_INTEGER_8));
+ t1 = malloc (t1_dim * sizeof(GFC_UINTEGER_8));
/* Start turning the crank. */
i1 = n;
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
else
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
- dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0;
+ dest[x*rxstride + y*rystride] = (GFC_UINTEGER_8)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
}
else
{
- const GFC_INTEGER_8 *restrict abase_x;
- const GFC_INTEGER_8 *restrict bbase_y;
- GFC_INTEGER_8 *restrict dest_y;
- GFC_INTEGER_8 s;
+ const GFC_UINTEGER_8 *restrict abase_x;
+ const GFC_UINTEGER_8 *restrict bbase_y;
+ GFC_UINTEGER_8 *restrict dest_y;
+ GFC_UINTEGER_8 s;
for (y = 0; y < ycount; y++)
{
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
- s = (GFC_INTEGER_8) 0;
+ s = (GFC_UINTEGER_8) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
- const int *, const GFC_REAL_10 *, const int *,
- const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
- int, int);
+ const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
+ const int *, const GFC_REAL_10 *, const int *,
+ const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
+extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
#ifdef HAVE_AVX
static void
-matmul_r10_avx (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_r10_avx (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r10_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
+matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r10_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
+matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_r10 (gfc_array_r10 * const restrict retarray,
+void matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_r10 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_r10 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_r10 (gfc_array_r10 * const restrict retarray,
+matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
- const int *, const GFC_REAL_16 *, const int *,
- const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
- int, int);
+ const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
+ const int *, const GFC_REAL_16 *, const int *,
+ const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
+extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
#ifdef HAVE_AVX
static void
-matmul_r16_avx (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_r16_avx (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r16_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
+matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r16_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
+matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_r16 (gfc_array_r16 * const restrict retarray,
+void matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_r16 (gfc_array_r16 * const restrict retarray,
+matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_REAL_17 *, const GFC_REAL_17 *,
- const int *, const GFC_REAL_17 *, const int *,
- const GFC_REAL_17 *, GFC_REAL_17 *, const int *,
- int, int);
+ const int *, const GFC_REAL_17 *, const GFC_REAL_17 *,
+ const int *, const GFC_REAL_17 *, const int *,
+ const GFC_REAL_17 *, GFC_REAL_17 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_r17 (gfc_array_r17 * const restrict retarray,
+extern void matmul_r17 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r17);
#ifdef HAVE_AVX
static void
-matmul_r17_avx (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_r17_avx (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx2 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx512f (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r17_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray,
+matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r17_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_r17_vanilla (gfc_array_r17 * const restrict retarray,
+matmul_r17_vanilla (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_r17 (gfc_array_r17 * const restrict retarray,
+void matmul_r17 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_r17 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_r17 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_r17 (gfc_array_r17 * const restrict retarray,
+matmul_r17 (gfc_array_r17 * const restrict retarray,
gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
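The hunks above touch only whitespace in the declarations, but they all belong to a single idiom worth spelling out: each generated matmul_<type> file declares AVX, AVX2 and AVX512F variants of the same kernel plus a plain-C "vanilla" fallback, and the exported entry point probes the CPU on first call and caches the winner in a static function pointer. A condensed sketch of that wrapper for the r17 flavor, with the AVX128/FMA3/FMA4 special cases and the per-architecture preprocessor guards omitted (the generated file is more elaborate, but the dispatch logic itself is untouched by this patch):

void
matmul_r17 (gfc_array_r17 * const restrict retarray,
	    gfc_array_r17 * const restrict a,
	    gfc_array_r17 * const restrict b,
	    int try_blas, int blas_limit, blas_call gemm)
{
  static void (*matmul_p) (gfc_array_r17 * const restrict,
			   gfc_array_r17 * const restrict,
			   gfc_array_r17 * const restrict,
			   int, int, blas_call);

  void (*matmul_fn) (gfc_array_r17 * const restrict,
		     gfc_array_r17 * const restrict,
		     gfc_array_r17 * const restrict,
		     int, int, blas_call);

  /* First call: probe the CPU once and remember the best variant.  */
  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_r17_vanilla;
      if (__builtin_cpu_supports ("avx512f"))
	matmul_fn = matmul_r17_avx512f;
      else if (__builtin_cpu_supports ("avx2")
	       && __builtin_cpu_supports ("fma"))
	matmul_fn = matmul_r17_avx2;
      else if (__builtin_cpu_supports ("avx"))
	matmul_fn = matmul_r17_avx;
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }
  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}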
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
- const int *, const GFC_REAL_4 *, const int *,
- const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
- int, int);
+ const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
+ const int *, const GFC_REAL_4 *, const int *,
+ const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
+ int, int);
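The blas_call typedef mirrors the classic Fortran-77 gemm binding: transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, followed by two hidden character-length arguments for the two string parameters. As an illustration (gemm_example is hypothetical, and an external F77-style sgemm_ symbol and the libgfortran types are assumed to be in scope), a plain C = A*B call through such a pointer looks like:

extern void sgemm_ (const char *, const char *, const int *, const int *,
		    const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
		    const int *, const GFC_REAL_4 *, const int *,
		    const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
		    int, int);

static void
gemm_example (blas_call gemm, const GFC_REAL_4 *a, const GFC_REAL_4 *b,
	      GFC_REAL_4 *c, int m, int n, int k)
{
  const GFC_REAL_4 one = 1.0f, zero = 0.0f;

  /* C(m,n) = 1.0 * A(m,k) * B(k,n) + 0.0 * C, column-major, no
     transposition; the trailing 1, 1 are the hidden lengths of the
     two "N" strings.  */
  gemm ("N", "N", &m, &n, &k, &one, a, &m, b, &k, &zero, c, &m, 1, 1);
}

In the library proper, try_blas and blas_limit decide whether a kernel takes this path for sufficiently large matrices, per the comment below.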
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
+extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
#ifdef HAVE_AVX
static void
-matmul_r4_avx (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_r4_avx (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
+matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
+matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_r4 (gfc_array_r4 * const restrict retarray,
+void matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_r4 (gfc_array_r4 * const restrict retarray,
+matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
- const int *, const GFC_REAL_8 *, const int *,
- const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
- int, int);
+ const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
+ const int *, const GFC_REAL_8 *, const int *,
+ const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
+extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
#ifdef HAVE_AVX
static void
-matmul_r8_avx (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
-matmul_r8_avx (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX2
static void
-matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
-matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_AVX512F
static void
-matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static void
-matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
-matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r8_avx128_fma3);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
-matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
+matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r8_avx128_fma4);
/* Function to fall back to if there is no special processor-specific version. */
static void
-matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
+matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_r8 (gfc_array_r8 * const restrict retarray,
+void matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) (gfc_array_r8 * const restrict retarray,
+ static void (*matmul_p) (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) (gfc_array_r8 * const restrict retarray,
+ void (*matmul_fn) (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
#else /* Just the vanilla function. */
void
-matmul_r8 (gfc_array_r8 * const restrict retarray,
+matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
#ifdef HAVE_GFC_INTEGER_16
typedef GFC_ARRAY_DESCRIPTOR (GFC_INTEGER_16) gfc_array_i16;
#endif
+typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_1) gfc_array_m1;
+typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_2) gfc_array_m2;
+typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_4) gfc_array_m4;
+typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_8) gfc_array_m8;
+#ifdef HAVE_GFC_UINTEGER_16
+typedef GFC_ARRAY_DESCRIPTOR (GFC_UINTEGER_16) gfc_array_m16;
+#endif
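These new typedefs give the unsigned kinds the same descriptor shape as their INTEGER counterparts: gfc_array_m4 and gfc_array_i4 differ only in the element typedef (GFC_UINTEGER_4 vs. GFC_INTEGER_4). A hypothetical prototype, purely to show the naming convention:

/* Hypothetical, for illustration: an UNSIGNED(4) array argument is
   passed by descriptor exactly like an INTEGER(4) one.  */
extern void example_unsigned_routine (GFC_UINTEGER_4 *result,
				      gfc_array_m4 * const restrict array);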
typedef GFC_ARRAY_DESCRIPTOR (GFC_REAL_4) gfc_array_r4;
typedef GFC_ARRAY_DESCRIPTOR (GFC_REAL_8) gfc_array_r8;
#ifdef HAVE_GFC_REAL_10
dnl Distributed under the GNU GPL with exception. See COPYING for details.
dnl M4 macro file to get type names from filenames
define(get_typename2, `GFC_$1_$2')dnl
-define(get_typename, `get_typename2(ifelse($1,i,INTEGER,ifelse($1,r,REAL,ifelse($1,l,LOGICAL,ifelse($1,c,COMPLEX,ifelse($1,s,UINTEGER,unknown))))),`$2')')dnl
+define(get_typename, `get_typename2(ifelse($1,i,INTEGER,ifelse($1,r,REAL,ifelse($1,l,LOGICAL,ifelse($1,c,COMPLEX,ifelse($1,m,UINTEGER,ifelse($1,s,UINTEGER,unknown)))))),`$2')')dnl
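With the new letter the helper maps m (like s) to the unsigned type names. For instance, together with get_arraytype below, the expansions come out as (illustrative):

get_typename(m, 4)   =>  GFC_UINTEGER_4
get_typename(i, 8)   =>  GFC_INTEGER_8
get_arraytype(m, 4)  =>  gfc_array_m4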
define(get_arraytype, `gfc_array_$1$2')dnl
define(define_type, `dnl
ifelse(regexp($2,`^[0-9]'),-1,`dnl
#include <assert.h>'
include(iparm.m4)dnl
+ifelse(index(rtype_name,`GFC_INTEGER'),`0',dnl
+define(`rtype_name',patsubst(rtype_name,`GFC_INTEGER',`GFC_UINTEGER'))dnl
+define(`rtype',patsubst(rtype,`gfc_array_i',`gfc_array_m')))dnl
`#if defined (HAVE_'rtype_name`)
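This prefix runs before the template body: whenever matmul.m4 is instantiated for an INTEGER kind, the two patsubst calls rewrite the element type to GFC_UINTEGER_<k> and the descriptor to gfc_array_m<k>, while rtype_code, and hence the exported matmul_i<k> symbol, stays the same. The apparent intent (the patch does not spell this out): one kernel serves both INTEGER and UNSIGNED arguments, as the front-end comment promises, and its arithmetic is done in unsigned types, which wrap modulo 2**bits rather than running into signed-overflow undefined behaviour. A sketch of the effect on the generated kind-4 prototype; only the types change, not the symbol name.

Without the prefix:

extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
    gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b,
    int try_blas, int blas_limit, blas_call gemm);

With it:

extern void matmul_i4 (gfc_array_m4 * const restrict retarray,
    gfc_array_m4 * const restrict a, gfc_array_m4 * const restrict b,
    int try_blas, int blas_limit, blas_call gemm);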
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
- const int *, const 'rtype_name` *, const 'rtype_name` *,
- const int *, const 'rtype_name` *, const int *,
- const 'rtype_name` *, 'rtype_name` *, const int *,
- int, int);
+ const int *, const 'rtype_name` *, const 'rtype_name` *,
+ const int *, const 'rtype_name` *, const int *,
+ const 'rtype_name` *, 'rtype_name` *, const int *,
+ int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
-extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
+extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
/* Currently, this is i386 only. Adjust for other architectures. */
-void matmul_'rtype_code` ('rtype` * const restrict retarray,
+void matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
- static void (*matmul_p) ('rtype` * const restrict retarray,
+ static void (*matmul_p) ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
- void (*matmul_fn) ('rtype` * const restrict retarray,
+ void (*matmul_fn) ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
`void
-'matmul_name` ('rtype` * const restrict retarray,
+'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{