;; Unpredicated floating-point ternary operations.
(define_expand "<optab><mode>4"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
[(match_dup 4)
- (const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F_B16B16 1 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
+ (match_dup 5)
+ (match_operand:SVE_F_B16B16 1 "register_operand")
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
- operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[4] = aarch64_sve_fp_pred (<MODE>mode, &operands[5]);
}
)
;; Predicated floating-point ternary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
- (unspec:SVE_FULL_F_B16B16
- [(match_operand:<VPRED> 1 "register_operand")
+ [(set (match_operand:SVE_F_B16B16 0 "register_operand")
+ (unspec:SVE_F_B16B16
+ [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
(match_operand:SI 5 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 3 "register_operand")
- (match_operand:SVE_FULL_F_B16B16 4 "register_operand")]
+ (match_operand:SVE_F_B16B16 2 "register_operand")
+ (match_operand:SVE_F_B16B16 3 "register_operand")
+ (match_operand:SVE_F_B16B16 4 "register_operand")]
SVE_COND_FP_TERNARY))]
"TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , %2 , 3 , 4 ; attrs: movprfx , is_rev ]
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048" } */
+
+#define BFMLA(TYPE) \
+ TYPE test_bfmla_##TYPE (TYPE a, TYPE b, TYPE c) \
+ { return a * b + c; }
+
+#define BFMLS(TYPE) \
+ TYPE test_bfmls_##TYPE (TYPE a, TYPE b, TYPE c) \
+ { return a * -b + c; }
+
+#define TEST_TYPE(TYPE, SIZE) \
+ typedef TYPE TYPE##SIZE __attribute__((vector_size(SIZE))); \
+ BFMLA (TYPE##SIZE) \
+ BFMLS (TYPE##SIZE)
+
+#pragma GCC target "arch=armv9-a+sve-b16b16"
+
+TEST_TYPE (__bf16, 128)
+
+TEST_TYPE (__bf16, 64)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tbfmla\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tbfmls\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_ternary_bf16_1.C"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tbfmla\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tbfmls\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT) \
+ void \
+ f_##TYPE0##_##TYPE1 (TYPE1 *__restrict out, \
+ TYPE0 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE0 *__restrict c, \
+ TYPE0 *__restrict d) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ if (FN > d[i]) \
+ out[i] = 3; \
+ }
+
+TEST_FN (FMLA (f16), _Float16, uint64_t, 32)
+
+TEST_FN (FMLA (f16), _Float16, uint32_t, 64)
+
+TEST_FN (FMLA (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fmla|fmad)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fmla|fmad)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include "unpacked_fmla_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fmla|fmad)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fmla|fmad)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT) \
+ void \
+ f_##TYPE0##_##TYPE1 (TYPE1 *__restrict out, \
+ TYPE0 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE0 *__restrict c, \
+ TYPE0 *__restrict d) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ if (FN > d[i]) \
+ out[i] = 3; \
+ }
+
+TEST_FN (FMLS (f16), _Float16, uint64_t, 32)
+
+TEST_FN (FMLS (f16), _Float16, uint32_t, 64)
+
+TEST_FN (FMLS (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fmls|fmsb)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fmls|fmsb)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include "unpacked_fmls_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fmls|fmsb)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fmls|fmsb)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT) \
+ void \
+ f_##TYPE0##_##TYPE1 (TYPE1 *__restrict out, \
+ TYPE0 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE0 *__restrict c, \
+ TYPE0 *__restrict d) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ if (FN > d[i]) \
+ out[i] = 3; \
+ }
+
+TEST_FN (FNMLA (f16), _Float16, uint64_t, 32)
+
+TEST_FN (FNMLA (f16), _Float16, uint32_t, 64)
+
+TEST_FN (FNMLA (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fnmla|fnmad)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fnmla|fnmad)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include "unpacked_fnmla_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fnmla|fnmad)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fnmla|fnmad)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
+
+#include <stdint.h>
+
+#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i])
+#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i])
+#define FNMLA(SUFF) -FMLA (SUFF)
+#define FNMLS(SUFF) -FMLS (SUFF)
+
+#define TEST_FN(FN, TYPE0, TYPE1, COUNT) \
+ void \
+ f_##TYPE0##_##TYPE1 (TYPE1 *__restrict out, \
+ TYPE0 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE0 *__restrict c, \
+ TYPE0 *__restrict d) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ if (FN > d[i]) \
+ out[i] = 3; \
+ }
+
+TEST_FN (FNMLS (f16), _Float16, uint64_t, 32)
+
+TEST_FN (FNMLS (f16), _Float16, uint32_t, 64)
+
+TEST_FN (FNMLS (f32), float, uint64_t, 32)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fnmls|fnmsb)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fnmls|fnmsb)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include "unpacked_fnmls_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\t(fnmls|fnmsb)\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(fnmls|fnmsb)\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */