for (int i = 0; i < NUM; ++i) \
a[i] = (TYPE3) b[i] OP (TYPE3) c[i]; \
}
+/* Define PREFIX_TYPE1_TYPE2NUM, a noinline/noclone function that computes
+   the widening multiply-add a[i] = (TYPE2) b[i] * (TYPE2) c[i] + d[i] for
+   i in [0, NUM).  b and c hold TYPE1 elements; a and d hold TYPE2 elements.
+   The tests instantiating this macro scan the assembly for vwmacc.vv,
+   vwmaccu.vv or vfwmacc.vv, so the expression form should be kept as is.  */
+#define DEF_FMA_WVV(PREFIX, NUM, TYPE1, TYPE2) \
+ void __attribute__ ((noinline, noclone)) \
+ PREFIX##_##TYPE1##_##TYPE2##NUM (TYPE2 *restrict a, TYPE1 *restrict b, \
+ TYPE1 *restrict c, TYPE2 *restrict d) \
+ { \
+ for (int i = 0; i < NUM; ++i) \
+ a[i] = (TYPE2) b[i] * (TYPE2) c[i] + d[i]; \
+ }
+/* Define PREFIX_TYPE1_TYPE2_TYPE3NUM, a noinline/noclone function that
+   computes a[i] = (TYPE3) b[i] * (TYPE3) c[i] + d[i] for i in [0, NUM).
+   Unlike DEF_FMA_WVV the two multiplicands have distinct element types:
+   b holds TYPE1, c holds TYPE2, and a and d hold TYPE3.  The visible users
+   pass a signed TYPE1 and an unsigned TYPE2 and scan for vwmaccsu.vv.  */
+#define DEF_FMA_WVV_SU(PREFIX, NUM, TYPE1, TYPE2, TYPE3) \
+ void __attribute__ ((noinline, noclone)) \
+ PREFIX##_##TYPE1##_##TYPE2##_##TYPE3##NUM (TYPE3 *restrict a, \
+ TYPE1 *restrict b, \
+ TYPE2 *restrict c, \
+ TYPE3 *restrict d) \
+ { \
+ for (int i = 0; i < NUM; ++i) \
+ a[i] = (TYPE3) b[i] * (TYPE3) c[i] + d[i]; \
+ }
+/* Define PREFIX_TYPE1_TYPE2NUM, a noinline/noclone function that computes
+   the negated widening multiply-add a[i] = d[i] - (TYPE2) b[i] * (TYPE2) c[i]
+   for i in [0, NUM).  b and c hold TYPE1 elements; a and d hold TYPE2
+   elements.  The test instantiating this macro scans for vfwnmsac.vv.  */
+#define DEF_FNMA_WVV(PREFIX, NUM, TYPE1, TYPE2) \
+ void __attribute__ ((noinline, noclone)) \
+ PREFIX##_##TYPE1##_##TYPE2##NUM (TYPE2 *restrict a, TYPE1 *restrict b, \
+ TYPE1 *restrict c, TYPE2 *restrict d) \
+ { \
+ for (int i = 0; i < NUM; ++i) \
+ a[i] = d[i] - (TYPE2) b[i] * (TYPE2) c[i]; \
+ }
+/* Define PREFIX_TYPE1_TYPE2NUM, a noinline/noclone function that computes
+   the widening multiply-subtract a[i] = (TYPE2) b[i] * (TYPE2) c[i] - d[i]
+   for i in [0, NUM).  b and c hold TYPE1 elements; a and d hold TYPE2
+   elements.  The test instantiating this macro scans for vfwmsac.vv.  */
+#define DEF_FMS_WVV(PREFIX, NUM, TYPE1, TYPE2) \
+ void __attribute__ ((noinline, noclone)) \
+ PREFIX##_##TYPE1##_##TYPE2##NUM (TYPE2 *restrict a, TYPE1 *restrict b, \
+ TYPE1 *restrict c, TYPE2 *restrict d) \
+ { \
+ for (int i = 0; i < NUM; ++i) \
+ a[i] = (TYPE2) b[i] * (TYPE2) c[i] - d[i]; \
+ }
+/* Define PREFIX_TYPE1_TYPE2NUM, a noinline/noclone function that computes
+   the negated widening multiply-subtract
+   a[i] = -((TYPE2) b[i] * (TYPE2) c[i]) - d[i] for i in [0, NUM).
+   b and c hold TYPE1 elements; a and d hold TYPE2 elements.  The test
+   instantiating this macro scans for vfwnmacc.vv.  */
+#define DEF_FNMS_WVV(PREFIX, NUM, TYPE1, TYPE2) \
+ void __attribute__ ((noinline, noclone)) \
+ PREFIX##_##TYPE1##_##TYPE2##NUM (TYPE2 *restrict a, TYPE1 *restrict b, \
+ TYPE1 *restrict c, TYPE2 *restrict d) \
+ { \
+ for (int i = 0; i < NUM; ++i) \
+ a[i] = -((TYPE2) b[i] * (TYPE2) c[i]) - d[i]; \
+ }
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Integer widening multiply-add: 27 signed and 27 unsigned instantiations
+   (8->16, 16->32 and 32->64 bit elements), one expected instruction each.
+   The largest NUM of every group gives NUM * sizeof (wide type) == 4096;
+   NOTE(review): presumably one LMUL=8 register group at the minimum VLEN
+   selected by the options above -- confirm.  */
+DEF_FMA_WVV (wfma, 4, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 8, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 16, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 32, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 64, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 128, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 256, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 512, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 1024, int8_t, int16_t)
+DEF_FMA_WVV (wfma, 2048, int8_t, int16_t)
+
+DEF_FMA_WVV (wfma, 4, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 8, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 16, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 32, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 64, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 128, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 256, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 512, int16_t, int32_t)
+DEF_FMA_WVV (wfma, 1024, int16_t, int32_t)
+
+DEF_FMA_WVV (wfma, 4, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 8, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 16, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 32, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 64, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 128, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 256, int32_t, int64_t)
+DEF_FMA_WVV (wfma, 512, int32_t, int64_t)
+
+DEF_FMA_WVV (wfma, 4, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 8, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 16, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 32, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 64, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 128, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 256, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 512, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 1024, uint8_t, uint16_t)
+DEF_FMA_WVV (wfma, 2048, uint8_t, uint16_t)
+
+DEF_FMA_WVV (wfma, 4, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 8, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 16, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 32, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 64, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 128, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 256, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 512, uint16_t, uint32_t)
+DEF_FMA_WVV (wfma, 1024, uint16_t, uint32_t)
+
+DEF_FMA_WVV (wfma, 4, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 8, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 16, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 32, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 64, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 128, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 256, uint32_t, uint64_t)
+DEF_FMA_WVV (wfma, 512, uint32_t, uint64_t)
+
+/* { dg-final { scan-assembler-times {vwmacc\.vv} 27 } } */
+/* { dg-final { scan-assembler-times {vwmaccu\.vv} 27 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Floating-point widening multiply-add: 9 _Float16->float plus 8
+   float->double instantiations, 17 in total, one expected vfwmacc.vv each.  */
+DEF_FMA_WVV(wfma, 4, _Float16, float)
+DEF_FMA_WVV(wfma, 8, _Float16, float)
+DEF_FMA_WVV(wfma, 16, _Float16, float)
+DEF_FMA_WVV(wfma, 32, _Float16, float)
+DEF_FMA_WVV(wfma, 64, _Float16, float)
+DEF_FMA_WVV(wfma, 128, _Float16, float)
+DEF_FMA_WVV(wfma, 256, _Float16, float)
+DEF_FMA_WVV(wfma, 512, _Float16, float)
+DEF_FMA_WVV(wfma, 1024, _Float16, float)
+
+DEF_FMA_WVV(wfma, 4, float, double)
+DEF_FMA_WVV(wfma, 8, float, double)
+DEF_FMA_WVV(wfma, 16, float, double)
+DEF_FMA_WVV(wfma, 32, float, double)
+DEF_FMA_WVV(wfma, 64, float, double)
+DEF_FMA_WVV(wfma, 128, float, double)
+DEF_FMA_WVV(wfma, 256, float, double)
+DEF_FMA_WVV(wfma, 512, float, double)
+
+/* { dg-final { scan-assembler-times {vfwmacc\.vv} 17 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Mixed-sign integer widening multiply-add: the first multiplicand is signed,
+   the second unsigned, and the accumulator is the signed double-width type.
+   10 + 9 + 8 = 27 instantiations, one expected vwmaccsu.vv each.  */
+DEF_FMA_WVV_SU (wfma, 4, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 8, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 16, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 32, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 64, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 128, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 256, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 512, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 1024, int8_t, uint8_t, int16_t)
+DEF_FMA_WVV_SU (wfma, 2048, int8_t, uint8_t, int16_t)
+
+DEF_FMA_WVV_SU (wfma, 4, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 8, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 16, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 32, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 64, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 128, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 256, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 512, int16_t, uint16_t, int32_t)
+DEF_FMA_WVV_SU (wfma, 1024, int16_t, uint16_t, int32_t)
+
+DEF_FMA_WVV_SU (wfma, 4, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 8, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 16, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 32, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 64, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 128, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 256, int32_t, uint32_t, int64_t)
+DEF_FMA_WVV_SU (wfma, 512, int32_t, uint32_t, int64_t)
+
+/* { dg-final { scan-assembler-times {vwmaccsu\.vv} 27 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Floating-point widening multiply-subtract: 9 _Float16->float plus 8
+   float->double instantiations, 17 in total, one expected vfwmsac.vv each.  */
+DEF_FMS_WVV(wfms, 4, _Float16, float)
+DEF_FMS_WVV(wfms, 8, _Float16, float)
+DEF_FMS_WVV(wfms, 16, _Float16, float)
+DEF_FMS_WVV(wfms, 32, _Float16, float)
+DEF_FMS_WVV(wfms, 64, _Float16, float)
+DEF_FMS_WVV(wfms, 128, _Float16, float)
+DEF_FMS_WVV(wfms, 256, _Float16, float)
+DEF_FMS_WVV(wfms, 512, _Float16, float)
+DEF_FMS_WVV(wfms, 1024, _Float16, float)
+
+DEF_FMS_WVV(wfms, 4, float, double)
+DEF_FMS_WVV(wfms, 8, float, double)
+DEF_FMS_WVV(wfms, 16, float, double)
+DEF_FMS_WVV(wfms, 32, float, double)
+DEF_FMS_WVV(wfms, 64, float, double)
+DEF_FMS_WVV(wfms, 128, float, double)
+DEF_FMS_WVV(wfms, 256, float, double)
+DEF_FMS_WVV(wfms, 512, float, double)
+
+/* { dg-final { scan-assembler-times {vfwmsac\.vv} 17 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Floating-point negated widening multiply-add (d - b * c): 9 _Float16->float
+   plus 8 float->double instantiations, 17 in total, one expected
+   vfwnmsac.vv each.  */
+DEF_FNMA_WVV(wfnma, 4, _Float16, float)
+DEF_FNMA_WVV(wfnma, 8, _Float16, float)
+DEF_FNMA_WVV(wfnma, 16, _Float16, float)
+DEF_FNMA_WVV(wfnma, 32, _Float16, float)
+DEF_FNMA_WVV(wfnma, 64, _Float16, float)
+DEF_FNMA_WVV(wfnma, 128, _Float16, float)
+DEF_FNMA_WVV(wfnma, 256, _Float16, float)
+DEF_FNMA_WVV(wfnma, 512, _Float16, float)
+DEF_FNMA_WVV(wfnma, 1024, _Float16, float)
+
+DEF_FNMA_WVV(wfnma, 4, float, double)
+DEF_FNMA_WVV(wfnma, 8, float, double)
+DEF_FNMA_WVV(wfnma, 16, float, double)
+DEF_FNMA_WVV(wfnma, 32, float, double)
+DEF_FNMA_WVV(wfnma, 64, float, double)
+DEF_FNMA_WVV(wfnma, 128, float, double)
+DEF_FNMA_WVV(wfnma, 256, float, double)
+DEF_FNMA_WVV(wfnma, 512, float, double)
+
+/* { dg-final { scan-assembler-times {vfwnmsac\.vv} 17 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized" } */
+
+#include "def.h"
+/* Floating-point negated widening multiply-subtract (-(b * c) - d):
+   9 _Float16->float plus 8 float->double instantiations, 17 in total, one
+   expected vfwnmacc.vv each.
+   NOTE(review): the name prefix passed here is wfms although the macro is
+   DEF_FNMS_WVV; it only affects the generated function names, but it may
+   be a copy-paste leftover -- confirm whether wfnms was intended.  */
+DEF_FNMS_WVV(wfms, 4, _Float16, float)
+DEF_FNMS_WVV(wfms, 8, _Float16, float)
+DEF_FNMS_WVV(wfms, 16, _Float16, float)
+DEF_FNMS_WVV(wfms, 32, _Float16, float)
+DEF_FNMS_WVV(wfms, 64, _Float16, float)
+DEF_FNMS_WVV(wfms, 128, _Float16, float)
+DEF_FNMS_WVV(wfms, 256, _Float16, float)
+DEF_FNMS_WVV(wfms, 512, _Float16, float)
+DEF_FNMS_WVV(wfms, 1024, _Float16, float)
+
+DEF_FNMS_WVV(wfms, 4, float, double)
+DEF_FNMS_WVV(wfms, 8, float, double)
+DEF_FNMS_WVV(wfms, 16, float, double)
+DEF_FNMS_WVV(wfms, 32, float, double)
+DEF_FNMS_WVV(wfms, 64, float, double)
+DEF_FNMS_WVV(wfms, 128, float, double)
+DEF_FNMS_WVV(wfms, 256, float, double)
+DEF_FNMS_WVV(wfms, 512, float, double)
+
+/* { dg-final { scan-assembler-times {vfwnmacc\.vv} 17 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */