This patch extends support for BF16 vector operations in GCC, including bitwise AND, ANDNOT, ABS, NEG, COPYSIGN, and XORSIGN for V8BF, V16BF, and V32BF modes.
gcc/ChangeLog:
* config/i386/i386-expand.cc (ix86_expand_fp_absneg_operator): Add VBF modes.
(ix86_expand_copysign): Ditto.
(ix86_expand_xorsign): Ditto.
* config/i386/i386.cc (ix86_build_const_vector): Ditto.
(ix86_build_signbit_mask): Ditto.
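* config/i386/sse.md (VFB): Add V32BF, V16BF and V8BF.
(VFB_128_256): Add V16BF and V8BF.
(VFB_512): Add V32BF.
(sse): Add V32BF, V16BF and V8BF.
(sseintvecmode2): Add V16BF and V8BF.
(<code><mode>3<mask_name>): Check <ssescalarsize> instead of <ssescalarmode> so that masking is rejected for every 16-bit element mode.
(*<code><mode>3<mask_name>): Ditto.  Handle the BF modes.
Adjust the corresponding andnot patterns likewise.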
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-bf16-vec-absneg.c: New test.
* gcc.target/i386/avx512f-bf16-vec-absneg.c: New test.
machine_mode vmode = mode;
rtvec par;
- if (vector_mode || mode == TFmode || mode == HFmode)
- {
- use_sse = true;
- if (mode == HFmode)
- vmode = V8HFmode;
- }
- else if (TARGET_SSE_MATH)
- {
- use_sse = SSE_FLOAT_MODE_P (mode);
- if (mode == SFmode)
- vmode = V4SFmode;
- else if (mode == DFmode)
- vmode = V2DFmode;
- }
+ switch (mode)
+ {
+ case HFmode:
+ use_sse = true;
+ vmode = V8HFmode;
+ break;
+ case BFmode:
+ use_sse = true;
+ vmode = V8BFmode;
+ break;
+ case SFmode:
+ use_sse = TARGET_SSE_MATH && TARGET_SSE;
+ vmode = V4SFmode;
+ break;
+ case DFmode:
+ use_sse = TARGET_SSE_MATH && TARGET_SSE2;
+ vmode = V2DFmode;
+ break;
+ default:
+ use_sse = vector_mode || mode == TFmode;
+ break;
+ }
dst = operands[0];
src = operands[1];
mode = GET_MODE (operands[0]);
- if (mode == HFmode)
+ switch (mode)
+ {
+ case HFmode:
vmode = V8HFmode;
- else if (mode == SFmode)
+ break;
+ case BFmode:
+ vmode = V8BFmode;
+ break;
+ case SFmode:
vmode = V4SFmode;
- else if (mode == DFmode)
+ break;
+ case DFmode:
vmode = V2DFmode;
- else if (mode == TFmode)
+ break;
+ case TFmode:
vmode = mode;
- else
- gcc_unreachable ();
+ break;
+ default:
+ gcc_unreachable();
+ }
if (rtx_equal_p (operands[1], operands[2]))
{
mode = GET_MODE (dest);
- if (mode == HFmode)
+ switch (mode)
+ {
+ case HFmode:
vmode = V8HFmode;
- else if (mode == SFmode)
+ break;
+ case BFmode:
+ vmode = V8BFmode;
+ break;
+ case SFmode:
vmode = V4SFmode;
- else if (mode == DFmode)
+ break;
+ case DFmode:
vmode = V2DFmode;
- else
+ break;
+ default:
gcc_unreachable ();
+ break;
+ }
temp = gen_reg_rtx (vmode);
mask = ix86_build_signbit_mask (vmode, 0, 0);
case E_V8DFmode:
case E_V4DFmode:
case E_V2DFmode:
+ case E_V32BFmode:
+ case E_V16BFmode:
+ case E_V8BFmode:
n_elt = GET_MODE_NUNITS (mode);
v = rtvec_alloc (n_elt);
scalar_mode = GET_MODE_INNER (mode);
case E_V8HFmode:
case E_V16HFmode:
case E_V32HFmode:
+ case E_V32BFmode:
+ case E_V16BFmode:
+ case E_V8BFmode:
vec_mode = mode;
imode = HImode;
break;
;; 128-, 256- and 512-bit float vector modes for bitwise operations
(define_mode_iterator VFB
- [(V32HF "TARGET_AVX512F && TARGET_EVEX512")
+ [(V32BF "TARGET_AVX512F && TARGET_EVEX512")
+ (V16BF "TARGET_AVX") (V8BF "TARGET_SSE2")
+ (V32HF "TARGET_AVX512F && TARGET_EVEX512")
(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")
;; 128- and 256-bit float vector modes for bitwise operations
(define_mode_iterator VFB_128_256
- [(V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
+ [(V16BF "TARGET_AVX") (V8BF "TARGET_SSE2")
+ (V16HF "TARGET_AVX") (V8HF "TARGET_SSE2")
(V8SF "TARGET_AVX") V4SF
(V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
;; All 512bit vector float modes for bitwise operations
(define_mode_iterator VFB_512
- [(V32HF "TARGET_EVEX512") (V16SF "TARGET_EVEX512") (V8DF "TARGET_EVEX512")])
+ [(V32BF "TARGET_EVEX512")
+ (V32HF "TARGET_EVEX512")
+ (V16SF "TARGET_EVEX512")
+ (V8DF "TARGET_EVEX512")])
(define_mode_iterator V4SF_V8HF
[V4SF V8HF])
(define_mode_attr sse
[(SF "sse") (DF "sse2") (HF "avx512fp16")
(V4SF "sse") (V2DF "sse2")
+ (V32BF "avx512bf16") (V16BF "avx512bf16")
+ (V8BF "avx512bf16")
(V32HF "avx512fp16") (V16HF "avx512fp16")
(V8HF "avx512fp16")
(V16SF "avx512f") (V8SF "avx")
(define_mode_attr sseintvecmode2
[(V8DF "XI") (V4DF "OI") (V2DF "TI")
(V8SF "OI") (V4SF "TI")
- (V16HF "OI") (V8HF "TI")])
+ (V16HF "OI") (V8HF "TI")
+ (V16BF "OI") (V8BF "TI")])
(define_mode_attr sseintvecmodelower
[(V32HF "v32hi") (V32BF "v32hi") (V16SF "v16si") (V8DF "v8di")
(match_operand:VFB_128_256 1 "register_operand" "0,x,v,v"))
(match_operand:VFB_128_256 2 "vector_operand" "xBm,xjm,vm,vm")))]
"TARGET_SSE && <mask_avx512vl_condition>
- && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
+ && (!<mask_applied> || <ssescalarsize> != 16)"
{
char buf[128];
const char *ops;
switch (get_attr_mode (insn))
{
+ case MODE_V16BF:
+ case MODE_V8BF:
case MODE_V16HF:
case MODE_V8HF:
case MODE_V8SF:
(not:VFB_512
(match_operand:VFB_512 1 "register_operand" "v"))
(match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
- "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
+ "TARGET_AVX512F && (!<mask_applied> || <ssescalarsize> != 16)"
{
char buf[128];
const char *ops;
/* Since there are no vandnp[sd] without AVX512DQ nor vandnph,
use vp<logic>[dq]. */
- if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode)
+ if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode || <MODE>mode == V32BFmode)
{
suffix = GET_MODE_INNER (<MODE>mode) == DFmode ? "q" : "d";
ops = "p";
(match_operand:VFB_128_256 1 "vector_operand")
(match_operand:VFB_128_256 2 "vector_operand")))]
"TARGET_SSE && <mask_avx512vl_condition>
- && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
+ && (!<mask_applied> || <ssescalarsize> != 16)"
"ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
(define_expand "<code><mode>3<mask_name>"
(any_logic:VFB_512
(match_operand:VFB_512 1 "nonimmediate_operand")
(match_operand:VFB_512 2 "nonimmediate_operand")))]
- "TARGET_AVX512F && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
+ "TARGET_AVX512F && (!<mask_applied> || <ssescalarsize> != 16)"
"ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
(define_insn "*<code><mode>3<mask_name>"
(match_operand:VFB_128_256 1 "vector_operand" "%0,x,v,v")
(match_operand:VFB_128_256 2 "vector_operand" "xBm,xm,vm,vm")))]
"TARGET_SSE && <mask_avx512vl_condition>
- && (!<mask_applied> || <ssescalarmode>mode != HFmode)
+ && (!<mask_applied> || <ssescalarsize> != 16)
&& !(MEM_P (operands[1]) && MEM_P (operands[2]))"
{
char buf[128];
switch (get_attr_mode (insn))
{
+ case MODE_V16BF:
+ case MODE_V8BF:
case MODE_V16HF:
case MODE_V8HF:
case MODE_V8SF:
(match_operand:VFB_512 1 "nonimmediate_operand" "%v")
(match_operand:VFB_512 2 "nonimmediate_operand" "vm")))]
"TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))
- && (!<mask_applied> || <ssescalarmode>mode != HFmode)"
+ && (!<mask_applied> || <ssescalarsize> != 16)"
{
char buf[128];
const char *ops;
/* Since there are no v<logic>p[sd] without AVX512DQ nor v<logic>ph,
use vp<logic>[dq]. */
- if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode)
+ if (!TARGET_AVX512DQ || <MODE>mode == V32HFmode || <MODE>mode == V32BFmode)
{
suffix = GET_MODE_INNER (<MODE>mode) == DFmode ? "q" : "d";
ops = "p";
--- /dev/null
+/* { dg-do run { target avx2 } } */
+/* { dg-options "-O1 -mavx512bf16 -fdump-tree-vect-details -fdump-tree-optimized" } */
+
+extern void abort (void);
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512BF16
+#include "avx512-check.h"
+
+__bf16 b_128[8], r_abs_128[8], r_neg_128[8];
+__bf16 b_256[16], r_abs_256[16], r_neg_256[16];
+/* Store the absolute value of each of the eight b_128 elements in
+   r_abs_128.  The optimize ("O2") attribute raises this function above
+   the file-level -O1; presumably this is one of the two loops the dump
+   scan expects to be vectorized with 16-byte vectors (8 x 2 bytes).
+   NOTE(review): __builtin_fabsf16 is documented as the _Float16 builtin,
+   so each __bf16 element is converted at the call; confirm that this is
+   the intended way to spell a BF16 fabs.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+abs_128 (void)
+{
+ for (int i = 0; i < 8; i++)
+ r_abs_128[i] = __builtin_fabsf16(b_128[i]);
+}
+/* Store the negation of each of the eight b_128 elements in r_neg_128.
+   Built at -O2 through the optimize attribute; the remaining attributes
+   ask the compiler not to inline, clone, fold or otherwise analyse this
+   function interprocedurally.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+neg_128 (void)
+{
+ for (int i = 0; i < 8; i++)
+ r_neg_128[i] = -b_128[i];
+}
+/* Store the absolute value of each of the sixteen b_256 elements in
+   r_abs_256.  Built at -O2 through the optimize attribute; presumably one
+   of the two loops the dump scan expects to be vectorized with 32-byte
+   vectors (16 x 2 bytes).
+   NOTE(review): __builtin_fabsf16 is documented as the _Float16 builtin,
+   so each __bf16 element is converted at the call; confirm that this is
+   the intended way to spell a BF16 fabs.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+abs_256 (void)
+{
+ for (int i = 0; i < 16; i++)
+ r_abs_256[i] = __builtin_fabsf16(b_256[i]);
+}
+/* Store the negation of each of the sixteen b_256 elements in r_neg_256.
+   Built at -O2 through the optimize attribute; the remaining attributes
+   ask the compiler not to inline, clone, fold or otherwise analyse this
+   function interprocedurally.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+neg_256 (void)
+{
+ for (int i = 0; i < 16; i++)
+ r_neg_256[i] = -b_256[i];
+}
+/* Recompute the absolute value and the negation of each of the LEN
+   elements of B and abort as soon as R_ABS or R_NEG holds a different
+   value.  This function carries no optimize attribute, so unlike the
+   kernels it is built with the file-level options.  */
+void
+check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
+{
+ for (int i = 0; i < len; i++)
+ {
+ __bf16 expected_abs = __builtin_fabsf16(b[i]);
+ __bf16 expected_neg = -b[i];
+ if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
+ abort ();
+ }
+}
+/* Test body, exposed to avx512-check.h through the DO_TEST macro
+   (presumably called by that header once its CPU checks pass; the header
+   is not part of this file).  Converts a fixed table of mixed-sign floats
+   to __bf16, runs the four kernels and validates their outputs.  b_128
+   takes only the first eight table entries, b_256 all sixteen.  */
+static void
+__attribute__ ((noinline, noclone))
+do_test (void)
+{
+ /* Initialize test values */
+ float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f,
+ -9.0f, 1.0f, -2.0f, 3.0f,
+ -4.0f, -5.0f, 6.0f, 7.0f,
+ -8.0f, -9.0f, 10.0f, 11.0f};
+
+ for (int i = 0; i < 8; i++)
+ b_128[i] = (__bf16)float_b[i];
+
+ for (int i = 0; i < 16; i++)
+ b_256[i] = (__bf16)float_b[i];
+
+ abs_128 ();
+ neg_128 ();
+ check_absneg_results (b_128, r_abs_128, r_neg_128, 8);
+
+ abs_256 ();
+ neg_256 ();
+ check_absneg_results (b_256, r_abs_256, r_neg_256, 16);
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */
\ No newline at end of file
--- /dev/null
+/* { dg-do run { target avx512f } } */
+/* { dg-options "-O1 -mavx512bf16 -fdump-tree-vect-details -fdump-tree-optimized" } */
+
+extern void abort (void);
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512BF16
+#include "avx512-check.h"
+
+__bf16 b_512[32], r_abs_512[32], r_neg_512[32];
+/* Store the absolute value of each of the thirty-two b_512 elements in
+   r_abs_512.  Built at -O2 with prefer-vector-width=512 through the
+   function attributes; presumably one of the two loops the dump scan
+   expects to be vectorized with 64-byte vectors (32 x 2 bytes).
+   NOTE(review): __builtin_fabsf16 is documented as the _Float16 builtin,
+   so each __bf16 element is converted at the call; confirm that this is
+   the intended way to spell a BF16 fabs.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf,
+target("prefer-vector-width=512")))
+abs_512 (void)
+{
+ for (int i = 0; i < 32; i++)
+ r_abs_512[i] = __builtin_fabsf16(b_512[i]);
+}
+/* Store the negation of each of the thirty-two b_512 elements in
+   r_neg_512.  Built at -O2 with prefer-vector-width=512 through the
+   function attributes; the remaining attributes ask the compiler not to
+   inline, clone, fold or otherwise analyse this function
+   interprocedurally.  */
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf,
+target("prefer-vector-width=512")))
+neg_512 (void)
+{
+ for (int i = 0; i < 32; i++)
+ r_neg_512[i] = -b_512[i];
+}
+/* Recompute the absolute value and the negation of each of the LEN
+   elements of B and abort as soon as R_ABS or R_NEG holds a different
+   value.  This function carries no optimize or target attribute, so
+   unlike the kernels it is built with the file-level options.  */
+void
+check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
+{
+ for (int i = 0; i < len; i++)
+ {
+ __bf16 expected_abs = __builtin_fabsf16(b[i]);
+ __bf16 expected_neg = -b[i];
+ if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
+ abort ();
+ }
+}
+/* Test body, exposed to avx512-check.h through the DO_TEST macro
+   (presumably called by that header once its CPU checks pass; the header
+   is not part of this file).  Converts a fixed table of thirty-two
+   mixed-sign floats (the same sixteen values listed twice) to __bf16 in
+   b_512, runs both 512-bit kernels and validates their outputs.  */
+static void
+__attribute__ ((noinline, noclone))
+do_test (void)
+{
+ /* Initialize test values */
+ float float_b[32] = {-1.2f, 3.4f, -5.6f, 7.8f,
+ -9.0f, 1.0f, -2.0f, 3.0f,
+ -4.0f, -5.0f, 6.0f, 7.0f,
+ -8.0f, -9.0f, 10.0f, 11.0f,
+ -1.2f, 3.4f, -5.6f, 7.8f,
+ -9.0f, 1.0f, -2.0f, 3.0f,
+ -4.0f, -5.0f, 6.0f, 7.0f,
+ -8.0f, -9.0f, 10.0f, 11.0f};
+
+ for (int i = 0; i < 32; i++)
+ b_512[i] = (__bf16)float_b[i];
+
+ abs_512 ();
+ neg_512 ();
+ check_absneg_results (b_512, r_abs_512, r_neg_512, 32);
+}
+
+/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 1 "optimized" { target { ! ia32 } } } } */
\ No newline at end of file