char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
(enum fpmath_unit) 0,
(enum prefer_vector_width) 0,
+ PVW_NONE, PVW_NONE,
false, add_abi_p);
if (!opts)
error ("%qE needs unknown isa option", fndecl);
const char *arch, const char *tune,
enum fpmath_unit fpmath,
enum prefer_vector_width pvw,
+ enum prefer_vector_width move_max,
+ enum prefer_vector_width store_max,
bool add_nl_p, bool add_abi_p)
{
/* Flag options. */
}
}
- /* Add -mprefer-vector-width= option. */
- if (pvw)
+ auto add_vector_width = [&opts, &num] (prefer_vector_width pvw,
+ const char *cmd)
{
- opts[num][0] = "-mprefer-vector-width=";
+ opts[num][0] = cmd;
switch ((int) pvw)
{
case PVW_AVX128:
default:
gcc_unreachable ();
}
- }
+ };
+
+ /* Add -mprefer-vector-width= option. */
+ if (pvw)
+ add_vector_width (pvw, "-mprefer-vector-width=");
+
+ /* Add -mmove-max= option. */
+ if (move_max)
+ add_vector_width (move_max, "-mmove-max=");
+
+ /* Add -mstore-max= option. */
+ if (store_max)
+ add_vector_width (store_max, "-mstore-max=");
/* Any options? */
if (num == 0)
target_flags, ix86_target_flags,
ix86_arch_string, ix86_tune_string,
ix86_fpmath, prefer_vector_width_type,
+ ix86_move_max, ix86_store_max,
true, true);
if (opts)
= ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
ptr->x_target_flags, ptr->x_ix86_target_flags,
NULL, NULL, ptr->x_ix86_fpmath,
- ptr->x_prefer_vector_width_type, false, true);
+ ptr->x_prefer_vector_width_type,
+ ptr->x_ix86_move_max, ptr->x_ix86_store_max,
+ false, true);
gcc_assert (ptr->arch < PROCESSOR_max);
fprintf (file, "%*sarch = %d (%s)\n",
const char *orig_tune_string = opts->x_ix86_tune_string;
enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
enum prefer_vector_width orig_pvw_set = opts_set->x_prefer_vector_width_type;
+ enum prefer_vector_width orig_ix86_move_max_set
+ = opts_set->x_ix86_move_max;
+ enum prefer_vector_width orig_ix86_store_max_set
+ = opts_set->x_ix86_store_max;
int orig_tune_defaulted = ix86_tune_defaulted;
int orig_arch_specified = ix86_arch_specified;
char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
opts->x_ix86_tune_string = orig_tune_string;
opts_set->x_ix86_fpmath = orig_fpmath_set;
opts_set->x_prefer_vector_width_type = orig_pvw_set;
+ opts_set->x_ix86_move_max = orig_ix86_move_max_set;
+ opts_set->x_ix86_store_max = orig_ix86_store_max_set;
opts->x_ix86_excess_precision = orig_ix86_excess_precision;
opts->x_ix86_unsafe_math_optimizations
= orig_ix86_unsafe_math_optimizations;
&& (opts_set->x_prefer_vector_width_type == PVW_NONE))
opts->x_prefer_vector_width_type = PVW_AVX256;
+ if (opts_set->x_ix86_move_max == PVW_NONE)
+ {
+ /* Set the maximum number of bits that can be moved from memory to
+ memory efficiently. This runs only when opts_set holds PVW_NONE
+ for the option (presumably: -mmove-max= was not given). The
+ AVX512 move_by_pieces tuning flag is tried first, then the AVX256
+ one; otherwise the value falls back on the ISA flags.
+ NOTE(review): the nested opts_set->x_ix86_move_max == PVW_NONE
+ test in the final else branch repeats this block's guard, so as
+ written it always holds and the value just copied from
+ x_prefer_vector_width_type is overwritten with PVW_AVX512 or
+ PVW_AVX128. Possibly opts->x_ix86_move_max was meant; confirm. */
+ if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
+ opts->x_ix86_move_max = PVW_AVX512;
+ else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES])
+ opts->x_ix86_move_max = PVW_AVX256;
+ else
+ {
+ opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
+ if (opts_set->x_ix86_move_max == PVW_NONE)
+ {
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ opts->x_ix86_move_max = PVW_AVX512;
+ else
+ opts->x_ix86_move_max = PVW_AVX128;
+ }
+ }
+ }
+
+ if (opts_set->x_ix86_store_max == PVW_NONE)
+ {
+ /* Set the maximum number of bits that can be stored to memory
+ efficiently. This runs only when opts_set holds PVW_NONE for the
+ option (presumably: -mstore-max= was not given). The AVX512
+ store_by_pieces tuning flag is tried first, then the AVX256 one;
+ otherwise the value falls back on the ISA flags.
+ NOTE(review): the nested opts_set->x_ix86_store_max == PVW_NONE
+ test in the final else branch repeats this block's guard, so as
+ written it always holds and the value just copied from
+ x_prefer_vector_width_type is overwritten with PVW_AVX512 or
+ PVW_AVX128. Possibly opts->x_ix86_store_max was meant; confirm. */
+ if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
+ opts->x_ix86_store_max = PVW_AVX512;
+ else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES])
+ opts->x_ix86_store_max = PVW_AVX256;
+ else
+ {
+ opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
+ if (opts_set->x_ix86_store_max == PVW_NONE)
+ {
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ opts->x_ix86_store_max = PVW_AVX512;
+ else
+ opts->x_ix86_store_max = PVW_AVX128;
+ }
+ }
+ }
+
if (opts->x_ix86_recip_name)
{
char *p = ASTRDUP (opts->x_ix86_recip_name);
int flags, int flags2,
const char *arch, const char *tune,
enum fpmath_unit fpmath,
- enum prefer_vector_width pvw, bool add_nl_p,
- bool add_abi_p);
+ enum prefer_vector_width pvw,
+ enum prefer_vector_width move_max,
+ enum prefer_vector_width store_max,
+ bool add_nl_p, bool add_abi_p);
extern enum attr_cpu ix86_schedule;
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
#define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
-#define TARGET_AVX256_MOVE_BY_PIECES \
- ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES]
-#define TARGET_AVX256_STORE_BY_PIECES \
- ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES]
#define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
#define TARGET_GENERAL_REGS_SSE_SPILL \
MOVE_MAX_PIECES defaults to MOVE_MAX. */
#define MOVE_MAX \
- ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ((TARGET_AVX512F \
+ && (ix86_move_max == PVW_AVX512 \
+ || ix86_store_max == PVW_AVX512)) \
? 64 \
: ((TARGET_AVX \
- && !TARGET_PREFER_AVX128 \
- && (TARGET_AVX256_MOVE_BY_PIECES \
- || TARGET_AVX256_STORE_BY_PIECES)) \
+ && (ix86_move_max >= PVW_AVX256 \
+ || ix86_store_max >= PVW_AVX256)) \
? 32 \
: ((TARGET_SSE2 \
&& TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
store_by_pieces of 16/32/64 bytes. */
#define STORE_MAX_PIECES \
(TARGET_INTER_UNIT_MOVES_TO_VEC \
- ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? ((TARGET_AVX512F && ix86_store_max == PVW_AVX512) \
? 64 \
: ((TARGET_AVX \
- && !TARGET_PREFER_AVX128 \
- && TARGET_AVX256_STORE_BY_PIECES) \
+ && ix86_store_max >= PVW_AVX256) \
? 32 \
: ((TARGET_SSE2 \
&& TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
EnumValue
Enum(prefer_vector_width) String(512) Value(PVW_AVX512)
+mmove-max=
+Target RejectNegative Joined Var(ix86_move_max) Enum(prefer_vector_width) Init(PVW_NONE) Save
+Maximum number of bits that can be moved from memory to memory efficiently.
+
+mstore-max=
+Target RejectNegative Joined Var(ix86_store_max) Enum(prefer_vector_width) Init(PVW_NONE) Save
+Maximum number of bits that can be stored to memory efficiently.
+
;; ISA support
m32
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
m_CORE_AVX512)
+/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
+ AVX instructions. */
+DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
+ m_SAPPHIRERAPIDS)
+
+/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
+ AVX instructions. */
+DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
+ m_SAPPHIRERAPIDS)
+
/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that helps a specific old CPU designs */
-mcld -mcx16 -msahf -mmovbe -mcrc32 -mmwait @gol
-mrecip -mrecip=@var{opt} @gol
-mvzeroupper -mprefer-avx128 -mprefer-vector-width=@var{opt} @gol
+-mmove-max=@var{bits} -mstore-max=@var{bits} @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
-mavx2 -mavx512f -mavx512pf -mavx512er -mavx512cd -mavx512vl @gol
-mavx512bw -mavx512dq -mavx512ifma -mavx512vbmi -msha -maes @gol
This option instructs GCC to use @var{opt}-bit vector width in instructions
instead of default on the selected platform.
+@item -mmove-max=@var{bits}
+@opindex mmove-max
+This option instructs GCC to set the maximum number of bits that can be
+moved from memory to memory efficiently to @var{bits}. The valid
+values of @var{bits} are 128, 256 and 512.
+
+@item -mstore-max=@var{bits}
+@opindex mstore-max
+This option instructs GCC to set the maximum number of bits that can be
+stored to memory efficiently to @var{bits}. The valid values of
+@var{bits} are 128, 256 and 512.
+
@table @samp
@item none
No extra limitations applied to GCC other than defined by the selected platform.
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mmove-max=512" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 2 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 2 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids -mmove-max=128 -mstore-max=128" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 8 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids -mmove-max=256 -mstore-max=256" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu(?:64|)\[ \\t\]+\[^\n\]*%ymm" 4 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=sapphirerapids -march=x86-64 -mavx2" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu(?:64|)\[ \\t\]+\[^\n\]*%ymm" 4 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mprefer-vector-width=256 -mavx512f -mtune-ctrl=avx512_store_by_pieces" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovw\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids -mstore-max=128" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu(?:8|)\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* { dg-final { scan-assembler-times "vmovw\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sapphirerapids -mstore-max=256" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu(?:8|)\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vmovw\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=sapphirerapids -march=x86-64 -mavx2" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu(?:8|)\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here. */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer. */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */