Since
commit
401199377c50045ede560daf3f6e8b51749c2a87
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jun 17 10:17:17 2025 +0800
x86: Improve vector_loop/unrolled_loop for memset/memcpy
uses move_by_pieces and store_by_pieces to expand memcpy/memset epilogue
with vector_loop even when targetm.use_by_pieces_infrastructure_p returns
false, which triggers
gcc_assert (targetm.use_by_pieces_infrastructure_p
(len, align,
memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
optimize_insn_for_speed_p ()));
in store_by_pieces, fix it by:
1. Adding by_pieces_in_use to machine_function to indicate that a by_pieces
op is currently in use.
2. Setting and clearing by_pieces_in_use when expanding the memcpy/memset
epilogue with move_by_pieces and store_by_pieces.
3. Defining TARGET_USE_BY_PIECES_INFRASTRUCTURE_P to return true if
by_pieces_in_use is true.
gcc/
PR target/121096
* config/i386/i386-expand.cc (expand_cpymem_epilogue): Set and
clear by_pieces_in_use when using by_pieces op.
(expand_setmem_epilogue): Likewise.
* config/i386/i386.cc (ix86_use_by_pieces_infrastructure_p): New.
(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Likewise.
* config/i386/i386.h (machine_function): Add by_pieces_in_use.
gcc/testsuite/
PR target/121096
* gcc.target/i386/memcpy-strategy-14.c: New test.
* gcc.target/i386/memcpy-strategy-15.c: Likewise.
* gcc.target/i386/memset-strategy-10.c: Likewise.
* gcc.target/i386/memset-strategy-11.c: Likewise.
* gcc.target/i386/memset-strategy-12.c: Likewise.
* gcc.target/i386/memset-strategy-13.c: Likewise.
* gcc.target/i386/memset-strategy-14.c: Likewise.
* gcc.target/i386/memset-strategy-15.c: Likewise.
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
move_by_pieces (destmem, srcmem, epilogue_size, destalign,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 8)
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
vec_value ? vec_value : value, destalign, true,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 32)
return cost;
}
+
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.
+   Return true unconditionally while cfun->machine->by_pieces_in_use is
+   set.  Otherwise forward SIZE, ALIGN, OP and SPEED_P unchanged to
+   default_use_by_pieces_infrastructure_p and return its answer. */
+
+bool
+ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
+ unsigned int align,
+ enum by_pieces_operation op,
+ bool speed_p)
+{
+ /* Return true when we are currently expanding memcpy/memset epilogue
+ with move_by_pieces or store_by_pieces.  The flag is checked before
+ any of the arguments are examined, so it overrides the default
+ answer regardless of SIZE, ALIGN, OP or SPEED_P. */
+ if (cfun->machine->by_pieces_in_use)
+ return true;
+
+ return default_use_by_pieces_infrastructure_p (size, align, op,
+ speed_p);
+}
\f
/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used to form addresses to local data when -fPIC is in
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+ ix86_use_by_pieces_infrastructure_p
+
#undef TARGET_OVERLAP_OP_BY_PIECES_P
#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
/* True if this is a recursive function. */
BOOL_BITFIELD recursive_function : 1;
+ /* True if by_pieces op is currently in use. */
+ BOOL_BITFIELD by_pieces_in_use : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movaps" 8 } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+ __builtin_memcpy (a, b, 2048);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movups" 8 } } */
+
+char *a;
+char *b;
+void t (void)
+{
+ __builtin_memcpy (a, b, 2048);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorps %xmm0, %xmm0
+** xorl %eax, %eax
+** movq %rax, 48\(%(e|r)di\)
+** movups %xmm0, \(%(e|r)di\)
+** movups %xmm0, 16\(%(e|r)di\)
+** movups %xmm0, 32\(%(e|r)di\)
+** ret
+**...
+*/
+
+void
+foo (char *a)
+{
+ __builtin_memset (a, 0, 56);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -minline-all-stringops -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movaps" 4 } } */
+
+char a[2048];
+void t (void)
+{
+ __builtin_memset (a, 0, 2048);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-sse -mstringop-strategy=vector_loop" } */
+
+void
+foo (char *a)
+{
+ __builtin_memset (a, 0, 56);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-sse -mstringop-strategy=unrolled_loop" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %eax, %eax
+** movq %rax, \(%(e|r)di\)
+** movq %rax, 8\(%(e|r)di\)
+** movq %rax, 16\(%(e|r)di\)
+** movq %rax, 24\(%(e|r)di\)
+** movq %rax, 32\(%(e|r)di\)
+** movq %rax, 40\(%(e|r)di\)
+** movq %rax, 48\(%(e|r)di\)
+** ret
+**...
+*/
+
+void
+foo (char *a)
+{
+ __builtin_memset (a, 0, 56);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -march=x86-64 -mstringop-strategy=vector_loop" } */
+
+void
+foo (char *a, int c)
+{
+ __builtin_memset (a, c, 56);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os -mno-avx -msse2 -mtune=generic -mstringop-strategy=vector_loop" } */
+/* { dg-final { scan-assembler-times "movups" 4} } */
+
+char *a;
+void t (void)
+{
+ __builtin_memset (a, 0, 2048);
+}