1. Don't generate the loop if the loop count is 1.
2. For memset with a vector value on a small size, use a narrower
vector if the size supports one; otherwise use the scalar value.
3. Always expand the vector version of memset for vector_loop.
4. For vector_loop, duplicate the promoted scalar value into a vector
when it is neither 0 nor -1.
5. Use a misaligned prologue if alignment isn't needed.  When the
misaligned prologue is used, check whether the destination is actually
aligned and, if so, update the destination alignment.
6. Use move_by_pieces and store_by_pieces for memcpy and memset
epilogues with a fixed epilogue size, enabling overlapping moves and
stores (see the sketch below).
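
As an illustration of 6 (not part of the patch; the function name is
hypothetical), a fixed-size tail can be covered by two overlapping
stores instead of a chain of power-of-two stores:

/* Hypothetical sketch: cover a 13-byte memset tail with two
   overlapping 8-byte stores, where 8-, 4- and 1-byte stores would
   otherwise be needed.  */
void
memset_tail_13 (char *p, int c)
{
  unsigned long long v = (unsigned char) c * 0x0101010101010101ULL;
  __builtin_memcpy (p, &v, 8);		/* bytes 0..7 */
  __builtin_memcpy (p + 13 - 8, &v, 8);	/* bytes 5..12, overlapping */
}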
The included tests show that codegen of vector_loop/unrolled_loop for
memset/memcpy is significantly improved.  For
void
foo (void *p1, size_t len)
{
__builtin_memset (p1, 0, len);
}
with
-O2 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64
we used to generate
foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L18
.L2:
andl $63, %esi
je .L1
xorl %edx, %edx
testb $1, %sil
je .L5
movl $1, %edx
movb $0, (%rax)
cmpq %rsi, %rdx
jnb .L19
.L5:
movb $0, (%rax,%rdx)
movb $0, 1(%rax,%rdx)
addq $2, %rdx
cmpq %rsi, %rdx
jb .L5
.L1:
ret
.p2align 4,,10
.p2align 3
.L18:
movq %rsi, %rdx
xorl %eax, %eax
andq $-64, %rdx
.L3:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rdx, %rax
jb .L3
addq %rdi, %rax
jmp .L2
.L19:
ret
.cfi_endproc
with a very poor prologue and epilogue.  With this patch, we now generate:
foo:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L2
testb $32, %sil
jne .L19
testb $16, %sil
jne .L20
testb $8, %sil
jne .L21
testb $4, %sil
jne .L22
testq %rsi, %rsi
jne .L23
.L1:
ret
.p2align 4,,10
.p2align 3
.L2:
movups %xmm0, -64(%rdi,%rsi)
movups %xmm0, -48(%rdi,%rsi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
subq $1, %rsi
cmpq $64, %rsi
jb .L1
andq $-64, %rsi
xorl %eax, %eax
.L9:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rsi, %rax
jb .L9
ret
.p2align 4,,10
.p2align 3
.L23:
movb $0, (%rdi)
testb $2, %sil
je .L1
xorl %eax, %eax
movw %ax, -2(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L19:
movups %xmm0, (%rdi)
movups %xmm0, 16(%rdi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L20:
movups %xmm0, (%rdi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L21:
movq $0, (%rdi)
movq $0, -8(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L22:
movl $0, (%rdi)
movl $0, -4(%rdi,%rsi)
ret
.cfi_endproc
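The shape of the new code corresponds roughly to the following C
sketch (illustrative only; the function and type names are
hypothetical, and the 16-byte vector and 64-byte loop threshold are
taken from the example above).  Sizes below the threshold are handled
by testing the size bits from high to low and emitting a pair of
overlapping stores per bit:

typedef char v16 __attribute__ ((vector_size (16)));

void
sketch (char *p, unsigned long n)
{
  v16 zero = { 0 };
  if (n >= 64)
    {
      /* Store the last 64 bytes with four overlapping vector stores,
	 then loop over the remaining 64-byte blocks (.L2/.L9 above).  */
    }
  else if (n & 32)
    {
      __builtin_memcpy (p, &zero, 16);
      __builtin_memcpy (p + 16, &zero, 16);
      __builtin_memcpy (p + n - 32, &zero, 16);
      __builtin_memcpy (p + n - 16, &zero, 16);
    }
  else if (n & 16)
    {
      __builtin_memcpy (p, &zero, 16);
      __builtin_memcpy (p + n - 16, &zero, 16);
    }
  /* ... and likewise for the 8-, 4-, 2- and 1-byte bits.  */
}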
gcc/
PR target/120670
PR target/120683
* config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
Don't generate the loop if the loop count is 1.
(expand_cpymem_epilogue): Use move_by_pieces.
(setmem_epilogue_gen_val): New.
(expand_setmem_epilogue): Use store_by_pieces.
(expand_small_cpymem_or_setmem): Choose the cpymem mode from
MOVE_MAX.  For memset with a vector value when the size is
smaller than the vector size, first try a narrower vector;
otherwise use the scalar value.
(promote_duplicated_reg): Duplicate the scalar value for vector.
(ix86_expand_set_or_cpymem): Always expand vector-version of
memset for vector_loop. Use misaligned prologue if alignment
isn't needed and destination isn't aligned. Always initialize
vec_promoted_val from the promoted scalar value for vector_loop.
gcc/testsuite/
PR target/120670
PR target/120683
* gcc.target/i386/auto-init-padding-9.c: Updated.
* gcc.target/i386/memcpy-strategy-12.c: Likewise.
* gcc.target/i386/memset-strategy-25.c: Likewise.
* gcc.target/i386/memset-strategy-29.c: Likewise.
* gcc.target/i386/memset-strategy-30.c: Likewise.
* gcc.target/i386/memset-strategy-31.c: Likewise.
* gcc.target/i386/memcpy-pr120683-1.c: New test.
* gcc.target/i386/memcpy-pr120683-2.c: Likewise.
* gcc.target/i386/memcpy-pr120683-3.c: Likewise.
* gcc.target/i386/memcpy-pr120683-4.c: Likewise.
* gcc.target/i386/memcpy-pr120683-5.c: Likewise.
* gcc.target/i386/memcpy-pr120683-6.c: Likewise.
* gcc.target/i386/memcpy-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-1.c: Likewise.
* gcc.target/i386/memset-pr120683-2.c: Likewise.
* gcc.target/i386/memset-pr120683-3.c: Likewise.
* gcc.target/i386/memset-pr120683-4.c: Likewise.
* gcc.target/i386/memset-pr120683-5.c: Likewise.
* gcc.target/i386/memset-pr120683-6.c: Likewise.
* gcc.target/i386/memset-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-8.c: Likewise.
* gcc.target/i386/memset-pr120683-9.c: Likewise.
* gcc.target/i386/memset-pr120683-10.c: Likewise.
* gcc.target/i386/memset-pr120683-11.c: Likewise.
* gcc.target/i386/memset-pr120683-12.c: Likewise.
* gcc.target/i386/memset-pr120683-13.c: Likewise.
* gcc.target/i386/memset-pr120683-14.c: Likewise.
* gcc.target/i386/memset-pr120683-15.c: Likewise.
* gcc.target/i386/memset-pr120683-16.c: Likewise.
* gcc.target/i386/memset-pr120683-17.c: Likewise.
* gcc.target/i386/memset-pr120683-18.c: Likewise.
* gcc.target/i386/memset-pr120683-19.c: Likewise.
* gcc.target/i386/memset-pr120683-20.c: Likewise.
* gcc.target/i386/memset-pr120683-21.c: Likewise.
* gcc.target/i386/memset-pr120683-22.c: Likewise.
* gcc.target/i386/memset-pr120683-23.c: Likewise.
rtx count, machine_mode mode, int unroll,
int expected_size, bool issetmem)
{
- rtx_code_label *out_label, *top_label;
+ rtx_code_label *out_label = nullptr;
+ rtx_code_label *top_label = nullptr;
rtx iter, tmp;
machine_mode iter_mode = counter_mode (count);
int piece_size_n = GET_MODE_SIZE (mode) * unroll;
rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
rtx size;
int i;
+ int loop_count;
- top_label = gen_label_rtx ();
- out_label = gen_label_rtx ();
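+ /* When the total count is a known constant, compute how many times
+ the unrolled loop would iterate; e.g. a count of 64 copied in
+ 16-byte pieces unrolled by 4 iterates exactly once.  */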
+ if (expected_size != -1 && CONST_INT_P (count))
+ loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
+ else
+ loop_count = -1;
+
+ /* Don't generate the loop if the loop count is 1. */
+ if (loop_count != 1)
+ {
+ top_label = gen_label_rtx ();
+ out_label = gen_label_rtx ();
+ }
iter = gen_reg_rtx (iter_mode);
size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
}
emit_move_insn (iter, const0_rtx);
- emit_label (top_label);
+ if (loop_count != 1)
+ emit_label (top_label);
tmp = convert_modes (Pmode, iter_mode, iter, true);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
- true, top_label);
- if (expected_size != -1)
+ if (loop_count != 1)
{
- expected_size /= GET_MODE_SIZE (mode) * unroll;
- if (expected_size == 0)
- predict_jump (0);
- else if (expected_size > REG_BR_PROB_BASE)
- predict_jump (REG_BR_PROB_BASE - 1);
+ emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ true, top_label);
+ if (expected_size != -1)
+ {
+ expected_size /= GET_MODE_SIZE (mode) * unroll;
+ if (expected_size == 0)
+ predict_jump (0);
+ else if (expected_size > REG_BR_PROB_BASE)
+ predict_jump (REG_BR_PROB_BASE - 1);
+ else
+ predict_jump (REG_BR_PROB_BASE
+ - (REG_BR_PROB_BASE + expected_size / 2)
+ / expected_size);
+ }
else
- predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
- / expected_size);
+ predict_jump (REG_BR_PROB_BASE * 80 / 100);
}
- else
- predict_jump (REG_BR_PROB_BASE * 80 / 100);
iter = ix86_zero_extend_to_Pmode (iter);
tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
true, OPTAB_LIB_WIDEN);
if (tmp != srcptr)
emit_move_insn (srcptr, tmp);
}
- emit_label (out_label);
+ if (loop_count != 1)
+ emit_label (out_label);
}
/* Divide COUNTREG by SCALE. */
rtx src, dest;
if (CONST_INT_P (count))
{
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but it'll require a bit more complicated epilogue
- expanding. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
- }
+ unsigned HOST_WIDE_INT countval = UINTVAL (count);
+ unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+ unsigned int destalign = MEM_ALIGN (destmem);
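+ /* Emit the fixed-size epilogue with move_by_pieces so it can use
+ wider, possibly overlapping, moves.  */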
+ move_by_pieces (destmem, srcmem, epilogue_size, destalign,
+ RETURN_BEGIN);
return;
}
if (max_size > 8)
1, max_size / 2, true);
}
+/* Callback routine for store_by_pieces.  Return the RTL of a register
+ containing GET_MODE_SIZE (MODE) bytes taken from the register OP_P,
+ which is a word or a word vector register.  If PREV_P isn't nullptr,
+ it has the RTL info from the previous iteration.  */
+
+static rtx
+setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
+ fixed_size_mode mode)
+{
+ rtx target;
+ by_pieces_prev *prev = (by_pieces_prev *) prev_p;
+ if (prev)
+ {
+ rtx prev_op = prev->data;
+ if (prev_op)
+ {
+ machine_mode prev_mode = GET_MODE (prev_op);
+ if (prev_mode == mode)
+ return prev_op;
+ if (VECTOR_MODE_P (prev_mode)
+ && VECTOR_MODE_P (mode)
+ && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
+ {
+ target = gen_rtx_SUBREG (mode, prev_op, 0);
+ return target;
+ }
+ }
+ }
+
+ rtx op = (rtx) op_p;
+ machine_mode op_mode = GET_MODE (op);
+
+ gcc_assert (op_mode == word_mode
+ || (VECTOR_MODE_P (op_mode)
+ && GET_MODE_INNER (op_mode) == word_mode));
+
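+ /* For a vector target mode, view OP as a QImode vector of the same
+ byte size, then move the low part into a register of MODE when a
+ narrower vector is requested.  */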
+ if (VECTOR_MODE_P (mode))
+ {
+ gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+ unsigned int op_size = GET_MODE_SIZE (op_mode);
+ unsigned int size = GET_MODE_SIZE (mode);
+ unsigned int nunits = op_size / GET_MODE_SIZE (QImode);
+ machine_mode vec_mode
+ = mode_for_vector (QImode, nunits).require ();
+ target = gen_reg_rtx (vec_mode);
+ op = gen_rtx_SUBREG (vec_mode, op, 0);
+ emit_move_insn (target, op);
+ if (op_size == size)
+ return target;
+
+ rtx tmp = gen_reg_rtx (mode);
+ target = gen_rtx_SUBREG (mode, target, 0);
+ emit_move_insn (tmp, target);
+ return tmp;
+ }
+
+ target = gen_reg_rtx (word_mode);
+ if (VECTOR_MODE_P (op_mode))
+ {
+ op = gen_rtx_SUBREG (word_mode, op, 0);
+ emit_move_insn (target, op);
+ }
+ else
+ target = op;
+
+ if (mode == word_mode)
+ return target;
+
+ rtx tmp = gen_reg_rtx (mode);
+ target = gen_rtx_SUBREG (mode, target, 0);
+ emit_move_insn (tmp, target);
+ return tmp;
+}
+
/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
if (CONST_INT_P (count))
{
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but it'll require a bit more complicated epilogue
- expanding. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- {
- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
- destmem = emit_memset (destmem, destptr, vec_value, i);
- else
- destmem = emit_memset (destmem, destptr, value, i);
- }
- }
+ unsigned HOST_WIDE_INT countval = UINTVAL (count);
+ unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+ unsigned int destalign = MEM_ALIGN (destmem);
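+ /* Emit the fixed-size epilogue with store_by_pieces so it can use
+ wider, possibly overlapping, stores.  */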
+ store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
+ vec_value ? vec_value : value, destalign, true,
+ RETURN_BEGIN);
return;
}
if (max_size > 32)
rtx_code_label *label = ix86_expand_aligntest (count, size, false);
machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
rtx modesize;
+ rtx scalar_value = value;
int n;
/* If we do not have vector value to copy, we must reduce size. */
{
/* Choose appropriate vector mode. */
if (size >= 32)
- mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
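+ /* MOVE_MAX is the widest move the target supports: 64 bytes with
+ AVX512, 32 with AVX, 16 with SSE and 8 otherwise.  */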
+ switch (MOVE_MAX)
+ {
+ case 64:
+ if (size >= 64)
+ {
+ mode = V64QImode;
+ break;
+ }
+ /* FALLTHRU */
+ case 32:
+ mode = V32QImode;
+ break;
+ case 16:
+ mode = V16QImode;
+ break;
+ case 8:
+ mode = DImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
else if (size >= 16)
mode = TARGET_SSE ? V16QImode : DImode;
srcmem = change_address (srcmem, mode, srcptr);
}
+ if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
+ {
+ /* For memset with a vector value when the size is smaller than the
+ vector size, first try a narrower vector; otherwise use the
+ original scalar value.  */
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
+ if (nunits > 1)
+ {
+ mode = mode_for_vector (GET_MODE_INNER (mode),
+ nunits).require ();
+ value = gen_rtx_SUBREG (mode, value, 0);
+ }
+ else
+ {
+ scalar_int_mode smode
+ = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
+ gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
+ >= GET_MODE_SIZE (smode));
+ mode = smode;
+ if (GET_MODE (scalar_value) == mode)
+ value = scalar_value;
+ else
+ value = gen_rtx_SUBREG (mode, scalar_value, 0);
+ }
+ }
destmem = change_address (destmem, mode, destptr);
modesize = GEN_INT (GET_MODE_SIZE (mode));
gcc_assert (GET_MODE_SIZE (mode) <= size);
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
+ if (val == const0_rtx)
+ return copy_to_mode_reg (mode, CONST0_RTX (mode));
+
machine_mode valmode = GET_MODE (val);
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* Duplicate the scalar value for integer vector. */
+ gcc_assert ((val == const0_rtx || val == constm1_rtx)
+ || GET_MODE_INNER (mode) == valmode);
+ rtx dup = gen_reg_rtx (mode);
+ bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
+ val);
+ gcc_assert (ok);
+ return dup;
+ }
+
rtx tmp;
int nops = mode == DImode ? 3 : 2;
- gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
- if (val == const0_rtx)
- return copy_to_mode_reg (mode, CONST0_RTX (mode));
+ gcc_assert (mode == SImode || mode == DImode);
if (CONST_INT_P (val))
{
HOST_WIDE_INT v = INTVAL (val) & 255;
return false;
gcc_assert (alg != no_stringop);
- /* For now vector-version of memset is generated only for memory zeroing, as
- creating of promoted vector value is very cheap in this case. */
- if (issetmem && alg == vector_loop && val_exp != const0_rtx)
- alg = unrolled_loop;
-
if (!count)
count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
&& ((desired_align > align && !align_bytes)
|| (!count && epilogue_size_needed > 1)));
+ /* Destination is aligned after the misaligned prologue. */
+ bool aligned_dstmem = misaligned_prologue_used;
+
+ if (noalign && !misaligned_prologue_used)
+ {
+ /* Also use the misaligned prologue if alignment isn't needed and
+ the destination isn't aligned.  Since alignment isn't needed,
+ the destination won't be aligned after the prologue.  */
+ aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+ <= MEM_ALIGN (dst));
+ if (!aligned_dstmem)
+ misaligned_prologue_used = true;
+ }
+
/* Do the cheap promotion to allow better CSE across the
main loop and epilogue (ie one load of the big constant in the
front of all code.
For now the misaligned move sequences do not have fast path
without broadcasting. */
- if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+ if (issetmem
+ && (alg == vector_loop
+ || CONST_INT_P (val_exp)
+ || misaligned_prologue_used))
{
if (alg == vector_loop)
{
- gcc_assert (val_exp == const0_rtx);
- vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
promoted_val = promote_duplicated_reg_to_size (val_exp,
GET_MODE_SIZE (word_mode),
desired_align, align);
+ /* Duplicate the promoted scalar value if it is neither 0 nor -1.  */
+ vec_promoted_val
+ = promote_duplicated_reg (move_mode,
+ (val_exp == const0_rtx
+ || val_exp == constm1_rtx)
+ ? val_exp : promoted_val);
}
else
{
if (!issetmem)
src = change_address (src, BLKmode, srcreg);
dst = change_address (dst, BLKmode, destreg);
- set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ if (aligned_dstmem)
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
epilogue_size_needed = 0;
if (need_zero_guard
&& min_size < (unsigned HOST_WIDE_INT) size_needed)
/*
**foo:
**...
+** leaq -160\(%rbp\), %rax
+** movq %rax, %rcx
** pxor %xmm0, %xmm0
-**...
+** movl \$160, %edx
+** movl %edx, %edi
+** andl \$-64, %edi
+** movl \$0, %esi
**.L[0-9]+:
-** movl %esi, %ecx
-** movaps %xmm0, \(%rdx,%rcx\)
-** movaps %xmm0, 16\(%rdx,%rcx\)
-** movaps %xmm0, 32\(%rdx,%rcx\)
-** movaps %xmm0, 48\(%rdx,%rcx\)
+** movl %esi, %edx
+** movaps %xmm0, \(%rax,%rdx\)
+** movaps %xmm0, 16\(%rax,%rdx\)
+** movaps %xmm0, 32\(%rax,%rdx\)
+** movaps %xmm0, 48\(%rax,%rdx\)
** addl \$64, %esi
** cmpl %edi, %esi
** jb .L[0-9]+
+** movl %esi, %eax
+** addq %rax, %rcx
+** movaps %xmm0, \(%rcx\)
+** movaps %xmm0, 16\(%rcx\)
+** movzbl -116\(%rbp\), %eax
+** movsbl %al, %eax
**...
*/
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq 221\(%rsi\), %rax
+** xorl %edx, %edx
+** movq %rax, 221\(%rdi\)
+** movq 229\(%rsi\), %rax
+** movq %rax, 229\(%rdi\)
+** movq 237\(%rsi\), %rax
+** movq %rax, 237\(%rdi\)
+** movq 245\(%rsi\), %rax
+** movq %rax, 245\(%rdi\)
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$32, %edx
+** movq \(%rsi,%rax\), %r10
+** movq 8\(%rsi,%rax\), %r9
+** movq 16\(%rsi,%rax\), %r8
+** movq 24\(%rsi,%rax\), %rcx
+** movq %r10, \(%rdi,%rax\)
+** movq %r9, 8\(%rdi,%rax\)
+** movq %r8, 16\(%rdi,%rax\)
+** movq %rcx, 24\(%rdi,%rax\)
+** cmpl \$224, %edx
+** jb .L[0-9]+
+** ret
+**...
+*/
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 253);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$64, %edx
+** movdqa src\(%rax\), %xmm3
+** movdqa src\+16\(%rax\), %xmm2
+** movdqa src\+32\(%rax\), %xmm1
+** movdqa src\+48\(%rax\), %xmm0
+** movaps %xmm3, dest\(%rax\)
+** movaps %xmm2, dest\+16\(%rax\)
+** movaps %xmm1, dest\+32\(%rax\)
+** movaps %xmm0, dest\+48\(%rax\)
+** cmpl \$256, %edx
+** jb .L[0-9]+
+** movdqa src\(%rdx\), %xmm0
+** movaps %xmm0, dest\(%rdx\)
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 16
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$64, %edx
+** movdqa src\(%rax\), %xmm3
+** movdqa src\+16\(%rax\), %xmm2
+** movdqa src\+32\(%rax\), %xmm1
+** movdqa src\+48\(%rax\), %xmm0
+** movaps %xmm3, dest\(%rax\)
+** movaps %xmm2, dest\+16\(%rax\)
+** movaps %xmm1, dest\+32\(%rax\)
+** movaps %xmm0, dest\+48\(%rax\)
+** cmpl \$256, %edx
+** jb .L[0-9]+
+** movdqa src\(%rdx\), %xmm0
+** movaps %xmm0, dest\(%rdx\)
+** movdqu src\+15\(%rdx\), %xmm0
+** movups %xmm0, dest\+15\(%rdx\)
+** ret
+**...
+*/
+
+#define SIZE 16 * 16 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** subl \$-128, %edx
+** vmovdqa src\(%rax\), %ymm3
+** vmovdqa src\+32\(%rax\), %ymm2
+** vmovdqa src\+64\(%rax\), %ymm1
+** vmovdqa src\+96\(%rax\), %ymm0
+** vmovdqa %ymm3, dest\(%rax\)
+** vmovdqa %ymm2, dest\+32\(%rax\)
+** vmovdqa %ymm1, dest\+64\(%rax\)
+** vmovdqa %ymm0, dest\+96\(%rax\)
+** cmpl \$512, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 32
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** subl \$-128, %edx
+** vmovdqa src\(%rax\), %ymm3
+** vmovdqa src\+32\(%rax\), %ymm2
+** vmovdqa src\+64\(%rax\), %ymm1
+** vmovdqa src\+96\(%rax\), %ymm0
+** vmovdqa %ymm3, dest\(%rax\)
+** vmovdqa %ymm2, dest\+32\(%rax\)
+** vmovdqa %ymm1, dest\+64\(%rax\)
+** vmovdqa %ymm0, dest\+96\(%rax\)
+** cmpl \$512, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vmovdqu src\+31\(%rdx\), %ymm0
+** vmovdqu %ymm0, dest\+31\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE 16 * 32 + 32 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$256, %edx
+** vmovdqa64 src\(%rax\), %zmm3
+** vmovdqa64 src\+64\(%rax\), %zmm2
+** vmovdqa64 src\+128\(%rax\), %zmm1
+** vmovdqa64 src\+192\(%rax\), %zmm0
+** vmovdqa64 %zmm3, dest\(%rax\)
+** vmovdqa64 %zmm2, dest\+64\(%rax\)
+** vmovdqa64 %zmm1, dest\+128\(%rax\)
+** vmovdqa64 %zmm0, dest\+192\(%rax\)
+** cmpl \$1024, %edx
+** jb .L[0-9]+
+** vmovdqa64 src\(%rdx\), %zmm0
+** vmovdqa64 %zmm0, dest\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 64
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$256, %edx
+** vmovdqa64 src\(%rax\), %zmm3
+** vmovdqa64 src\+64\(%rax\), %zmm2
+** vmovdqa64 src\+128\(%rax\), %zmm1
+** vmovdqa64 src\+192\(%rax\), %zmm0
+** vmovdqa64 %zmm3, dest\(%rax\)
+** vmovdqa64 %zmm2, dest\+64\(%rax\)
+** vmovdqa64 %zmm1, dest\+128\(%rax\)
+** vmovdqa64 %zmm0, dest\+192\(%rax\)
+** cmpl \$1024, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vmovdqu src\+31\(%rdx\), %ymm0
+** vmovdqu %ymm0, dest\+31\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE 16 * 64 + 63
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
/*
**foo:
**.LFB[0-9]+:
-**...
+** .cfi_startproc
+** movq 221\(%rsi\), %rax
** xorl %edx, %edx
-**...
+** movq %rax, 221\(%rdi\)
+** movq 229\(%rsi\), %rax
+** movq %rax, 229\(%rdi\)
+** movq 237\(%rsi\), %rax
+** movq %rax, 237\(%rdi\)
+** movq 245\(%rsi\), %rax
+** movq %rax, 245\(%rdi\)
**.L[0-9]+:
** movl %edx, %eax
** addl \$32, %edx
** movq %rcx, 24\(%rdi,%rax\)
** cmpl \$224, %edx
** jb .L[0-9]+
+** ret
**...
*/
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+** movups %xmm0, 190\(%rdi\)
+** movups %xmm0, 206\(%rdi\)
+** movups %xmm0, 222\(%rdi\)
+** movups %xmm0, 238\(%rdi\)
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movups %xmm0, \(%rdi,%rdx\)
+** movups %xmm0, 16\(%rdi,%rdx\)
+** movups %xmm0, 32\(%rdi,%rdx\)
+** movups %xmm0, 48\(%rdi,%rdx\)
+** cmpl \$192, %eax
+** jb .L[0-9]+
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \$0, 48\(%rdi\)
+** movq \$0, \(%rdi\)
+** movq \$0, 8\(%rdi\)
+** movq \$0, 16\(%rdi\)
+** movq \$0, 24\(%rdi\)
+** movq \$0, 32\(%rdi\)
+** movq \$0, 40\(%rdi\)
+** movq \$0, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** movq %rax, 48\(%rdi\)
+** movq %rax, \(%rdi\)
+** movq %rax, 8\(%rdi\)
+** movq %rax, 16\(%rdi\)
+** movq %rax, 24\(%rdi\)
+** movq %rax, 32\(%rdi\)
+** movq %rax, 40\(%rdi\)
+** movq %rax, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 4, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** movq %rsi, 48\(%rdi\)
+** movq %rsi, \(%rdi\)
+** movq %rsi, 8\(%rdi\)
+** movq %rsi, 16\(%rdi\)
+** movq %rsi, 24\(%rdi\)
+** movq %rsi, 32\(%rdi\)
+** movq %rsi, 40\(%rdi\)
+** movq %rsi, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest, int c)
+{
+ __builtin_memset (dest, c, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$192, %eax
+** jb .L[0-9]+
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[240];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L2
+** testb \$32, %sil
+** jne .L19
+** testb \$16, %sil
+** jne .L20
+** testb \$8, %sil
+** jne .L21
+** testb \$4, %sil
+** jne .L22
+** testq %rsi, %rsi
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rsi\)
+** movups %xmm0, -48\(%rdi,%rsi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$64, %rsi
+** jb .L1
+** andq \$-64, %rsi
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rsi, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L1
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** cmpq \$128, %rsi
+** jnb .L2
+** testb \$64, %sil
+** jne .L22
+** testb \$32, %sil
+** jne .L23
+** testb \$16, %sil
+** jne .L24
+** testb \$8, %sil
+** jne .L25
+** testb \$4, %sil
+** jne .L26
+** testq %rsi, %rsi
+** jne .L27
+**.L20:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rsi\)
+** vmovdqu %ymm0, -96\(%rdi,%rsi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$128, %rsi
+** jb .L19
+** andq \$-128, %rsi
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rsi, %rax
+** jb .L10
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L20
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** cmpq \$256, %rsi
+** jnb .L2
+** testb \$-128, %sil
+** jne .L23
+** testb \$64, %sil
+** jne .L24
+** testb \$32, %sil
+** jne .L25
+** testb \$16, %sil
+** jne .L26
+** testb \$8, %sil
+** jne .L27
+** testb \$4, %sil
+** jne .L28
+** testq %rsi, %rsi
+** jne .L29
+**.L21:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$256, %rsi
+** jb .L20
+** xorb %sil, %sil
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rsi, %rax
+** jb .L11
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L29:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L21
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movq \$0, dest\+48\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[184];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movaps %xmm0, dest\+32\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movl \$0, dest\+47\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movb \$0, dest\+48\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** vmovdqu %ymm0, 192\(%rdi\)
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, 64\(%rdi\)
+** vmovdqu %ymm0, 96\(%rdi\)
+** vmovdqu %ymm0, 128\(%rdi\)
+** vmovdqu %ymm0, 160\(%rdi\)
+** vmovdqu %ymm0, 222\(%rdi\)
+** vzeroupper
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movd %edi, %xmm0
+** punpcklbw %xmm0, %xmm0
+** punpcklwd %xmm0, %xmm0
+** pshufd \$0, %xmm0, %xmm0
+** movaps %xmm0, dest\+160\(%rip\)
+** movaps %xmm0, dest\(%rip\)
+** movaps %xmm0, dest\+16\(%rip\)
+** movaps %xmm0, dest\+32\(%rip\)
+** movaps %xmm0, dest\+48\(%rip\)
+** movaps %xmm0, dest\+64\(%rip\)
+** movaps %xmm0, dest\+80\(%rip\)
+** movaps %xmm0, dest\+96\(%rip\)
+** movaps %xmm0, dest\+112\(%rip\)
+** movaps %xmm0, dest\+128\(%rip\)
+** movaps %xmm0, dest\+144\(%rip\)
+** movd %xmm0, dest\+175\(%rip\)
+** ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (int c)
+{
+ __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movd %edi, %xmm0
+** movb %dil, dest\+176\(%rip\)
+** punpcklbw %xmm0, %xmm0
+** punpcklwd %xmm0, %xmm0
+** pshufd \$0, %xmm0, %xmm0
+** movaps %xmm0, dest\(%rip\)
+** movaps %xmm0, dest\+16\(%rip\)
+** movaps %xmm0, dest\+32\(%rip\)
+** movaps %xmm0, dest\+48\(%rip\)
+** movaps %xmm0, dest\+64\(%rip\)
+** movaps %xmm0, dest\+80\(%rip\)
+** movaps %xmm0, dest\+96\(%rip\)
+** movaps %xmm0, dest\+112\(%rip\)
+** movaps %xmm0, dest\+128\(%rip\)
+** movaps %xmm0, dest\+144\(%rip\)
+** movaps %xmm0, dest\+160\(%rip\)
+** ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (int c)
+{
+ __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=rep_8byte:8192:align,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \$25, %ecx
+** xorl %eax, %eax
+** movl \$dest, %edi
+** rep stosq
+** movl \$0, \(%rdi\)
+** ret
+**...
+*/
+
+#define SIZE 204
+
+char dest[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movzbl %dil, %edi
+** movl \$p, %eax
+** movabsq \$72340172838076673, %rdx
+** imulq %rdx, %rdi
+** movq %rdi, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L18
+**.L2:
+** movq %rsi, %rcx
+** andl \$63, %ecx
+** je .L1
+** xorl %edx, %edx
+** andl \$1, %esi
+** je .L5
+** movl \$1, %edx
+** movb %dil, \(%rax\)
+** cmpq %rcx, %rdx
+** jnb .L19
+**.L5:
+** movb %dil, \(%rax,%rdx\)
+** movb %dil, 1\(%rax,%rdx\)
+** addq \$2, %rdx
+** cmpq %rcx, %rdx
+** jb .L5
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L18:
+** movq %rsi, %rdx
+** xorl %eax, %eax
+** andq \$-64, %rdx
+**.L3:
+** movaps %xmm0, p\(%rax\)
+** addq \$64, %rax
+** movaps %xmm0, p-48\(%rax\)
+** movaps %xmm0, p-32\(%rax\)
+** movaps %xmm0, p-16\(%rax\)
+** cmpq %rdx, %rax
+** jb .L3
+** addq \$p, %rax
+** jmp .L2
+**.L19:
+** ret
+** .cfi_endproc
+**...
+*/
+
+
+#define WRITE_CHUNK 256
+char p[WRITE_CHUNK];
+
+void
+foo (int c, __SIZE_TYPE__ nbyte)
+{
+ __builtin_memset (p, c, nbyte);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** vmovdqu8 %zmm0, 128\(%rdi\)
+** vmovdqu8 %zmm0, \(%rdi\)
+** vmovdqu8 %zmm0, 64\(%rdi\)
+** vmovdqu8 %zmm0, 190\(%rdi\)
+** vzeroupper
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** movq %rax, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L2
+** testb \$32, %sil
+** jne .L19
+** testb \$16, %sil
+** jne .L20
+** testb \$8, %sil
+** jne .L21
+** testb \$4, %sil
+** jne .L22
+** testq %rsi, %rsi
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rsi\)
+** movups %xmm0, -48\(%rdi,%rsi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$64, %rsi
+** jb .L1
+** andq \$-64, %rsi
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rsi, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L1
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** vmovq %rax, %xmm1
+** vpbroadcastq %xmm1, %ymm0
+** cmpq \$128, %rsi
+** jnb .L2
+** testb \$64, %sil
+** jne .L21
+** testb \$32, %sil
+** jne .L22
+** testb \$16, %sil
+** jne .L23
+** testb \$8, %sil
+** jne .L24
+** testb \$4, %sil
+** jne .L25
+** testq %rsi, %rsi
+** jne .L26
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rsi\)
+** vmovdqu %ymm0, -96\(%rdi,%rsi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$128, %rsi
+** jb .L19
+** andq \$-128, %rsi
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rsi, %rax
+** jb .L10
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L19
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** jmp .L19
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** vpbroadcastq %rax, %zmm0
+** cmpq \$256, %rsi
+** jnb .L2
+** testb \$-128, %sil
+** jne .L22
+** testb \$64, %sil
+** jne .L23
+** testb \$32, %sil
+** jne .L24
+** testb \$16, %sil
+** jne .L25
+** testb \$8, %sil
+** jne .L26
+** testb \$4, %sil
+** jne .L27
+** testq %rsi, %rsi
+** jne .L28
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$256, %rsi
+** jb .L20
+** xorb %sil, %sil
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rsi, %rax
+** jb .L11
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L20
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** jmp .L20
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** movq %rsi, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rdx
+** jnb .L2
+** testb \$32, %dl
+** jne .L19
+** testb \$16, %dl
+** jne .L20
+** testb \$8, %dl
+** jne .L21
+** testb \$4, %dl
+** jne .L22
+** testq %rdx, %rdx
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rdx\)
+** movups %xmm0, -48\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$64, %rdx
+** jb .L1
+** andq \$-64, %rdx
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rdx, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L1
+** movw %si, -2\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** vmovq %rsi, %xmm1
+** vpbroadcastq %xmm1, %ymm0
+** cmpq \$128, %rdx
+** jnb .L2
+** testb \$64, %dl
+** jne .L21
+** testb \$32, %dl
+** jne .L22
+** testb \$16, %dl
+** jne .L23
+** testb \$8, %dl
+** jne .L24
+** testb \$4, %dl
+** jne .L25
+** testq %rdx, %rdx
+** jne .L26
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vmovdqu %ymm0, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$128, %rdx
+** jb .L19
+** andq \$-128, %rdx
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rdx, %rax
+** jb .L10
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L19
+** movw %si, -2\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** jmp .L19
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** vpbroadcastq %rsi, %zmm0
+** cmpq \$256, %rdx
+** jnb .L2
+** testb \$-128, %dl
+** jne .L22
+** testb \$64, %dl
+** jne .L23
+** testb \$32, %dl
+** jne .L24
+** testb \$16, %dl
+** jne .L25
+** testb \$8, %dl
+** jne .L26
+** testb \$4, %dl
+** jne .L27
+** testq %rdx, %rdx
+** jne .L28
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$256, %rdx
+** jb .L20
+** xorb %dl, %dl
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rdx, %rax
+** jb .L11
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L20
+** movw %si, -2\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** jmp .L20
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
**foo:
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 221\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 229\(%rdi\)
+** movq \$0, 237\(%rdi\)
+** movq \$0, 245\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$224, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 49\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 57\(%rdi\)
+** movq \$0, 65\(%rdi\)
+** movq \$0, 73\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$64, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 63\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 71\(%rdi\)
+** movq \$0, 79\(%rdi\)
+** movq \$0, 87\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$64, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
** pxor %xmm0, %xmm0
** xorl %eax, %eax
+** movups %xmm0, 190\(%rdi\)
+** movups %xmm0, 206\(%rdi\)
+** movups %xmm0, 222\(%rdi\)
+** movups %xmm0, 238\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$64, %eax