1. Don't generate the loop if the loop count is 1.
2. For memset with a vector value on a small size, use a narrower
vector if the size supports one; otherwise use the scalar value.
3. Always expand the vector version of memset for vector_loop.
4. For vector_loop, duplicate the promoted scalar value into a vector
when it is neither 0 nor -1.
5. Use a misaligned prologue if alignment isn't needed.  When the
misaligned prologue is used, check whether the destination is actually
aligned and, if so, update the destination alignment.
6. Use move_by_pieces and store_by_pieces for memcpy and memset
epilogues with a fixed epilogue size, enabling overlapping moves and
stores (see the sketch below).
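
As an illustration of 6 (not part of the patch; the function name is
hypothetical), a fixed-size tail can be covered by two overlapping
stores instead of a chain of power-of-two stores:

/* Hypothetical sketch: cover a 13-byte memset tail with two
   overlapping 8-byte stores, where 8-, 4- and 1-byte stores would
   otherwise be needed.  */
void
memset_tail_13 (char *p, int c)
{
  unsigned long long v = (unsigned char) c * 0x0101010101010101ULL;
  __builtin_memcpy (p, &v, 8);		/* bytes 0..7 */
  __builtin_memcpy (p + 13 - 8, &v, 8);	/* bytes 5..12, overlapping */
}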
The included tests show that codegen of vector_loop/unrolled_loop for
memset/memcpy is significantly improved.  For
void
foo (void *p1, size_t len)
{
__builtin_memset (p1, 0, len);
}
with
-O2 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64
we used to generate
foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L18
.L2:
andl $63, %esi
je .L1
xorl %edx, %edx
testb $1, %sil
je .L5
movl $1, %edx
movb $0, (%rax)
cmpq %rsi, %rdx
jnb .L19
.L5:
movb $0, (%rax,%rdx)
movb $0, 1(%rax,%rdx)
addq $2, %rdx
cmpq %rsi, %rdx
jb .L5
.L1:
ret
.p2align 4,,10
.p2align 3
.L18:
movq %rsi, %rdx
xorl %eax, %eax
andq $-64, %rdx
.L3:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rdx, %rax
jb .L3
addq %rdi, %rax
jmp .L2
.L19:
ret
.cfi_endproc
with a very poor prologue and epilogue.  With this patch, we now generate:
foo:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
cmpq $64, %rsi
jnb .L2
testb $32, %sil
jne .L19
testb $16, %sil
jne .L20
testb $8, %sil
jne .L21
testb $4, %sil
jne .L22
testq %rsi, %rsi
jne .L23
.L1:
ret
.p2align 4,,10
.p2align 3
.L2:
movups %xmm0, -64(%rdi,%rsi)
movups %xmm0, -48(%rdi,%rsi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
subq $1, %rsi
cmpq $64, %rsi
jb .L1
andq $-64, %rsi
xorl %eax, %eax
.L9:
movups %xmm0, (%rdi,%rax)
movups %xmm0, 16(%rdi,%rax)
movups %xmm0, 32(%rdi,%rax)
movups %xmm0, 48(%rdi,%rax)
addq $64, %rax
cmpq %rsi, %rax
jb .L9
ret
.p2align 4,,10
.p2align 3
.L23:
movb $0, (%rdi)
testb $2, %sil
je .L1
xorl %eax, %eax
movw %ax, -2(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L19:
movups %xmm0, (%rdi)
movups %xmm0, 16(%rdi)
movups %xmm0, -32(%rdi,%rsi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L20:
movups %xmm0, (%rdi)
movups %xmm0, -16(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L21:
movq $0, (%rdi)
movq $0, -8(%rdi,%rsi)
ret
.p2align 4,,10
.p2align 3
.L22:
movl $0, (%rdi)
movl $0, -4(%rdi,%rsi)
ret
.cfi_endproc
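The shape of the new code corresponds roughly to the following C
sketch (illustrative only; the function and type names are
hypothetical, and the 16-byte vector and 64-byte loop threshold are
taken from the example above).  Sizes below the threshold are handled
by testing the size bits from high to low and emitting a pair of
overlapping stores per bit:

typedef char v16 __attribute__ ((vector_size (16)));

void
sketch (char *p, unsigned long n)
{
  v16 zero = { 0 };
  if (n >= 64)
    {
      /* Store the last 64 bytes with four overlapping vector stores,
	 then loop over the remaining 64-byte blocks (.L2/.L9 above).  */
    }
  else if (n & 32)
    {
      __builtin_memcpy (p, &zero, 16);
      __builtin_memcpy (p + 16, &zero, 16);
      __builtin_memcpy (p + n - 32, &zero, 16);
      __builtin_memcpy (p + n - 16, &zero, 16);
    }
  else if (n & 16)
    {
      __builtin_memcpy (p, &zero, 16);
      __builtin_memcpy (p + n - 16, &zero, 16);
    }
  /* ... and likewise for the 8-, 4-, 2- and 1-byte bits.  */
}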
gcc/
PR target/120670
PR target/120683
* config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
Don't generate the loop if the loop count is 1.
(expand_cpymem_epilogue): Use move_by_pieces.
(setmem_epilogue_gen_val): New.
(expand_setmem_epilogue): Use store_by_pieces.
(expand_small_cpymem_or_setmem): Choose the cpymem mode from
MOVE_MAX.  For memset with a vector value when the size is
smaller than the vector size, first try a narrower vector;
otherwise use the scalar value.
(promote_duplicated_reg): Duplicate the scalar value for vector.
(ix86_expand_set_or_cpymem): Always expand vector-version of
memset for vector_loop. Use misaligned prologue if alignment
isn't needed and destination isn't aligned. Always initialize
vec_promoted_val from the promoted scalar value for vector_loop.
gcc/testsuite/
PR target/120670
PR target/120683
* gcc.target/i386/auto-init-padding-9.c: Updated.
* gcc.target/i386/memcpy-strategy-12.c: Likewise.
* gcc.target/i386/memset-strategy-25.c: Likewise.
* gcc.target/i386/memset-strategy-29.c: Likewise.
* gcc.target/i386/memset-strategy-30.c: Likewise.
* gcc.target/i386/memset-strategy-31.c: Likewise.
* gcc.target/i386/memcpy-pr120683-1.c: New test.
* gcc.target/i386/memcpy-pr120683-2.c: Likewise.
* gcc.target/i386/memcpy-pr120683-3.c: Likewise.
* gcc.target/i386/memcpy-pr120683-4.c: Likewise.
* gcc.target/i386/memcpy-pr120683-5.c: Likewise.
* gcc.target/i386/memcpy-pr120683-6.c: Likewise.
* gcc.target/i386/memcpy-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-1.c: Likewise.
* gcc.target/i386/memset-pr120683-2.c: Likewise.
* gcc.target/i386/memset-pr120683-3.c: Likewise.
* gcc.target/i386/memset-pr120683-4.c: Likewise.
* gcc.target/i386/memset-pr120683-5.c: Likewise.
* gcc.target/i386/memset-pr120683-6.c: Likewise.
* gcc.target/i386/memset-pr120683-7.c: Likewise.
* gcc.target/i386/memset-pr120683-8.c: Likewise.
* gcc.target/i386/memset-pr120683-9.c: Likewise.
* gcc.target/i386/memset-pr120683-10.c: Likewise.
* gcc.target/i386/memset-pr120683-11.c: Likewise.
* gcc.target/i386/memset-pr120683-12.c: Likewise.
* gcc.target/i386/memset-pr120683-13.c: Likewise.
* gcc.target/i386/memset-pr120683-14.c: Likewise.
* gcc.target/i386/memset-pr120683-15.c: Likewise.
* gcc.target/i386/memset-pr120683-16.c: Likewise.
* gcc.target/i386/memset-pr120683-17.c: Likewise.
* gcc.target/i386/memset-pr120683-18.c: Likewise.
* gcc.target/i386/memset-pr120683-19.c: Likewise.
* gcc.target/i386/memset-pr120683-20.c: Likewise.
* gcc.target/i386/memset-pr120683-21.c: Likewise.
* gcc.target/i386/memset-pr120683-22.c: Likewise.
* gcc.target/i386/memset-pr120683-23.c: Likewise.
rtx count, machine_mode mode, int unroll,
int expected_size, bool issetmem)
{
- rtx_code_label *out_label, *top_label;
+ rtx_code_label *out_label = nullptr;
+ rtx_code_label *top_label = nullptr;
rtx iter, tmp;
machine_mode iter_mode = counter_mode (count);
int piece_size_n = GET_MODE_SIZE (mode) * unroll;
rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
rtx size;
int i;
+ int loop_count;
- top_label = gen_label_rtx ();
- out_label = gen_label_rtx ();
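+ /* When the total count is a known constant, compute how many times
+ the unrolled loop would iterate; e.g. a count of 64 copied in
+ 16-byte pieces unrolled by 4 iterates exactly once.  */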
+ if (expected_size != -1 && CONST_INT_P (count))
+ loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
+ else
+ loop_count = -1;
+
+ /* Don't generate the loop if the loop count is 1. */
+ if (loop_count != 1)
+ {
+ top_label = gen_label_rtx ();
+ out_label = gen_label_rtx ();
+ }
iter = gen_reg_rtx (iter_mode);
size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
}
emit_move_insn (iter, const0_rtx);
- emit_label (top_label);
+ if (loop_count != 1)
+ emit_label (top_label);
tmp = convert_modes (Pmode, iter_mode, iter, true);
if (tmp != iter)
emit_move_insn (iter, tmp);
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
- true, top_label);
- if (expected_size != -1)
+ if (loop_count != 1)
{
- expected_size /= GET_MODE_SIZE (mode) * unroll;
- if (expected_size == 0)
- predict_jump (0);
- else if (expected_size > REG_BR_PROB_BASE)
- predict_jump (REG_BR_PROB_BASE - 1);
+ emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ true, top_label);
+ if (expected_size != -1)
+ {
+ expected_size /= GET_MODE_SIZE (mode) * unroll;
+ if (expected_size == 0)
+ predict_jump (0);
+ else if (expected_size > REG_BR_PROB_BASE)
+ predict_jump (REG_BR_PROB_BASE - 1);
+ else
+ predict_jump (REG_BR_PROB_BASE
+ - (REG_BR_PROB_BASE + expected_size / 2)
+ / expected_size);
+ }
else
- predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
- / expected_size);
+ predict_jump (REG_BR_PROB_BASE * 80 / 100);
}
- else
- predict_jump (REG_BR_PROB_BASE * 80 / 100);
iter = ix86_zero_extend_to_Pmode (iter);
tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
true, OPTAB_LIB_WIDEN);
if (tmp != srcptr)
emit_move_insn (srcptr, tmp);
}
- emit_label (out_label);
+ if (loop_count != 1)
+ emit_label (out_label);
}
/* Divide COUNTREG by SCALE. */
rtx src, dest;
if (CONST_INT_P (count))
{
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but it'll require a bit more complicated epilogue
- expanding. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
- }
+ unsigned HOST_WIDE_INT countval = UINTVAL (count);
+ unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+ unsigned int destalign = MEM_ALIGN (destmem);
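+ /* Emit the fixed-size epilogue with move_by_pieces so it can use
+ wider, possibly overlapping, moves.  */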
+ move_by_pieces (destmem, srcmem, epilogue_size, destalign,
+ RETURN_BEGIN);
return;
}
if (max_size > 8)
1, max_size / 2, true);
}
+/* Callback routine for store_by_pieces.  Return the RTL of a register
+ containing GET_MODE_SIZE (MODE) bytes taken from the register OP_P,
+ which is a word or a word vector register.  If PREV_P isn't nullptr,
+ it has the RTL info from the previous iteration.  */
+
+static rtx
+setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
+ fixed_size_mode mode)
+{
+ rtx target;
+ by_pieces_prev *prev = (by_pieces_prev *) prev_p;
+ if (prev)
+ {
+ rtx prev_op = prev->data;
+ if (prev_op)
+ {
+ machine_mode prev_mode = GET_MODE (prev_op);
+ if (prev_mode == mode)
+ return prev_op;
+ if (VECTOR_MODE_P (prev_mode)
+ && VECTOR_MODE_P (mode)
+ && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
+ {
+ target = gen_rtx_SUBREG (mode, prev_op, 0);
+ return target;
+ }
+ }
+ }
+
+ rtx op = (rtx) op_p;
+ machine_mode op_mode = GET_MODE (op);
+
+ gcc_assert (op_mode == word_mode
+ || (VECTOR_MODE_P (op_mode)
+ && GET_MODE_INNER (op_mode) == word_mode));
+
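+ /* For a vector target mode, view OP as a QImode vector of the same
+ byte size, then move the low part into a register of MODE when a
+ narrower vector is requested.  */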
+ if (VECTOR_MODE_P (mode))
+ {
+ gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+ unsigned int op_size = GET_MODE_SIZE (op_mode);
+ unsigned int size = GET_MODE_SIZE (mode);
+ unsigned int nunits = op_size / GET_MODE_SIZE (QImode);
+ machine_mode vec_mode
+ = mode_for_vector (QImode, nunits).require ();
+ target = gen_reg_rtx (vec_mode);
+ op = gen_rtx_SUBREG (vec_mode, op, 0);
+ emit_move_insn (target, op);
+ if (op_size == size)
+ return target;
+
+ rtx tmp = gen_reg_rtx (mode);
+ target = gen_rtx_SUBREG (mode, target, 0);
+ emit_move_insn (tmp, target);
+ return tmp;
+ }
+
+ target = gen_reg_rtx (word_mode);
+ if (VECTOR_MODE_P (op_mode))
+ {
+ op = gen_rtx_SUBREG (word_mode, op, 0);
+ emit_move_insn (target, op);
+ }
+ else
+ target = op;
+
+ if (mode == word_mode)
+ return target;
+
+ rtx tmp = gen_reg_rtx (mode);
+ target = gen_rtx_SUBREG (mode, target, 0);
+ emit_move_insn (tmp, target);
+ return tmp;
+}
+
/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
if (CONST_INT_P (count))
{
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but it'll require a bit more complicated epilogue
- expanding. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- {
- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
- destmem = emit_memset (destmem, destptr, vec_value, i);
- else
- destmem = emit_memset (destmem, destptr, value, i);
- }
- }
+ unsigned HOST_WIDE_INT countval = UINTVAL (count);
+ unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+ unsigned int destalign = MEM_ALIGN (destmem);
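+ /* Emit the fixed-size epilogue with store_by_pieces so it can use
+ wider, possibly overlapping, stores.  */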
+ store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
+ vec_value ? vec_value : value, destalign, true,
+ RETURN_BEGIN);
return;
}
if (max_size > 32)
rtx_code_label *label = ix86_expand_aligntest (count, size, false);
machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
rtx modesize;
+ rtx scalar_value = value;
int n;
/* If we do not have vector value to copy, we must reduce size. */
{
/* Choose appropriate vector mode. */
if (size >= 32)
- mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
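+ /* MOVE_MAX is the widest move the target supports: 64 bytes with
+ AVX512, 32 with AVX, 16 with SSE and 8 otherwise.  */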
+ switch (MOVE_MAX)
+ {
+ case 64:
+ if (size >= 64)
+ {
+ mode = V64QImode;
+ break;
+ }
+ /* FALLTHRU */
+ case 32:
+ mode = V32QImode;
+ break;
+ case 16:
+ mode = V16QImode;
+ break;
+ case 8:
+ mode = DImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
else if (size >= 16)
mode = TARGET_SSE ? V16QImode : DImode;
srcmem = change_address (srcmem, mode, srcptr);
}
+ if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
+ {
+ /* For memset with a vector value when the size is smaller than the
+ vector size, first try a narrower vector; otherwise use the
+ original scalar value.  */
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
+ if (nunits > 1)
+ {
+ mode = mode_for_vector (GET_MODE_INNER (mode),
+ nunits).require ();
+ value = gen_rtx_SUBREG (mode, value, 0);
+ }
+ else
+ {
+ scalar_int_mode smode
+ = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
+ gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
+ >= GET_MODE_SIZE (smode));
+ mode = smode;
+ if (GET_MODE (scalar_value) == mode)
+ value = scalar_value;
+ else
+ value = gen_rtx_SUBREG (mode, scalar_value, 0);
+ }
+ }
destmem = change_address (destmem, mode, destptr);
modesize = GEN_INT (GET_MODE_SIZE (mode));
gcc_assert (GET_MODE_SIZE (mode) <= size);
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
+ if (val == const0_rtx)
+ return copy_to_mode_reg (mode, CONST0_RTX (mode));
+
machine_mode valmode = GET_MODE (val);
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* Duplicate the scalar value for integer vector. */
+ gcc_assert ((val == const0_rtx || val == constm1_rtx)
+ || GET_MODE_INNER (mode) == valmode);
+ rtx dup = gen_reg_rtx (mode);
+ bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
+ val);
+ gcc_assert (ok);
+ return dup;
+ }
+
rtx tmp;
int nops = mode == DImode ? 3 : 2;
- gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
- if (val == const0_rtx)
- return copy_to_mode_reg (mode, CONST0_RTX (mode));
+ gcc_assert (mode == SImode || mode == DImode);
if (CONST_INT_P (val))
{
HOST_WIDE_INT v = INTVAL (val) & 255;
return false;
gcc_assert (alg != no_stringop);
- /* For now vector-version of memset is generated only for memory zeroing, as
- creating of promoted vector value is very cheap in this case. */
- if (issetmem && alg == vector_loop && val_exp != const0_rtx)
- alg = unrolled_loop;
-
if (!count)
count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
&& ((desired_align > align && !align_bytes)
|| (!count && epilogue_size_needed > 1)));
+ /* Destination is aligned after the misaligned prologue. */
+ bool aligned_dstmem = misaligned_prologue_used;
+
+ if (noalign && !misaligned_prologue_used)
+ {
+ /* Also use the misaligned prologue if alignment isn't needed and
+ the destination isn't aligned.  Since alignment isn't needed,
+ the destination won't be aligned after the prologue.  */
+ aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+ <= MEM_ALIGN (dst));
+ if (!aligned_dstmem)
+ misaligned_prologue_used = true;
+ }
+
/* Do the cheap promotion to allow better CSE across the
main loop and epilogue (ie one load of the big constant in the
front of all code.
For now the misaligned move sequences do not have fast path
without broadcasting. */
- if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+ if (issetmem
+ && (alg == vector_loop
+ || CONST_INT_P (val_exp)
+ || misaligned_prologue_used))
{
if (alg == vector_loop)
{
- gcc_assert (val_exp == const0_rtx);
- vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
promoted_val = promote_duplicated_reg_to_size (val_exp,
GET_MODE_SIZE (word_mode),
desired_align, align);
+ /* Duplicate the promoted scalar value if it is neither 0 nor -1.  */
+ vec_promoted_val
+ = promote_duplicated_reg (move_mode,
+ (val_exp == const0_rtx
+ || val_exp == constm1_rtx)
+ ? val_exp : promoted_val);
}
else
{
if (!issetmem)
src = change_address (src, BLKmode, srcreg);
dst = change_address (dst, BLKmode, destreg);
- set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ if (aligned_dstmem)
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
epilogue_size_needed = 0;
if (need_zero_guard
&& min_size < (unsigned HOST_WIDE_INT) size_needed)
/*
**foo:
**...
+** leaq -160\(%rbp\), %rax
+** movq %rax, %rcx
** pxor %xmm0, %xmm0
-**...
+** movl \$160, %edx
+** movl %edx, %edi
+** andl \$-64, %edi
+** movl \$0, %esi
**.L[0-9]+:
-** movl %esi, %ecx
-** movaps %xmm0, \(%rdx,%rcx\)
-** movaps %xmm0, 16\(%rdx,%rcx\)
-** movaps %xmm0, 32\(%rdx,%rcx\)
-** movaps %xmm0, 48\(%rdx,%rcx\)
+** movl %esi, %edx
+** movaps %xmm0, \(%rax,%rdx\)
+** movaps %xmm0, 16\(%rax,%rdx\)
+** movaps %xmm0, 32\(%rax,%rdx\)
+** movaps %xmm0, 48\(%rax,%rdx\)
** addl \$64, %esi
** cmpl %edi, %esi
** jb .L[0-9]+
+** movl %esi, %eax
+** addq %rax, %rcx
+** movaps %xmm0, \(%rcx\)
+** movaps %xmm0, 16\(%rcx\)
+** movzbl -116\(%rbp\), %eax
+** movsbl %al, %eax
**...
*/
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq 221\(%rsi\), %rax
+** xorl %edx, %edx
+** movq %rax, 221\(%rdi\)
+** movq 229\(%rsi\), %rax
+** movq %rax, 229\(%rdi\)
+** movq 237\(%rsi\), %rax
+** movq %rax, 237\(%rdi\)
+** movq 245\(%rsi\), %rax
+** movq %rax, 245\(%rdi\)
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$32, %edx
+** movq \(%rsi,%rax\), %r10
+** movq 8\(%rsi,%rax\), %r9
+** movq 16\(%rsi,%rax\), %r8
+** movq 24\(%rsi,%rax\), %rcx
+** movq %r10, \(%rdi,%rax\)
+** movq %r9, 8\(%rdi,%rax\)
+** movq %r8, 16\(%rdi,%rax\)
+** movq %rcx, 24\(%rdi,%rax\)
+** cmpl \$224, %edx
+** jb .L[0-9]+
+** ret
+**...
+*/
+
+void
+foo (char *dest, char *src)
+{
+ __builtin_memcpy (dest, src, 253);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$64, %edx
+** movdqa src\(%rax\), %xmm3
+** movdqa src\+16\(%rax\), %xmm2
+** movdqa src\+32\(%rax\), %xmm1
+** movdqa src\+48\(%rax\), %xmm0
+** movaps %xmm3, dest\(%rax\)
+** movaps %xmm2, dest\+16\(%rax\)
+** movaps %xmm1, dest\+32\(%rax\)
+** movaps %xmm0, dest\+48\(%rax\)
+** cmpl \$256, %edx
+** jb .L[0-9]+
+** movdqa src\(%rdx\), %xmm0
+** movaps %xmm0, dest\(%rdx\)
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 16
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$64, %edx
+** movdqa src\(%rax\), %xmm3
+** movdqa src\+16\(%rax\), %xmm2
+** movdqa src\+32\(%rax\), %xmm1
+** movdqa src\+48\(%rax\), %xmm0
+** movaps %xmm3, dest\(%rax\)
+** movaps %xmm2, dest\+16\(%rax\)
+** movaps %xmm1, dest\+32\(%rax\)
+** movaps %xmm0, dest\+48\(%rax\)
+** cmpl \$256, %edx
+** jb .L[0-9]+
+** movdqa src\(%rdx\), %xmm0
+** movaps %xmm0, dest\(%rdx\)
+** movdqu src\+15\(%rdx\), %xmm0
+** movups %xmm0, dest\+15\(%rdx\)
+** ret
+**...
+*/
+
+#define SIZE 16 * 16 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** subl \$-128, %edx
+** vmovdqa src\(%rax\), %ymm3
+** vmovdqa src\+32\(%rax\), %ymm2
+** vmovdqa src\+64\(%rax\), %ymm1
+** vmovdqa src\+96\(%rax\), %ymm0
+** vmovdqa %ymm3, dest\(%rax\)
+** vmovdqa %ymm2, dest\+32\(%rax\)
+** vmovdqa %ymm1, dest\+64\(%rax\)
+** vmovdqa %ymm0, dest\+96\(%rax\)
+** cmpl \$512, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 32
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** subl \$-128, %edx
+** vmovdqa src\(%rax\), %ymm3
+** vmovdqa src\+32\(%rax\), %ymm2
+** vmovdqa src\+64\(%rax\), %ymm1
+** vmovdqa src\+96\(%rax\), %ymm0
+** vmovdqa %ymm3, dest\(%rax\)
+** vmovdqa %ymm2, dest\+32\(%rax\)
+** vmovdqa %ymm1, dest\+64\(%rax\)
+** vmovdqa %ymm0, dest\+96\(%rax\)
+** cmpl \$512, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vmovdqu src\+31\(%rdx\), %ymm0
+** vmovdqu %ymm0, dest\+31\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE 16 * 32 + 32 + 31
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$256, %edx
+** vmovdqa64 src\(%rax\), %zmm3
+** vmovdqa64 src\+64\(%rax\), %zmm2
+** vmovdqa64 src\+128\(%rax\), %zmm1
+** vmovdqa64 src\+192\(%rax\), %zmm0
+** vmovdqa64 %zmm3, dest\(%rax\)
+** vmovdqa64 %zmm2, dest\+64\(%rax\)
+** vmovdqa64 %zmm1, dest\+128\(%rax\)
+** vmovdqa64 %zmm0, dest\+192\(%rax\)
+** cmpl \$1024, %edx
+** jb .L[0-9]+
+** vmovdqa64 src\(%rdx\), %zmm0
+** vmovdqa64 %zmm0, dest\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE (16 + 1) * 64
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** xorl %edx, %edx
+**.L[0-9]+:
+** movl %edx, %eax
+** addl \$256, %edx
+** vmovdqa64 src\(%rax\), %zmm3
+** vmovdqa64 src\+64\(%rax\), %zmm2
+** vmovdqa64 src\+128\(%rax\), %zmm1
+** vmovdqa64 src\+192\(%rax\), %zmm0
+** vmovdqa64 %zmm3, dest\(%rax\)
+** vmovdqa64 %zmm2, dest\+64\(%rax\)
+** vmovdqa64 %zmm1, dest\+128\(%rax\)
+** vmovdqa64 %zmm0, dest\+192\(%rax\)
+** cmpl \$1024, %edx
+** jb .L[0-9]+
+** vmovdqa src\(%rdx\), %ymm0
+** vmovdqa %ymm0, dest\(%rdx\)
+** vmovdqu src\+31\(%rdx\), %ymm0
+** vmovdqu %ymm0, dest\+31\(%rdx\)
+** vzeroupper
+** ret
+**...
+*/
+
+#define SIZE 16 * 64 + 63
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+ __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
/*
**foo:
**.LFB[0-9]+:
-**...
+** .cfi_startproc
+** movq 221\(%rsi\), %rax
** xorl %edx, %edx
-**...
+** movq %rax, 221\(%rdi\)
+** movq 229\(%rsi\), %rax
+** movq %rax, 229\(%rdi\)
+** movq 237\(%rsi\), %rax
+** movq %rax, 237\(%rdi\)
+** movq 245\(%rsi\), %rax
+** movq %rax, 245\(%rdi\)
**.L[0-9]+:
** movl %edx, %eax
** addl \$32, %edx
** movq %rcx, 24\(%rdi,%rax\)
** cmpl \$224, %edx
** jb .L[0-9]+
+** ret
**...
*/
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+** movups %xmm0, 190\(%rdi\)
+** movups %xmm0, 206\(%rdi\)
+** movups %xmm0, 222\(%rdi\)
+** movups %xmm0, 238\(%rdi\)
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movups %xmm0, \(%rdi,%rdx\)
+** movups %xmm0, 16\(%rdi,%rdx\)
+** movups %xmm0, 32\(%rdi,%rdx\)
+** movups %xmm0, 48\(%rdi,%rdx\)
+** cmpl \$192, %eax
+** jb .L[0-9]+
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \$0, 48\(%rdi\)
+** movq \$0, \(%rdi\)
+** movq \$0, 8\(%rdi\)
+** movq \$0, 16\(%rdi\)
+** movq \$0, 24\(%rdi\)
+** movq \$0, 32\(%rdi\)
+** movq \$0, 40\(%rdi\)
+** movq \$0, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** movq %rax, 48\(%rdi\)
+** movq %rax, \(%rdi\)
+** movq %rax, 8\(%rdi\)
+** movq %rax, 16\(%rdi\)
+** movq %rax, 24\(%rdi\)
+** movq %rax, 32\(%rdi\)
+** movq %rax, 40\(%rdi\)
+** movq %rax, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 4, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** movq %rsi, 48\(%rdi\)
+** movq %rsi, \(%rdi\)
+** movq %rsi, 8\(%rdi\)
+** movq %rsi, 16\(%rdi\)
+** movq %rsi, 24\(%rdi\)
+** movq %rsi, 32\(%rdi\)
+** movq %rsi, 40\(%rdi\)
+** movq %rsi, 53\(%rdi\)
+** ret
+**...
+*/
+
+void
+foo (char *dest, int c)
+{
+ __builtin_memset (dest, c, 61);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$192, %eax
+** jb .L[0-9]+
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[240];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L2
+** testb \$32, %sil
+** jne .L19
+** testb \$16, %sil
+** jne .L20
+** testb \$8, %sil
+** jne .L21
+** testb \$4, %sil
+** jne .L22
+** testq %rsi, %rsi
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rsi\)
+** movups %xmm0, -48\(%rdi,%rsi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$64, %rsi
+** jb .L1
+** andq \$-64, %rsi
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rsi, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L1
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** cmpq \$128, %rsi
+** jnb .L2
+** testb \$64, %sil
+** jne .L22
+** testb \$32, %sil
+** jne .L23
+** testb \$16, %sil
+** jne .L24
+** testb \$8, %sil
+** jne .L25
+** testb \$4, %sil
+** jne .L26
+** testq %rsi, %rsi
+** jne .L27
+**.L20:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rsi\)
+** vmovdqu %ymm0, -96\(%rdi,%rsi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$128, %rsi
+** jb .L19
+** andq \$-128, %rsi
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rsi, %rax
+** jb .L10
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L20
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** cmpq \$256, %rsi
+** jnb .L2
+** testb \$-128, %sil
+** jne .L23
+** testb \$64, %sil
+** jne .L24
+** testb \$32, %sil
+** jne .L25
+** testb \$16, %sil
+** jne .L26
+** testb \$8, %sil
+** jne .L27
+** testb \$4, %sil
+** jne .L28
+** testq %rsi, %rsi
+** jne .L29
+**.L21:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$256, %rsi
+** jb .L20
+** xorb %sil, %sil
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rsi, %rax
+** jb .L11
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L29:
+** movb \$0, \(%rdi\)
+** testb \$2, %sil
+** je .L21
+** xorl %eax, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movq \$0, \(%rdi\)
+** movq \$0, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movl \$0, \(%rdi\)
+** movl \$0, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 0, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movq \$0, dest\+48\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[184];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movaps %xmm0, dest\+32\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movl \$0, dest\+47\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** pxor %xmm0, %xmm0
+** xorl %eax, %eax
+**.L[0-9]+:
+** movl %eax, %edx
+** addl \$64, %eax
+** movaps %xmm0, dest\(%rdx\)
+** movaps %xmm0, dest\+16\(%rdx\)
+** movaps %xmm0, dest\+32\(%rdx\)
+** movaps %xmm0, dest\+48\(%rdx\)
+** cmpl \$128, %eax
+** jb .L[0-9]+
+** movb \$0, dest\+48\(%rax\)
+** movaps %xmm0, dest\(%rax\)
+** movaps %xmm0, dest\+16\(%rax\)
+** movaps %xmm0, dest\+32\(%rax\)
+** ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** vmovdqu %ymm0, 192\(%rdi\)
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, 64\(%rdi\)
+** vmovdqu %ymm0, 96\(%rdi\)
+** vmovdqu %ymm0, 128\(%rdi\)
+** vmovdqu %ymm0, 160\(%rdi\)
+** vmovdqu %ymm0, 222\(%rdi\)
+** vzeroupper
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movd %edi, %xmm0
+** punpcklbw %xmm0, %xmm0
+** punpcklwd %xmm0, %xmm0
+** pshufd \$0, %xmm0, %xmm0
+** movaps %xmm0, dest\+160\(%rip\)
+** movaps %xmm0, dest\(%rip\)
+** movaps %xmm0, dest\+16\(%rip\)
+** movaps %xmm0, dest\+32\(%rip\)
+** movaps %xmm0, dest\+48\(%rip\)
+** movaps %xmm0, dest\+64\(%rip\)
+** movaps %xmm0, dest\+80\(%rip\)
+** movaps %xmm0, dest\+96\(%rip\)
+** movaps %xmm0, dest\+112\(%rip\)
+** movaps %xmm0, dest\+128\(%rip\)
+** movaps %xmm0, dest\+144\(%rip\)
+** movd %xmm0, dest\+175\(%rip\)
+** ret
+**...
+*/
+
+char dest[179];
+
+void
+foo (int c)
+{
+ __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movd %edi, %xmm0
+** movb %dil, dest\+176\(%rip\)
+** punpcklbw %xmm0, %xmm0
+** punpcklwd %xmm0, %xmm0
+** pshufd \$0, %xmm0, %xmm0
+** movaps %xmm0, dest\(%rip\)
+** movaps %xmm0, dest\+16\(%rip\)
+** movaps %xmm0, dest\+32\(%rip\)
+** movaps %xmm0, dest\+48\(%rip\)
+** movaps %xmm0, dest\+64\(%rip\)
+** movaps %xmm0, dest\+80\(%rip\)
+** movaps %xmm0, dest\+96\(%rip\)
+** movaps %xmm0, dest\+112\(%rip\)
+** movaps %xmm0, dest\+128\(%rip\)
+** movaps %xmm0, dest\+144\(%rip\)
+** movaps %xmm0, dest\+160\(%rip\)
+** ret
+**...
+*/
+
+char dest[177];
+
+void
+foo (int c)
+{
+ __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=rep_8byte:8192:align,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \$25, %ecx
+** xorl %eax, %eax
+** movl \$dest, %edi
+** rep stosq
+** movl \$0, \(%rdi\)
+** ret
+**...
+*/
+
+#define SIZE 204
+
+char dest[SIZE];
+
+void
+foo (void)
+{
+ __builtin_memset (dest, 0, sizeof (dest));
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movzbl %dil, %edi
+** movl \$p, %eax
+** movabsq \$72340172838076673, %rdx
+** imulq %rdx, %rdi
+** movq %rdi, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L18
+**.L2:
+** movq %rsi, %rcx
+** andl \$63, %ecx
+** je .L1
+** xorl %edx, %edx
+** andl \$1, %esi
+** je .L5
+** movl \$1, %edx
+** movb %dil, \(%rax\)
+** cmpq %rcx, %rdx
+** jnb .L19
+**.L5:
+** movb %dil, \(%rax,%rdx\)
+** movb %dil, 1\(%rax,%rdx\)
+** addq \$2, %rdx
+** cmpq %rcx, %rdx
+** jb .L5
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L18:
+** movq %rsi, %rdx
+** xorl %eax, %eax
+** andq \$-64, %rdx
+**.L3:
+** movaps %xmm0, p\(%rax\)
+** addq \$64, %rax
+** movaps %xmm0, p-48\(%rax\)
+** movaps %xmm0, p-32\(%rax\)
+** movaps %xmm0, p-16\(%rax\)
+** cmpq %rdx, %rax
+** jb .L3
+** addq \$p, %rax
+** jmp .L2
+**.L19:
+** ret
+** .cfi_endproc
+**...
+*/
+
+
+#define WRITE_CHUNK 256
+char p[WRITE_CHUNK];
+
+void
+foo (int c, __SIZE_TYPE__ nbyte)
+{
+ __builtin_memset (p, c, nbyte);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vpxor %xmm0, %xmm0, %xmm0
+** vmovdqu8 %zmm0, 128\(%rdi\)
+** vmovdqu8 %zmm0, \(%rdi\)
+** vmovdqu8 %zmm0, 64\(%rdi\)
+** vmovdqu8 %zmm0, 190\(%rdi\)
+** vzeroupper
+** ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+ __builtin_memset (dest, 0, 254);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** movq %rax, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rsi
+** jnb .L2
+** testb \$32, %sil
+** jne .L19
+** testb \$16, %sil
+** jne .L20
+** testb \$8, %sil
+** jne .L21
+** testb \$4, %sil
+** jne .L22
+** testq %rsi, %rsi
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rsi\)
+** movups %xmm0, -48\(%rdi,%rsi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$64, %rsi
+** jb .L1
+** andq \$-64, %rsi
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rsi, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L1
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rsi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** vmovq %rax, %xmm1
+** vpbroadcastq %xmm1, %ymm0
+** cmpq \$128, %rsi
+** jnb .L2
+** testb \$64, %sil
+** jne .L21
+** testb \$32, %sil
+** jne .L22
+** testb \$16, %sil
+** jne .L23
+** testb \$8, %sil
+** jne .L24
+** testb \$4, %sil
+** jne .L25
+** testq %rsi, %rsi
+** jne .L26
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rsi\)
+** vmovdqu %ymm0, -96\(%rdi,%rsi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$128, %rsi
+** jb .L19
+** andq \$-128, %rsi
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rsi, %rax
+** jb .L10
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L19
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rsi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** jmp .L19
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$289360691352306692, %rax
+** vpbroadcastq %rax, %zmm0
+** cmpq \$256, %rsi
+** jnb .L2
+** testb \$-128, %sil
+** jne .L22
+** testb \$64, %sil
+** jne .L23
+** testb \$32, %sil
+** jne .L24
+** testb \$16, %sil
+** jne .L25
+** testb \$8, %sil
+** jne .L26
+** testb \$4, %sil
+** jne .L27
+** testq %rsi, %rsi
+** jne .L28
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** subq \$1, %rsi
+** cmpq \$256, %rsi
+** jb .L20
+** xorb %sil, %sil
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rsi, %rax
+** jb .L11
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movb \$4, \(%rdi\)
+** testb \$2, %sil
+** je .L20
+** movl \$1028, %eax
+** movw %ax, -2\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rsi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movq %rax, \(%rdi\)
+** movq %rax, -8\(%rdi,%rsi\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movl \$67372036, \(%rdi\)
+** movl \$67372036, -4\(%rdi,%rsi\)
+** jmp .L20
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, 4, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** movq %rsi, %xmm0
+** punpcklqdq %xmm0, %xmm0
+** cmpq \$64, %rdx
+** jnb .L2
+** testb \$32, %dl
+** jne .L19
+** testb \$16, %dl
+** jne .L20
+** testb \$8, %dl
+** jne .L21
+** testb \$4, %dl
+** jne .L22
+** testq %rdx, %rdx
+** jne .L23
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** movups %xmm0, -64\(%rdi,%rdx\)
+** movups %xmm0, -48\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$64, %rdx
+** jb .L1
+** andq \$-64, %rdx
+** xorl %eax, %eax
+**.L9:
+** movups %xmm0, \(%rdi,%rax\)
+** movups %xmm0, 16\(%rdi,%rax\)
+** movups %xmm0, 32\(%rdi,%rax\)
+** movups %xmm0, 48\(%rdi,%rax\)
+** addq \$64, %rax
+** cmpq %rdx, %rax
+** jb .L9
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L1
+** movw %si, -2\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, 16\(%rdi\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** movups %xmm0, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** vmovq %rsi, %xmm1
+** vpbroadcastq %xmm1, %ymm0
+** cmpq \$128, %rdx
+** jnb .L2
+** testb \$64, %dl
+** jne .L21
+** testb \$32, %dl
+** jne .L22
+** testb \$16, %dl
+** jne .L23
+** testb \$8, %dl
+** jne .L24
+** testb \$4, %dl
+** jne .L25
+** testq %rdx, %rdx
+** jne .L26
+**.L19:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vmovdqu %ymm0, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$128, %rdx
+** jb .L19
+** andq \$-128, %rdx
+** xorl %eax, %eax
+**.L10:
+** vmovdqu %ymm0, \(%rdi,%rax\)
+** vmovdqu %ymm0, 32\(%rdi,%rax\)
+** vmovdqu %ymm0, 64\(%rdi,%rax\)
+** vmovdqu %ymm0, 96\(%rdi,%rax\)
+** subq \$-128, %rax
+** cmpq %rdx, %rax
+** jb .L10
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L19
+** movw %si, -2\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, 32\(%rdi\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** jmp .L19
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** jmp .L19
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movabsq \$72340172838076673, %rax
+** movzbl %sil, %esi
+** imulq %rax, %rsi
+** vpbroadcastq %rsi, %zmm0
+** cmpq \$256, %rdx
+** jnb .L2
+** testb \$-128, %dl
+** jne .L22
+** testb \$64, %dl
+** jne .L23
+** testb \$32, %dl
+** jne .L24
+** testb \$16, %dl
+** jne .L25
+** testb \$8, %dl
+** jne .L26
+** testb \$4, %dl
+** jne .L27
+** testq %rdx, %rdx
+** jne .L28
+**.L20:
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L2:
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** subq \$1, %rdx
+** cmpq \$256, %rdx
+** jb .L20
+** xorb %dl, %dl
+** xorl %eax, %eax
+**.L11:
+** vmovdqu64 %zmm0, \(%rdi,%rax\)
+** vmovdqu64 %zmm0, 64\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 128\(%rdi,%rax\)
+** vmovdqu64 %zmm0, 192\(%rdi,%rax\)
+** addq \$256, %rax
+** cmpq %rdx, %rax
+** jb .L11
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L28:
+** movb %sil, \(%rdi\)
+** testb \$2, %dl
+** je .L20
+** movw %si, -2\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L22:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, 64\(%rdi\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L23:
+** vmovdqu64 %zmm0, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L24:
+** vmovdqu %ymm0, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L25:
+** vmovdqu %xmm0, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L26:
+** movq %rsi, \(%rdi\)
+** movq %rsi, -8\(%rdi,%rdx\)
+** jmp .L20
+** .p2align 4,,10
+** .p2align 3
+**.L27:
+** movl %esi, \(%rdi\)
+** movl %esi, -4\(%rdi,%rdx\)
+** jmp .L20
+** .cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+ __builtin_memset (dest, c, n);
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
**foo:
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 221\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 229\(%rdi\)
+** movq \$0, 237\(%rdi\)
+** movq \$0, 245\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$224, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 49\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 57\(%rdi\)
+** movq \$0, 65\(%rdi\)
+** movq \$0, 73\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$64, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
**.LFB[0-9]+:
** .cfi_startproc
+** movq \$0, 63\(%rdi\)
** xorl %eax, %eax
+** movq \$0, 71\(%rdi\)
+** movq \$0, 79\(%rdi\)
+** movq \$0, 87\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$32, %eax
** movq \$0, 24\(%rdi,%rdx\)
** cmpl \$64, %eax
** jb .L[0-9]+
+** ret
**...
*/
**...
** pxor %xmm0, %xmm0
** xorl %eax, %eax
+** movups %xmm0, 190\(%rdi\)
+** movups %xmm0, 206\(%rdi\)
+** movups %xmm0, 222\(%rdi\)
+** movups %xmm0, 238\(%rdi\)
**.L[0-9]+:
** movl %eax, %edx
** addl \$64, %eax