return true;
}
+/* Fully unroll memmove of known size with up to 8 registers. */
+
+static bool
+ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
+ unsigned HOST_WIDE_INT count,
+ machine_mode mode)
+{
+ /* If up to 8 registers can cover the whole copy, load all sources
+ into registers and store them together to avoid possible address
+ overlap between source and destination. */
+ unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
+ if (moves == 0)
+ {
+ mode = smallest_int_mode_for_size
+ (count * BITS_PER_UNIT).require ();
+ if (count == GET_MODE_SIZE (mode))
+ moves = 1;
+ else
+ {
+ /* MODE is wider than COUNT; halve the move size so that MOVES == 1. */
+ mode = smallest_int_mode_for_size
+ (GET_MODE_BITSIZE (mode) / 2).require ();
+ moves = count / GET_MODE_SIZE (mode);
+ gcc_assert (moves == 1);
+ }
+ }
+ else if (moves > 8)
+ return false;
+
+ unsigned int i;
+ rtx tmp[9];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ for (i = 0; i < moves; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+
+ unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
+ machine_mode epilogue_mode = VOIDmode;
+ if (epilogue_size)
+ {
+ /* Handle the remaining bytes with an overlapping move. */
+ epilogue_mode = smallest_int_mode_for_size
+ (epilogue_size * BITS_PER_UNIT).require ();
+ tmp[8] = gen_reg_rtx (epilogue_mode);
+ srcmem = adjust_address (srcmem, epilogue_mode, 0);
+ srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
+ GET_MODE_SIZE (epilogue_mode));
+ emit_move_insn (tmp[8], srcmem);
+ }
+
+ rtx destmem = change_address (dst, mode, destreg);
+ for (i = 0; i < moves; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+
+ if (epilogue_size)
+ {
+ /* Store the remaining bytes with an overlapping move. */
+ destmem = adjust_address (destmem, epilogue_mode, 0);
+ destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
+ destmem = offset_address (destmem,
+ GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
+ GET_MODE_SIZE (epilogue_mode));
+ emit_move_insn (destmem, tmp[8]);
+ }
+
+ return true;
+}
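A minimal C-level sketch of the fully unrolled scheme above (not part of the patch); the function name and the 8-byte chunk standing in for MODE are illustrative, and 8 <= count <= 64 is assumed.  All loads are issued before any store, and the remainder is handled by one overlapping chunk anchored at the end, so overlapping source and destination buffers are still copied correctly.

#include <stdint.h>
#include <string.h>

static void
unrolled_memmove_sketch (void *dst, const void *src, size_t count)
{
  uint64_t tmp[8], tail;
  size_t moves = count / 8;
  size_t epilogue = count % 8;

  /* Load everything first.  */
  for (size_t i = 0; i < moves; i++)
    memcpy (&tmp[i], (const char *) src + i * 8, 8);
  if (epilogue)
    /* Overlapping load covering the last 8 bytes.  */
    memcpy (&tail, (const char *) src + count - 8, 8);

  /* Then store everything.  */
  for (size_t i = 0; i < moves; i++)
    memcpy ((char *) dst + i * 8, &tmp[i], 8);
  if (epilogue)
    memcpy ((char *) dst + count - 8, &tail, 8);
}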
+
+/* Copy MOVES * mode-size bytes from SRCMEM to DESTMEM using MOVES
+ registers, where MOVES <= 4. If FORWARD is true, step the addresses
+ forward; otherwise step them backward. */
+
+static void
+ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
+ unsigned int moves, bool forward)
+{
+ gcc_assert (moves <= 4);
+
+ unsigned int i;
+ rtx tmp[8];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx step;
+ if (forward)
+ step = GEN_INT (GET_MODE_SIZE (mode));
+ else
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+
+ /* Load MOVES values. */
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[i], srcmem);
+
+ /* Store MOVES values. */
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[i]);
+}
+
+/* Load MOVES values of mode size from SRC into REGS. If LAST is true,
+ load the last MOVES values, ending at SRC + COUNT_EXP; otherwise load
+ the first MOVES values. */
+
+static void
+ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
+ machine_mode mode, unsigned int moves,
+ rtx regs[], bool last)
+{
+ unsigned int i;
+
+ for (i = 0; i < moves; i++)
+ regs[i] = gen_reg_rtx (mode);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ rtx step;
+ if (last)
+ {
+ srcmem = offset_address (srcmem, count_exp, 1);
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ else
+ step = GEN_INT (GET_MODE_SIZE (mode));
+
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (regs[i], srcmem);
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (regs[i], srcmem);
+}
+
+/* Store MOVES values of mode size from REGS into DST. If LAST is true,
+ store the last MOVES values, ending at DST + COUNT_EXP; otherwise store
+ the first MOVES values. */
+
+static void
+ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
+ machine_mode mode, unsigned int moves,
+ rtx regs[], bool last)
+{
+ unsigned int i;
+
+ rtx destmem = change_address (dst, mode, destreg);
+ rtx step;
+ if (last)
+ {
+ destmem = offset_address (destmem, count_exp, 1);
+ step = GEN_INT (-GET_MODE_SIZE (mode));
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ else
+ step = GEN_INT (GET_MODE_SIZE (mode));
+
+ for (i = 0; i < moves - 1; i++)
+ {
+ emit_move_insn (destmem, regs[i]);
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, regs[i]);
+}
+
+/* Expand a memmove whose size is between (MOVES / 2) * mode size and
+ MOVES * mode size using overlapping loads and stores. MOVES is even,
+ with 2 <= MOVES <= 8. */
+
+static void
+ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
+ rtx srcreg, rtx count_exp,
+ machine_mode mode,
+ unsigned int moves)
+{
+ gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);
+
+ unsigned int half_moves = moves / 2;
+ unsigned int i, j;
+ rtx tmp[8];
+
+ for (i = 0; i < moves; i++)
+ tmp[i] = gen_reg_rtx (mode);
+
+ rtx base_srcmem = change_address (src, mode, srcreg);
+
+ /* Load the first half. */
+ rtx srcmem = base_srcmem;
+ for (i = 0; i < half_moves - 1; i++)
+ {
+ emit_move_insn (tmp[i], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[i], srcmem);
+
+ /* Load the second half. */
+ srcmem = offset_address (base_srcmem, count_exp, 1);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
+ {
+ emit_move_insn (tmp[j], srcmem);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (tmp[j], srcmem);
+
+ rtx base_destmem = change_address (dst, mode, destreg);
+
+ /* Store the first half. */
+ rtx destmem = base_destmem;
+ for (i = 0; i < half_moves - 1; i++)
+ {
+ emit_move_insn (destmem, tmp[i]);
+ destmem = offset_address (destmem,
+ GEN_INT (GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[i]);
+
+ /* Store the second half. */
+ destmem = offset_address (base_destmem, count_exp, 1);
+ destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
+ {
+ emit_move_insn (destmem, tmp[j]);
+ destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, tmp[j]);
+}
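A minimal C-level sketch of the overlapping scheme above (not part of the patch), shown for MOVES == 2 with a 16-byte chunk standing in for the mode, i.e. 16 <= count <= 32; the names are illustrative.  One chunk is anchored at each end of the region, the two may overlap in the middle, and loading both before storing keeps overlapping source and destination safe.

#include <string.h>

static void
overlapping_2x16_sketch (void *dst, const void *src, size_t count)
{
  unsigned char head[16], tail[16];

  /* Load a chunk anchored at each end of the source.  */
  memcpy (head, src, 16);
  memcpy (tail, (const char *) src + count - 16, 16);

  /* Store them back; every byte is covered by at least one chunk.  */
  memcpy (dst, head, 16);
  memcpy ((char *) dst + count - 16, tail, 16);
}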
+
+/* Expand a memmove whose size is less than the mode size, which is at
+ most 64 bytes. */
+
+static void
+ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
+ rtx srcreg, rtx count_exp,
+ unsigned HOST_WIDE_INT min_size,
+ machine_mode mode,
+ rtx_code_label *done_label)
+{
+ bool skip = false;
+ machine_mode count_mode = counter_mode (count_exp);
+
+ rtx_code_label *between_32_63_label
+ = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
+ /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64. */
+ if (between_32_63_label)
+ {
+ if (min_size && min_size >= 32)
+ {
+ emit_jump_insn (gen_jump (between_32_63_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
+ nullptr, count_mode, 1,
+ between_32_63_label);
+ }
+
+ rtx_code_label *between_16_31_label
+ = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
+ /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32. */
+ if (between_16_31_label)
+ {
+ if (min_size && min_size >= 16)
+ {
+ emit_jump_insn (gen_jump (between_16_31_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
+ nullptr, count_mode, 1,
+ between_16_31_label);
+ }
+
+ rtx_code_label *between_8_15_label
+ = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
+ /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16. */
+ if (between_8_15_label)
+ {
+ if (min_size && min_size >= 8)
+ {
+ emit_jump_insn (gen_jump (between_8_15_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
+ nullptr, count_mode, 1,
+ between_8_15_label);
+ }
+
+ rtx_code_label *between_4_7_label
+ = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
+ /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8. */
+ if (between_4_7_label)
+ {
+ if (min_size && min_size >= 4)
+ {
+ emit_jump_insn (gen_jump (between_4_7_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
+ nullptr, count_mode, 1,
+ between_4_7_label);
+ }
+
+ rtx_code_label *between_2_3_label
+ = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
+ /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4. */
+ if (between_2_3_label)
+ {
+ if (min_size && min_size >= 2)
+ {
+ emit_jump_insn (gen_jump (between_2_3_label));
+ emit_barrier ();
+ skip = true;
+ }
+ else
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
+ nullptr, count_mode, 1,
+ between_2_3_label);
+ }
+
+ if (!skip)
+ {
+ rtx_code_label *zero_label
+ = min_size == 0 ? gen_label_rtx () : nullptr;
+ /* Skip if size == 0. */
+ if (zero_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
+ nullptr, count_mode, 1,
+ zero_label,
+ profile_probability::unlikely ());
+
+ /* Move 1 byte. */
+ rtx tmp0 = gen_reg_rtx (QImode);
+ rtx srcmem = change_address (src, QImode, srcreg);
+ emit_move_insn (tmp0, srcmem);
+ rtx destmem = change_address (dst, QImode, destreg);
+ emit_move_insn (destmem, tmp0);
+
+ if (zero_label)
+ emit_label (zero_label);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_32_63_label)
+ {
+ emit_label (between_32_63_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, OImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_16_31_label)
+ {
+ emit_label (between_16_31_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, TImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_8_15_label)
+ {
+ emit_label (between_8_15_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, DImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_4_7_label)
+ {
+ emit_label (between_4_7_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, SImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (between_2_3_label)
+ {
+ emit_label (between_2_3_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, HImode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+}
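A minimal C-level sketch of the size dispatch above (not part of the patch), assuming MOVE_MAX == 16 so that count < 16 on entry; the macro and function names are illustrative only.  Each branch applies the two-overlapping-move trick with the widest integer type that fits the size class, mirroring the label-per-range structure of the RTL version.

#include <stdint.h>
#include <string.h>

/* Copy N bytes with two possibly overlapping moves of type T,
   assuming sizeof (T) <= N <= 2 * sizeof (T).  */
#define MOVE2(T, dst, src, n)						\
  do									\
    {									\
      T head_, tail_;							\
      memcpy (&head_, (src), sizeof (T));				\
      memcpy (&tail_, (const char *) (src) + (n) - sizeof (T),		\
	      sizeof (T));						\
      memcpy ((dst), &head_, sizeof (T));				\
      memcpy ((char *) (dst) + (n) - sizeof (T), &tail_, sizeof (T));	\
    }									\
  while (0)

static void
small_memmove_sketch (void *dst, const void *src, size_t count)
{
  if (count >= 8)
    MOVE2 (uint64_t, dst, src, count);
  else if (count >= 4)
    MOVE2 (uint32_t, dst, src, count);
  else if (count >= 2)
    MOVE2 (uint16_t, dst, src, count);
  else if (count == 1)
    *(unsigned char *) dst = *(const unsigned char *) src;
  /* count == 0: nothing to do.  */
}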
+
+/* Expand movmem with overlapping unaligned loads and stores:
+ 1. Load all sources into registers and store them together to avoid
+ possible address overlap between source and destination.
+ 2. For a known size, first try to fully unroll with 8 registers.
+ 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
+ and then store them together.
+ 4. For 2 * MOVE_MAX < size <= 4 * MOVE_MAX, load all sources into
+ 4 registers first and then store them together.
+ 5. For 4 * MOVE_MAX < size <= 8 * MOVE_MAX, load all sources into
+ 8 registers first and then store them together.
+ 6. For size > 8 * MOVE_MAX,
+ a. If the destination address is above the source address, copy
+ backward with a 4 * MOVE_MAX loop of unaligned loads and stores.
+ Load the first 4 * MOVE_MAX bytes into 4 registers before the
+ loop and store them after the loop to support overlapping
+ addresses.
+ b. Otherwise, copy forward with a 4 * MOVE_MAX loop of unaligned
+ loads and stores. Load the last 4 * MOVE_MAX bytes into 4
+ registers before the loop and store them after the loop to
+ support overlapping addresses.
+ A C-level sketch of case 6 follows this function. */
+
+bool
+ix86_expand_movmem (rtx operands[])
+{
+ /* Since far fewer registers are available in 32-bit mode, don't
+ inline movmem there. */
+ if (!TARGET_64BIT)
+ return false;
+
+ rtx dst = operands[0];
+ rtx src = operands[1];
+ rtx count_exp = operands[2];
+ rtx expected_size_exp = operands[5];
+ rtx min_size_exp = operands[6];
+ rtx probable_max_size_exp = operands[8];
+ unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
+ HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
+ unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
+ unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
+
+ if (CONST_INT_P (count_exp))
+ {
+ min_size = probable_max_size = count = expected_size
+ = INTVAL (count_exp);
+ /* When COUNT is 0, there is nothing to do. */
+ if (!count)
+ return true;
+ }
+ else
+ {
+ if (min_size_exp)
+ min_size = INTVAL (min_size_exp);
+ if (probable_max_size_exp)
+ probable_max_size = INTVAL (probable_max_size_exp);
+ if (CONST_INT_P (expected_size_exp))
+ expected_size = INTVAL (expected_size_exp);
+ }
+
+ /* Make sure we don't need to care about overflow later on. */
+ if (count > (HOST_WIDE_INT_1U << 30))
+ return false;
+
+ addr_space_t dst_as = MEM_ADDR_SPACE (dst);
+ addr_space_t src_as = MEM_ADDR_SPACE (src);
+ int dynamic_check;
+ bool noalign;
+ enum stringop_alg alg = decide_alg (count, expected_size, min_size,
+ probable_max_size, false, false,
+ dst_as, src_as, &dynamic_check,
+ &noalign, false);
+ if (alg == libcall)
+ return false;
+
+ rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
+ rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+
+ unsigned int move_max = MOVE_MAX;
+ machine_mode mode = smallest_int_mode_for_size
+ (move_max * BITS_PER_UNIT).require ();
+ if (probable_max_size && probable_max_size < move_max)
+ {
+ /* Shrink the move size to fit PROBABLE_MAX_SIZE. */
+ mode = smallest_int_mode_for_size
+ (probable_max_size * BITS_PER_UNIT).require ();
+ /* Halve the mode so that its size does not exceed PROBABLE_MAX_SIZE. */
+ if (GET_MODE_SIZE (mode) > probable_max_size)
+ mode = smallest_int_mode_for_size
+ (GET_MODE_BITSIZE (mode) / 2).require ();
+ move_max = GET_MODE_SIZE (mode);
+ }
+
+ /* Try to fully unroll memmove of known size first. */
+ if (count
+ && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
+ mode))
+ return true;
+
+ rtx_code_label *done_label = gen_label_rtx ();
+
+ rtx_code_label *less_vec_label = nullptr;
+ if (min_size == 0 || min_size < move_max)
+ less_vec_label = gen_label_rtx ();
+
+ machine_mode count_mode = counter_mode (count_exp);
+
+ /* Jump to LESS_VEC_LABEL if size < MOVE_MAX. */
+ if (less_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
+ nullptr, count_mode, 1,
+ less_vec_label);
+
+ rtx_code_label *more_2x_vec_label = nullptr;
+ if (probable_max_size == 0 || probable_max_size > 2 * move_max)
+ more_2x_vec_label = gen_label_rtx ();
+
+ /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX. */
+ if (more_2x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
+ nullptr, count_mode, 1,
+ more_2x_vec_label);
+
+ if (min_size == 0 || min_size <= 2 * move_max)
+ {
+ /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
+ count_exp, mode, 2);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (less_vec_label)
+ {
+ /* Size < MOVE_MAX. */
+ emit_label (less_vec_label);
+ ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
+ count_exp, min_size, mode,
+ done_label);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (more_2x_vec_label)
+ {
+ /* Size > 2 * MOVE_MAX and destination may overlap with source. */
+ emit_label (more_2x_vec_label);
+
+ rtx_code_label *more_8x_vec_label = nullptr;
+ if (probable_max_size == 0 || probable_max_size > 8 * move_max)
+ more_8x_vec_label = gen_label_rtx ();
+
+ /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX. */
+ if (more_8x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
+ nullptr, count_mode, 1,
+ more_8x_vec_label);
+
+ rtx_code_label *last_4x_vec_label = nullptr;
+ if (min_size == 0 || min_size < 4 * move_max)
+ last_4x_vec_label = gen_label_rtx ();
+
+ /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX. */
+ if (last_4x_vec_label)
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
+ nullptr, count_mode, 1,
+ last_4x_vec_label);
+
+ if (probable_max_size == 0 || probable_max_size > 4 * move_max)
+ {
+ /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
+ srcreg, count_exp,
+ mode, 8);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (last_4x_vec_label)
+ {
+ /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */
+ emit_label (last_4x_vec_label);
+ ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
+ srcreg, count_exp,
+ mode, 4);
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+
+ if (more_8x_vec_label)
+ {
+ /* Size > 8 * MOVE_MAX. */
+ emit_label (more_8x_vec_label);
+
+ rtx loop_count = gen_reg_rtx (count_mode);
+ emit_move_insn (loop_count, count_exp);
+
+ /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
+ lower than destination address. */
+ rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
+ GET_MODE (destreg), 1,
+ more_8x_vec_backward_label);
+
+ /* Skip if source == destination, which is unlikely. */
+ emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
+ GET_MODE (destreg), 1, done_label,
+ profile_probability::unlikely ());
+
+ rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
+ emit_move_insn (base_destreg, destreg);
+
+ /* Load the last 4 * MOVE_MAX. */
+ rtx regs[4];
+ ix86_expand_load_movmem (src, srcreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, true);
+
+ rtx srcmem = change_address (src, mode, srcreg);
+ rtx destmem = change_address (dst, mode, destreg);
+
+ /* Copy forward with a 4 * MOVE_MAX loop. */
+ rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
+ emit_label (loop_4x_vec_forward_label);
+
+ ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
+
+ rtx tmp;
+ rtx delta = GEN_INT (4 * MOVE_MAX);
+
+ /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
+ loop_count, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != loop_count)
+ emit_move_insn (loop_count, tmp);
+
+ /* Increment DESTREG and SRCREG by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
+ destreg, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
+ delta, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+ /* Stop when LOOP_COUNT <= 4 * MOVE_MAX. */
+ emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
+ GET_MODE (loop_count), 1,
+ loop_4x_vec_forward_label);
+
+ /* Store the last 4 * MOVE_MAX. */
+ ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, true);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+
+ /* Copy backward with a 4 * MOVE_MAX loop. */
+ emit_label (more_8x_vec_backward_label);
+
+ base_destreg = gen_reg_rtx (GET_MODE (destreg));
+ emit_move_insn (base_destreg, destreg);
+
+ /* Load the first 4 * MOVE_MAX. */
+ ix86_expand_load_movmem (src, srcreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, false);
+
+ /* Increment DESTREG and SRCREG by COUNT_EXP. */
+ tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
+ destreg, count_exp, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
+ count_exp, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+ srcmem = change_address (src, mode, srcreg);
+ destmem = change_address (dst, mode, destreg);
+ rtx step = GEN_INT (-GET_MODE_SIZE (mode));
+ srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
+ destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
+
+ rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
+ emit_label (loop_4x_vec_backward_label);
+
+ ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
+
+ /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
+ loop_count, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != loop_count)
+ emit_move_insn (loop_count, tmp);
+
+ /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */
+ tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
+ destreg, delta, nullptr, 1,
+ OPTAB_DIRECT);
+ if (tmp != destreg)
+ emit_move_insn (destreg, tmp);
+ tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
+ delta, nullptr, 1, OPTAB_DIRECT);
+ if (tmp != srcreg)
+ emit_move_insn (srcreg, tmp);
+
+ /* Stop when LOOP_COUNT <= 4 * MOVE_MAX. */
+ emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
+ GET_MODE (loop_count), 1,
+ loop_4x_vec_backward_label);
+
+ /* Store the first 4 * MOVE_MAX. */
+ ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
+ ARRAY_SIZE (regs), regs, false);
+
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+ }
+ }
+
+ emit_label (done_label);
+
+ return true;
+}
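A minimal C-level sketch of case 6 in the comment above ix86_expand_movmem (not part of the patch), with a 16-byte chunk standing in for MOVE_MAX, count > 8 * 16 assumed, and illustrative names throughout.  Four chunks from the far end of the copy direction are loaded before the loop and stored after it, so the loop itself can use plain unaligned chunk moves even when the buffers overlap.

#include <string.h>

static void
large_memmove_sketch (unsigned char *dst, const unsigned char *src,
		      size_t count)
{
  unsigned char saved[4][16], chunk[64];
  unsigned char *base_dst = dst;
  size_t left = count;

  if (dst == src)
    return;

  if (src > dst)
    {
      /* Forward copy: save the last 4 chunks up front.  */
      for (int i = 0; i < 4; i++)
	memcpy (saved[i], src + count - (i + 1) * 16, 16);
      do
	{
	  /* 4 chunk loads followed by 4 chunk stores in the RTL loop.  */
	  memcpy (chunk, src, 64);
	  memcpy (dst, chunk, 64);
	  dst += 64, src += 64, left -= 64;
	}
      while (left > 64);
      /* Rewrite the tail from the saved chunks.  */
      for (int i = 0; i < 4; i++)
	memcpy (base_dst + count - (i + 1) * 16, saved[i], 16);
    }
  else
    {
      /* Backward copy: save the first 4 chunks up front.  */
      for (int i = 0; i < 4; i++)
	memcpy (saved[i], src + i * 16, 16);
      dst += count, src += count;
      do
	{
	  dst -= 64, src -= 64, left -= 64;
	  memcpy (chunk, src, 64);
	  memcpy (dst, chunk, 64);
	}
      while (left > 64);
      /* Rewrite the head from the saved chunks.  */
      for (int i = 0; i < 4; i++)
	memcpy (base_dst + i * 16, saved[i], 16);
    }
}

When the loop exits, at most 64 bytes remain uncopied at the destination end (forward) or start (backward), and the saved chunks cover exactly that region, which is why the loop condition can stop early instead of handling a partial final iteration.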
+
/* Expand cmpstrn or memcmp. */
bool
extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx,
rtx, rtx, rtx, rtx, bool);
+extern bool ix86_expand_movmem (rtx[]);
extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool);
extern enum reg_class ix86_insn_base_reg_class (rtx_insn *);
(set_attr "length_immediate" "0")
(set_attr "modrm" "0")])
+(define_expand "movmem<mode>"
+ [(use (match_operand:BLK 0 "memory_operand"))
+ (use (match_operand:BLK 1 "memory_operand"))
+ (use (match_operand:SWI48 2 "nonmemory_operand"))
+ (use (match_operand:SWI48 3 "const_int_operand"))
+ (use (match_operand:SI 4 "const_int_operand"))
+ (use (match_operand:SI 5 "const_int_operand"))
+ (use (match_operand:SI 6 ""))
+ (use (match_operand:SI 7 ""))
+ (use (match_operand:SI 8 ""))]
+ ""
+{
+ if (ix86_expand_movmem (operands))
+ DONE;
+ FAIL;
+})
+
(define_expand "cpymem<mode>"
[(use (match_operand:BLK 0 "memory_operand"))
(use (match_operand:BLK 1 "memory_operand"))
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$63, %rdx
+** ja .L12
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$128, %rdx
+** jbe .L13
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L6
+** je .L1
+** movdqu -16\(%rax,%rdx\), %xmm7
+** movdqu -32\(%rax,%rdx\), %xmm6
+** movdqu -48\(%rax,%rdx\), %xmm5
+** movdqu -64\(%rax,%rdx\), %xmm4
+**.L7:
+** movdqu \(%rax\), %xmm3
+** subq \$64, %rsi
+** addq \$64, %rcx
+** addq \$64, %rax
+** movdqu -48\(%rax\), %xmm2
+** movdqu -32\(%rax\), %xmm1
+** movdqu -16\(%rax\), %xmm0
+** movups %xmm3, -64\(%rcx\)
+** movups %xmm2, -48\(%rcx\)
+** movups %xmm1, -32\(%rcx\)
+** movups %xmm0, -16\(%rcx\)
+** cmpq \$64, %rsi
+** ja .L7
+** movups %xmm7, -16\(%rdi,%rdx\)
+** movups %xmm6, -32\(%rdi,%rdx\)
+** movups %xmm5, -48\(%rdi,%rdx\)
+** movups %xmm4, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** movdqu -16\(%rsi,%rdx\), %xmm3
+** movdqu -32\(%rsi,%rdx\), %xmm2
+** movdqu -48\(%rsi,%rdx\), %xmm1
+** movdqu -64\(%rsi,%rdx\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movdqu \(%rax\), %xmm3
+** movdqu 16\(%rax\), %xmm2
+** leaq \(%rdi,%rdx\), %rcx
+** movdqu 32\(%rax\), %xmm1
+** movdqu 48\(%rax\), %xmm0
+** addq %rdx, %rax
+**.L8:
+** movdqu -16\(%rax\), %xmm7
+** movdqu -32\(%rax\), %xmm6
+** subq \$64, %rsi
+** subq \$64, %rcx
+** movdqu -48\(%rax\), %xmm5
+** movdqu -64\(%rax\), %xmm4
+** subq \$64, %rax
+** movups %xmm7, 48\(%rcx\)
+** movups %xmm6, 32\(%rcx\)
+** movups %xmm5, 16\(%rcx\)
+** movups %xmm4, \(%rcx\)
+** cmpq \$64, %rsi
+** ja .L8
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, 32\(%rdi\)
+** movups %xmm0, 48\(%rdi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n >= 64)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_xmm:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** movl \$512, %edx
+** cmpq %rdi, %rsi
+** jb .L5
+** je .L1
+** movdqu 496\(%rsi\), %xmm7
+** movdqu 480\(%rsi\), %xmm6
+** movdqu 464\(%rsi\), %xmm5
+** movdqu 448\(%rsi\), %xmm4
+**.L6:
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** subl \$64, %edx
+** addq \$64, %rax
+** movdqu 32\(%rsi\), %xmm1
+** movdqu 48\(%rsi\), %xmm0
+** addq \$64, %rsi
+** movups %xmm3, -64\(%rax\)
+** movups %xmm2, -48\(%rax\)
+** movups %xmm1, -32\(%rax\)
+** movups %xmm0, -16\(%rax\)
+** cmpl \$64, %edx
+** ja .L6
+** movups %xmm7, 496\(%rdi\)
+** movups %xmm6, 480\(%rdi\)
+** movups %xmm5, 464\(%rdi\)
+** movups %xmm4, 448\(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** leaq 512\(%rdi\), %rax
+** addq \$512, %rsi
+** movdqu -480\(%rsi\), %xmm5
+** movdqu -464\(%rsi\), %xmm4
+**.L7:
+** movdqu -16\(%rsi\), %xmm3
+** subl \$64, %edx
+** subq \$64, %rax
+** subq \$64, %rsi
+** movdqu 32\(%rsi\), %xmm2
+** movdqu 16\(%rsi\), %xmm1
+** movdqu \(%rsi\), %xmm0
+** movups %xmm3, 48\(%rax\)
+** movups %xmm2, 32\(%rax\)
+** movups %xmm1, 16\(%rax\)
+** movups %xmm0, \(%rax\)
+** cmpl \$64, %edx
+** ja .L7
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+**.L1:
+** ret
+** .cfi_endproc
+**...
+*/
+
+#ifndef gcc_memmove
+#define gcc_memmove gcc_memmove_xmm
+#endif
+
+void
+gcc_memmove (void *a, void *b)
+{
+ __builtin_memmove (a, b, 512);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_ymm:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** movl \$512, %edx
+** cmpq %rdi, %rsi
+** jb .L5
+** je .L10
+** vmovdqu 480\(%rsi\), %ymm7
+** vmovdqu 448\(%rsi\), %ymm6
+** vmovdqu 416\(%rsi\), %ymm5
+** vmovdqu 384\(%rsi\), %ymm4
+**.L6:
+** vmovdqu \(%rsi\), %ymm3
+** vmovdqu 32\(%rsi\), %ymm2
+** addl \$-128, %edx
+** subq \$-128, %rax
+** vmovdqu 64\(%rsi\), %ymm1
+** vmovdqu 96\(%rsi\), %ymm0
+** subq \$-128, %rsi
+** vmovdqu %ymm3, -128\(%rax\)
+** vmovdqu %ymm2, -96\(%rax\)
+** vmovdqu %ymm1, -64\(%rax\)
+** vmovdqu %ymm0, -32\(%rax\)
+** cmpl \$128, %edx
+** ja .L6
+** vmovdqu %ymm7, 480\(%rdi\)
+** vmovdqu %ymm6, 448\(%rdi\)
+** vmovdqu %ymm5, 416\(%rdi\)
+** vmovdqu %ymm4, 384\(%rdi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** vmovdqu \(%rsi\), %ymm7
+** vmovdqu 32\(%rsi\), %ymm6
+** leaq 512\(%rdi\), %rax
+** addq \$512, %rsi
+** vmovdqu -448\(%rsi\), %ymm5
+** vmovdqu -416\(%rsi\), %ymm4
+**.L7:
+** vmovdqu -32\(%rsi\), %ymm3
+** addl \$-128, %edx
+** addq \$-128, %rax
+** addq \$-128, %rsi
+** vmovdqu 64\(%rsi\), %ymm2
+** vmovdqu 32\(%rsi\), %ymm1
+** vmovdqu \(%rsi\), %ymm0
+** vmovdqu %ymm3, 96\(%rax\)
+** vmovdqu %ymm2, 64\(%rax\)
+** vmovdqu %ymm1, 32\(%rax\)
+** vmovdqu %ymm0, \(%rax\)
+** cmpl \$128, %edx
+** ja .L7
+** vmovdqu %ymm7, \(%rdi\)
+** vmovdqu %ymm6, 32\(%rdi\)
+** vmovdqu %ymm5, 64\(%rdi\)
+** vmovdqu %ymm4, 96\(%rdi\)
+** vzeroupper
+**.L10:
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_ymm
+#include "builtin-memmove-11a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_zmm:
+**.LFB0:
+** .cfi_startproc
+** vmovdqu64 \(%(e|r)si\), %zmm7
+** vmovdqu64 64\(%(e|r)si\), %zmm6
+** vmovdqu64 128\(%(e|r)si\), %zmm5
+** vmovdqu64 192\(%(e|r)si\), %zmm4
+** vmovdqu64 256\(%(e|r)si\), %zmm3
+** vmovdqu64 320\(%(e|r)si\), %zmm2
+** vmovdqu64 384\(%(e|r)si\), %zmm1
+** vmovdqu64 448\(%(e|r)si\), %zmm0
+** vmovdqu64 %zmm7, \(%(e|r)di\)
+** vmovdqu64 %zmm6, 64\(%(e|r)di\)
+** vmovdqu64 %zmm5, 128\(%(e|r)di\)
+** vmovdqu64 %zmm4, 192\(%(e|r)di\)
+** vmovdqu64 %zmm3, 256\(%(e|r)di\)
+** vmovdqu64 %zmm2, 320\(%(e|r)di\)
+** vmovdqu64 %zmm1, 384\(%(e|r)di\)
+** vmovdqu64 %zmm0, 448\(%(e|r)di\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_zmm
+#include "builtin-memmove-11a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movdqu a\+20\(%rip\), %xmm5
+** movdqu a\+36\(%rip\), %xmm4
+** movdqu a\+52\(%rip\), %xmm3
+** movdqu a\+68\(%rip\), %xmm2
+** movdqu a\+84\(%rip\), %xmm1
+** movdqu a\+100\(%rip\), %xmm0
+** movups %xmm5, a\+24\(%rip\)
+** movq a\+116\(%rip\), %rax
+** movdqu a\+4\(%rip\), %xmm6
+** movups %xmm4, a\+40\(%rip\)
+** movl %edi, a\+4\(%rip\)
+** movq %rax, a\+120\(%rip\)
+** movups %xmm6, a\+8\(%rip\)
+** movups %xmm3, a\+56\(%rip\)
+** movups %xmm2, a\+72\(%rip\)
+** movups %xmm1, a\+88\(%rip\)
+** movups %xmm0, a\+104\(%rip\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define N 32
+
+int a[N];
+
+void
+foo (int x)
+{
+ __builtin_memmove (a + 2, a + 1, sizeof a - 2 * sizeof *a);
+ a[1] = x;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+** .cfi_startproc
+** movl a\+3\(%rip\), %eax
+** movl %eax, a\(%rip\)
+** movzbl a\+7\(%rip\), %eax
+** movb %al, a\+4\(%rip\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+char a[8] = "12345678";
+
+void
+foo (void)
+{
+ __builtin_memmove (a, a + 3, sizeof a - 3);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$64, %rdx
+** jbe .L12
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** cmpl \$16, %edx
+** jnb .L13
+** cmpl \$8, %edx
+** jnb .L6
+** cmpl \$4, %edx
+** jnb .L7
+** cmpl \$1, %edx
+** ja .L8
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** cmpl \$32, %edx
+** ja .L5
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm1
+** movdqu -16\(%rsi,%rdx\), %xmm0
+** movups %xmm1, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** addq %rdx, %rsi
+** movdqu -16\(%rsi\), %xmm1
+** movdqu -32\(%rsi\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movl %edx, %edx
+** movq \(%rsi\), %rcx
+** movq -8\(%rsi,%rdx\), %rax
+** movq %rcx, \(%rdi\)
+** movq %rax, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movl %edx, %edx
+** movl \(%rsi\), %ecx
+** movl -4\(%rsi,%rdx\), %eax
+** movl %ecx, \(%rdi\)
+** movl %eax, -4\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n <= 64)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$66, %rdx
+** jbe .L12
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** cmpl \$16, %edx
+** jnb .L13
+** cmpl \$8, %edx
+** jnb .L6
+** cmpl \$4, %edx
+** jnb .L7
+** cmpl \$1, %edx
+** ja .L8
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** cmpl \$32, %edx
+** ja .L5
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm1
+** movdqu -16\(%rsi,%rdx\), %xmm0
+** movups %xmm1, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** cmpl \$64, %edx
+** jnb .L14
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** addq %rdx, %rsi
+** movdqu -16\(%rsi\), %xmm1
+** movdqu -32\(%rsi\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movl %edx, %edx
+** movq \(%rsi\), %rcx
+** movq -8\(%rsi,%rdx\), %rax
+** movq %rcx, \(%rdi\)
+** movq %rax, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** addq %rdx, %rsi
+** movdqu -16\(%rsi\), %xmm3
+** movdqu -32\(%rsi\), %xmm2
+** movdqu -48\(%rsi\), %xmm1
+** movdqu -64\(%rsi\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movl %edx, %edx
+** movl \(%rsi\), %ecx
+** movl -4\(%rsi,%rdx\), %eax
+** movl %ecx, \(%rdi\)
+** movl %eax, -4\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n <= 66)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**memmove7:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \(%(?:r|e)si\), %edx
+** movl 3\(%(?:r|e)si\), %eax
+** movl %edx, \(%(?:r|e)di\)
+** movl %eax, 3\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove13:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%(?:r|e)si\), %rdx
+** movq 5\(%(?:r|e)si\), %rax
+** movq %rdx, \(%(?:r|e)di\)
+** movq %rax, 5\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove31:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movdqu \(%(?:r|e)si\), %xmm1
+** movdqu 15\(%(?:r|e)si\), %xmm0
+** movups %xmm1, \(%(?:r|e)di\)
+** movups %xmm0, 15\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove39:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movdqu \(%(?:r|e)si\), %xmm1
+** movdqu 16\(%(?:r|e)si\), %xmm0
+** movq 31\(%(?:r|e)si\), %rax
+** movups %xmm0, 16\(%(?:r|e)di\)
+** movups %xmm1, \(%(?:r|e)di\)
+** movq %rax, 31\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove61:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movdqu \(%(?:r|e)si\), %xmm3
+** movdqu 16\(%(?:r|e)si\), %xmm2
+** movdqu 32\(%(?:r|e)si\), %xmm1
+** movdqu 45\(%(?:r|e)si\), %xmm0
+** movups %xmm3, \(%(?:r|e)di\)
+** movups %xmm1, 32\(%(?:r|e)di\)
+** movups %xmm2, 16\(%(?:r|e)di\)
+** movups %xmm0, 45\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove69:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movdqu \(%(?:r|e)si\), %xmm3
+** movdqu 16\(%(?:r|e)si\), %xmm2
+** movdqu 32\(%(?:r|e)si\), %xmm1
+** movdqu 48\(%(?:r|e)si\), %xmm0
+** movq 61\(%(?:r|e)si\), %rax
+** movups %xmm3, \(%(?:r|e)di\)
+** movups %xmm0, 48\(%(?:r|e)di\)
+** movups %xmm2, 16\(%(?:r|e)di\)
+** movq %rax, 61\(%(?:r|e)di\)
+** movups %xmm1, 32\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove93:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movdqu \(%(?:r|e)si\), %xmm5
+** movdqu 16\(%(?:r|e)si\), %xmm4
+** movdqu 32\(%(?:r|e)si\), %xmm3
+** movdqu 48\(%(?:r|e)si\), %xmm2
+** movdqu 64\(%(?:r|e)si\), %xmm1
+** movdqu 77\(%(?:r|e)si\), %xmm0
+** movups %xmm5, \(%(?:r|e)di\)
+** movups %xmm4, 16\(%(?:r|e)di\)
+** movups %xmm1, 64\(%(?:r|e)di\)
+** movups %xmm3, 32\(%(?:r|e)di\)
+** movups %xmm2, 48\(%(?:r|e)di\)
+** movups %xmm0, 77\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+#define TEST(n) \
+ void \
+ memmove##n (void *a, void *b) \
+ { \
+ __builtin_memmove (a, b, n); \
+ }
+
+TEST (7)
+TEST (13)
+TEST (31)
+TEST (39)
+TEST (61)
+TEST (69)
+TEST (93)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**memmove7:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \(%(?:r|e)si\), %edx
+** movl 3\(%(?:r|e)si\), %eax
+** movl %edx, \(%(?:r|e)di\)
+** movl %eax, 3\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove13:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%(?:r|e)si\), %rdx
+** movq 5\(%(?:r|e)si\), %rax
+** movq %rdx, \(%(?:r|e)di\)
+** movq %rax, 5\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove31:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %xmm1
+** vmovdqu 15\(%(?:r|e)si\), %xmm0
+** vmovdqu %xmm1, \(%(?:r|e)di\)
+** vmovdqu %xmm0, 15\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove39:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %ymm0
+** movq 31\(%(?:r|e)si\), %rax
+** vmovdqu %ymm0, \(%(?:r|e)di\)
+** movq %rax, 31\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove61:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %ymm1
+** vmovdqu 29\(%(?:r|e)si\), %ymm0
+** vmovdqu %ymm1, \(%(?:r|e)di\)
+** vmovdqu %ymm0, 29\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove69:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu 32\(%(?:r|e)si\), %ymm0
+** movq 61\(%(?:r|e)si\), %rax
+** vmovdqu \(%(?:r|e)si\), %ymm1
+** vmovdqu %ymm0, 32\(%(?:r|e)di\)
+** movq %rax, 61\(%(?:r|e)di\)
+** vmovdqu %ymm1, \(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove93:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %ymm2
+** vmovdqu 32\(%(?:r|e)si\), %ymm1
+** vmovdqu 61\(%(?:r|e)si\), %ymm0
+** vmovdqu %ymm1, 32\(%(?:r|e)di\)
+** vmovdqu %ymm2, \(%(?:r|e)di\)
+** vmovdqu %ymm0, 61\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+#include "builtin-memmove-1a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } {^\t?\.} } } */
+
+/*
+**memmove7:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \(%(?:r|e)si\), %edx
+** movl 3\(%(?:r|e)si\), %eax
+** movl %edx, \(%(?:r|e)di\)
+** movl %eax, 3\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove13:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%(?:r|e)si\), %rdx
+** movq 5\(%(?:r|e)si\), %rax
+** movq %rdx, \(%(?:r|e)di\)
+** movq %rax, 5\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove31:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %xmm1
+** vmovdqu 15\(%(?:r|e)si\), %xmm0
+** vmovdqu %xmm1, \(%(?:r|e)di\)
+** vmovdqu %xmm0, 15\(%(?:r|e)di\)
+** ret
+**...
+*/
+
+/*
+**memmove39:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %ymm0
+** movq 31\(%(?:r|e)si\), %rax
+** vmovdqu %ymm0, \(%(?:r|e)di\)
+** movq %rax, 31\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove61:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu \(%(?:r|e)si\), %ymm1
+** vmovdqu 29\(%(?:r|e)si\), %ymm0
+** vmovdqu %ymm1, \(%(?:r|e)di\)
+** vmovdqu %ymm0, 29\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove69:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu64 \(%(?:r|e)si\), %zmm0
+** movq 61\(%(?:r|e)si\), %rax
+** vmovdqu64 %zmm0, \(%(?:r|e)di\)
+** movq %rax, 61\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+/*
+**memmove93:
+**.LFB[0-9]+:
+** .cfi_startproc
+** vmovdqu64 \(%(?:r|e)si\), %zmm1
+** vmovdqu 61\(%(?:r|e)si\), %ymm0
+** vmovdqu64 %zmm1, \(%(?:r|e)di\)
+** vmovdqu %ymm0, 61\(%(?:r|e)di\)
+** vzeroupper
+** ret
+**...
+*/
+
+#include "builtin-memmove-1a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only -march=x86-64 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**memmove7:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movl \(%rsi\), %edx
+** movl 3\(%rsi\), %eax
+** movl %edx, \(%rdi\)
+** movl %eax, 3\(%rdi\)
+** ret
+**...
+*/
+
+/*
+**memmove13:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%rsi\), %rdx
+** movq 5\(%rsi\), %rax
+** movq %rdx, \(%rdi\)
+** movq %rax, 5\(%rdi\)
+** ret
+**...
+*/
+
+/*
+**memmove31:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%(e|r)si\), %r8
+** movq 8\(%(e|r)si\), %rcx
+** movq 16\(%(e|r)si\), %rdx
+** movq 23\(%(e|r)si\), %rax
+** movq %r8, \(%(e|r)di\)
+** movq %rdx, 16\(%(e|r)di\)
+** movq %rcx, 8\(%(e|r)di\)
+** movq %rax, 23\(%(e|r)di\)
+** ret
+**...
+*/
+
+/*
+**memmove39:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq \(%rsi\), %r9
+** movq 8\(%rsi\), %r8
+** movq 16\(%rsi\), %rcx
+** movq 24\(%rsi\), %rdx
+** movq 31\(%rsi\), %rax
+** movq %r9, \(%rdi\)
+** movq %rdx, 24\(%rdi\)
+** movq %r8, 8\(%rdi\)
+** movq %rcx, 16\(%rdi\)
+** movq %rax, 31\(%rdi\)
+** ret
+**...
+*/
+
+/*
+**memmove61:
+**.LFB[0-9]+:
+** .cfi_startproc
+** movq 8\(%rsi\), %r11
+** movq 16\(%rsi\), %r10
+** pushq %rbx
+** .cfi_def_cfa_offset 16
+** .cfi_offset 3, -16
+** movq 24\(%rsi\), %r9
+** movq \(%rsi\), %rbx
+** movq 32\(%rsi\), %r8
+** movq 40\(%rsi\), %rcx
+** movq 48\(%rsi\), %rdx
+** movq 53\(%rsi\), %rax
+** movq %rbx, \(%rdi\)
+** movq %r11, 8\(%rdi\)
+** popq %rbx
+** .cfi_def_cfa_offset 8
+** movq %rdx, 48\(%rdi\)
+** movq %r10, 16\(%rdi\)
+** movq %r9, 24\(%rdi\)
+** movq %r8, 32\(%rdi\)
+** movq %rcx, 40\(%rdi\)
+** movq %rax, 53\(%rdi\)
+** ret
+**...
+*/
+
+/*
+**memmove69:
+**.LFB5:
+** .cfi_startproc
+** movq 16\(%rsi\), %r11
+** movq 24\(%rsi\), %r10
+** pushq %rbp
+** .cfi_def_cfa_offset 16
+** .cfi_offset 6, -16
+** movq 32\(%rsi\), %r9
+** movq \(%rsi\), %rbp
+** pushq %rbx
+** .cfi_def_cfa_offset 24
+** .cfi_offset 3, -24
+** movq 40\(%rsi\), %r8
+** movq 8\(%rsi\), %rbx
+** movq 48\(%rsi\), %rcx
+** movq 56\(%rsi\), %rdx
+** movq 61\(%rsi\), %rax
+** movq %rbp, \(%rdi\)
+** movq %rbx, 8\(%rdi\)
+** popq %rbx
+** .cfi_def_cfa_offset 16
+** movq %rdx, 56\(%rdi\)
+** popq %rbp
+** .cfi_def_cfa_offset 8
+** movq %r11, 16\(%rdi\)
+** movq %r10, 24\(%rdi\)
+** movq %r9, 32\(%rdi\)
+** movq %r8, 40\(%rdi\)
+** movq %rcx, 48\(%rdi\)
+** movq %rax, 61\(%rdi\)
+** ret
+**...
+*/
+
+/*
+**memmove93:
+**.LFB[0-9]+:
+** .cfi_startproc
+** sub(l|q) \$24, %(e|r)sp
+** .cfi_def_cfa_offset 32
+** mov(l|q) %(e|r)si, %(e|r)ax
+** movl \$93, %ecx
+** cmp(l|q) %(e|r)di, %(e|r)si
+** jb .L14
+** je .L10
+** movq %rbx, \(%(e|r)sp\)
+** mov(l|q) %(e|r)di, %(e|r)dx
+** movq %r14, 8\(%(e|r)sp\)
+** movq %r15, 16\(%(e|r)sp\)
+** .cfi_offset 3, -32
+** .cfi_offset 14, -24
+** .cfi_offset 15, -16
+** movq 85\(%(e|r)si\), %r14
+** movq 77\(%(e|r)si\), %r15
+** movq 69\(%(e|r)si\), %r10
+** movq 61\(%(e|r)si\), %r11
+**.L15:
+** movq 8\(%(e|r)ax\), %r9
+** movq 16\(%(e|r)ax\), %r8
+** subl \$32, %ecx
+** add(l|q) \$32, %(e|r)dx
+** movq 24\(%(e|r)ax\), %rsi
+** movq \(%(e|r)ax\), %rbx
+** add(l|q) \$32, %(e|r)ax
+** movq %r9, -24\(%(e|r)dx\)
+** movq %rbx, -32\(%(e|r)dx\)
+** movq %r8, -16\(%(e|r)dx\)
+** movq %rsi, -8\(%(e|r)dx\)
+** cmpl \$32, %ecx
+** ja .L15
+** movq %r10, 69\(%(e|r)di\)
+** movq \(%(e|r)sp\), %rbx
+** .cfi_restore 3
+** movq %r11, 61\(%(e|r)di\)
+** movq %r14, 85\(%(e|r)di\)
+** movq 8\(%(e|r)sp\), %r14
+** .cfi_restore 14
+** movq %r15, 77\(%(e|r)di\)
+** movq 16\(%(e|r)sp\), %r15
+** .cfi_restore 15
+**.L10:
+** add(l|q) \$24, %(e|r)sp
+** .cfi_remember_state
+** .cfi_def_cfa_offset 8
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** .cfi_restore_state
+** movq %rbx, \(%(e|r)sp\)
+** lea(l|q) 93\(%(e|r)di\), %(e|r)dx
+** add(l|q) \$93, %(e|r)ax
+** movq %r14, 8\(%(e|r)sp\)
+** movq %r15, 16\(%(e|r)sp\)
+** .cfi_offset 3, -32
+** .cfi_offset 14, -24
+** .cfi_offset 15, -16
+** movq \(%(e|r)si\), %r14
+** movq 8\(%(e|r)si\), %r15
+** movq 16\(%(e|r)si\), %r10
+** movq 24\(%(e|r)si\), %r11
+**.L16:
+** movq -16\(%(e|r)ax\), %r9
+** movq -24\(%(e|r)ax\), %r8
+** subl \$32, %ecx
+** sub(l|q) \$32, %(e|r)dx
+** movq -32\(%(e|r)ax\), %rsi
+** movq -8\(%(e|r)ax\), %rbx
+** sub(l|q) \$32, %(e|r)ax
+** movq %r9, 16\(%(e|r)dx\)
+** movq %rbx, 24\(%(e|r)dx\)
+** movq %r8, 8\(%(e|r)dx\)
+** movq %rsi, \(%(e|r)dx\)
+** cmpl \$32, %ecx
+** ja .L16
+** movq %r14, \(%(e|r)di\)
+** movq \(%(e|r)sp\), %rbx
+** .cfi_restore 3
+** movq %r15, 8\(%(e|r)di\)
+** movq 8\(%(e|r)sp\), %r14
+** .cfi_restore 14
+** movq %r10, 16\(%(e|r)di\)
+** movq 16\(%(e|r)sp\), %r15
+** .cfi_restore 15
+** movq %r11, 24\(%(e|r)di\)
+** add(l|q) \$24, %(e|r)sp
+** .cfi_def_cfa_offset 8
+** ret
+**...
+*/
+
+#include "builtin-memmove-1a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_xmm:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** movq %rsi, %rcx
+** cmpq \$16, %rdx
+** jb .L3
+** cmpq \$32, %rdx
+** jbe .L17
+** cmpq \$128, %rdx
+** jbe .L18
+** movq %rdx, %rsi
+** cmpq %rdi, %rcx
+** jb .L11
+** je .L2
+** movdqu -16\(%rcx,%rdx\), %xmm7
+** movdqu -32\(%rcx,%rdx\), %xmm6
+** movdqu -48\(%rcx,%rdx\), %xmm5
+** movdqu -64\(%rcx,%rdx\), %xmm4
+**.L12:
+** movdqu \(%rcx\), %xmm3
+** subq \$64, %rsi
+** addq \$64, %rdi
+** addq \$64, %rcx
+** movdqu -48\(%rcx\), %xmm2
+** movdqu -32\(%rcx\), %xmm1
+** movdqu -16\(%rcx\), %xmm0
+** movups %xmm3, -64\(%rdi\)
+** movups %xmm2, -48\(%rdi\)
+** movups %xmm1, -32\(%rdi\)
+** movups %xmm0, -16\(%rdi\)
+** cmpq \$64, %rsi
+** ja .L12
+** movups %xmm7, -16\(%rax,%rdx\)
+** movups %xmm6, -32\(%rax,%rdx\)
+** movups %xmm5, -48\(%rax,%rdx\)
+** movups %xmm4, -64\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L3:
+** cmpq \$8, %rdx
+** jb .L19
+** movq \(%rsi\), %rdi
+** movq -8\(%rsi,%rdx\), %rcx
+** movq %rdi, \(%rax\)
+** movq %rcx, -8\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** cmpq \$4, %rdx
+** jnb .L6
+** cmpq \$1, %rdx
+** ja .L7
+** jb .L2
+** movzbl \(%rsi\), %edx
+** movb %dl, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L17:
+** movdqu \(%rsi\), %xmm1
+** movdqu -16\(%rsi,%rdx\), %xmm0
+** movups %xmm1, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L18:
+** cmpq \$64, %rdx
+** jb .L10
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** movdqu -16\(%rsi,%rdx\), %xmm3
+** movdqu -32\(%rsi,%rdx\), %xmm2
+** movdqu -48\(%rsi,%rdx\), %xmm1
+** movdqu -64\(%rsi,%rdx\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movl \(%rsi\), %edi
+** movl -4\(%rsi,%rdx\), %ecx
+** movl %edi, \(%rax\)
+** movl %ecx, -4\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L11:
+** movdqu \(%rcx\), %xmm7
+** movdqu 16\(%rcx\), %xmm6
+** leaq \(%rdi,%rdx\), %rdi
+** movdqu 32\(%rcx\), %xmm5
+** movdqu 48\(%rcx\), %xmm4
+** addq %rdx, %rcx
+**.L13:
+** movdqu -16\(%rcx\), %xmm3
+** movdqu -32\(%rcx\), %xmm2
+** subq \$64, %rsi
+** subq \$64, %rdi
+** movdqu -48\(%rcx\), %xmm1
+** movdqu -64\(%rcx\), %xmm0
+** subq \$64, %rcx
+** movups %xmm3, 48\(%rdi\)
+** movups %xmm2, 32\(%rdi\)
+** movups %xmm1, 16\(%rdi\)
+** movups %xmm0, \(%rdi\)
+** cmpq \$64, %rsi
+** ja .L13
+** movups %xmm7, \(%rax\)
+** movups %xmm6, 16\(%rax\)
+** movups %xmm5, 32\(%rax\)
+** movups %xmm4, 48\(%rax\)
+**.L2:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L10:
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** movdqu -16\(%rsi,%rdx\), %xmm1
+** movdqu -32\(%rsi,%rdx\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movzwl \(%rsi\), %edi
+** movzwl -2\(%rsi,%rdx\), %ecx
+** movw %di, \(%rax\)
+** movw %cx, -2\(%rax,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#ifndef gcc_memmove
+#define gcc_memmove gcc_memmove_xmm
+#endif
+
+void *
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ return __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_ymm:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** movq %rsi, %rcx
+** cmpq \$32, %rdx
+** jb .L3
+** cmpq \$64, %rdx
+** jbe .L18
+** cmpq \$256, %rdx
+** jbe .L19
+** movq %rdx, %rsi
+** cmpq %rdi, %rcx
+** jb .L12
+** je .L2
+** vmovdqu -32\(%rcx,%rdx\), %ymm7
+** vmovdqu -64\(%rcx,%rdx\), %ymm6
+** vmovdqu -96\(%rcx,%rdx\), %ymm5
+** vmovdqu -128\(%rcx,%rdx\), %ymm4
+**.L13:
+** vmovdqu \(%rcx\), %ymm3
+** addq \$-128, %rsi
+** subq \$-128, %rdi
+** subq \$-128, %rcx
+** vmovdqu -96\(%rcx\), %ymm2
+** vmovdqu -64\(%rcx\), %ymm1
+** vmovdqu -32\(%rcx\), %ymm0
+** vmovdqu %ymm3, -128\(%rdi\)
+** vmovdqu %ymm2, -96\(%rdi\)
+** vmovdqu %ymm1, -64\(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi\)
+** cmpq \$128, %rsi
+** ja .L13
+** vmovdqu %ymm7, -32\(%rax,%rdx\)
+** vmovdqu %ymm6, -64\(%rax,%rdx\)
+** vmovdqu %ymm5, -96\(%rax,%rdx\)
+** vmovdqu %ymm4, -128\(%rax,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L3:
+** cmpq \$16, %rdx
+** jb .L20
+** vmovdqu \(%rsi\), %xmm1
+** vmovdqu -16\(%rsi,%rdx\), %xmm0
+** vmovdqu %xmm1, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** cmpq \$8, %rdx
+** jnb .L6
+** cmpq \$4, %rdx
+** jnb .L7
+** cmpq \$1, %rdx
+** ja .L8
+** jb .L2
+** movzbl \(%rsi\), %edx
+** movb %dl, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L18:
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** cmpq \$128, %rdx
+** jb .L11
+** vmovdqu \(%rsi\), %ymm7
+** vmovdqu 32\(%rsi\), %ymm6
+** vmovdqu 64\(%rsi\), %ymm5
+** vmovdqu 96\(%rsi\), %ymm4
+** vmovdqu -32\(%rsi,%rdx\), %ymm3
+** vmovdqu -64\(%rsi,%rdx\), %ymm2
+** vmovdqu -96\(%rsi,%rdx\), %ymm1
+** vmovdqu -128\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm7, \(%rdi\)
+** vmovdqu %ymm6, 32\(%rdi\)
+** vmovdqu %ymm5, 64\(%rdi\)
+** vmovdqu %ymm4, 96\(%rdi\)
+** vmovdqu %ymm3, -32\(%rdi,%rdx\)
+** vmovdqu %ymm2, -64\(%rdi,%rdx\)
+** vmovdqu %ymm1, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movq \(%rsi\), %rdi
+** movq -8\(%rsi,%rdx\), %rcx
+** movq %rdi, \(%rax\)
+** movq %rcx, -8\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** vmovdqu \(%rcx\), %ymm7
+** vmovdqu 32\(%rcx\), %ymm6
+** leaq \(%rdi,%rdx\), %rdi
+** vmovdqu 64\(%rcx\), %ymm5
+** vmovdqu 96\(%rcx\), %ymm4
+** addq %rdx, %rcx
+**.L14:
+** vmovdqu -32\(%rcx\), %ymm3
+** vmovdqu -64\(%rcx\), %ymm2
+** addq \$-128, %rsi
+** addq \$-128, %rdi
+** vmovdqu -96\(%rcx\), %ymm1
+** vmovdqu -128\(%rcx\), %ymm0
+** addq \$-128, %rcx
+** vmovdqu %ymm3, 96\(%rdi\)
+** vmovdqu %ymm2, 64\(%rdi\)
+** vmovdqu %ymm1, 32\(%rdi\)
+** vmovdqu %ymm0, \(%rdi\)
+** cmpq \$128, %rsi
+** ja .L14
+** vmovdqu %ymm7, \(%rax\)
+** vmovdqu %ymm6, 32\(%rax\)
+** vmovdqu %ymm5, 64\(%rax\)
+** vmovdqu %ymm4, 96\(%rax\)
+** vzeroupper
+**.L2:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L11:
+** vmovdqu \(%rsi\), %ymm3
+** vmovdqu 32\(%rsi\), %ymm2
+** vmovdqu -32\(%rsi,%rdx\), %ymm1
+** vmovdqu -64\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, -32\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movl \(%rsi\), %edi
+** movl -4\(%rsi,%rdx\), %ecx
+** movl %edi, \(%rax\)
+** movl %ecx, -4\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movzwl \(%rsi\), %edi
+** movzwl -2\(%rsi,%rdx\), %ecx
+** movw %di, \(%rax\)
+** movw %cx, -2\(%rax,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_ymm
+#include "builtin-memmove-2a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_zmm:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** movq %rsi, %rcx
+** cmpq \$64, %rdx
+** jb .L3
+** cmpq \$128, %rdx
+** jbe .L19
+** cmpq \$512, %rdx
+** jbe .L20
+** movq %rdx, %rsi
+** cmpq %rdi, %rcx
+** jb .L13
+** je .L2
+** vmovdqu64 -64\(%rcx,%rdx\), %zmm7
+** vmovdqu64 -128\(%rcx,%rdx\), %zmm6
+** vmovdqu64 -192\(%rcx,%rdx\), %zmm5
+** vmovdqu64 -256\(%rcx,%rdx\), %zmm4
+**.L14:
+** vmovdqu64 \(%rcx\), %zmm3
+** vmovdqu64 64\(%rcx\), %zmm2
+** subq \$256, %rsi
+** addq \$256, %rdi
+** vmovdqu64 128\(%rcx\), %zmm1
+** addq \$256, %rcx
+** vmovdqu64 -64\(%rcx\), %zmm0
+** vmovdqu64 %zmm3, -256\(%rdi\)
+** vmovdqu64 %zmm2, -192\(%rdi\)
+** vmovdqu64 %zmm1, -128\(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi\)
+** cmpq \$256, %rsi
+** ja .L14
+** vmovdqu64 %zmm7, -64\(%rax,%rdx\)
+** vmovdqu64 %zmm6, -128\(%rax,%rdx\)
+** vmovdqu64 %zmm5, -192\(%rax,%rdx\)
+** vmovdqu64 %zmm4, -256\(%rax,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L3:
+** cmpq \$32, %rdx
+** jb .L21
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** cmpq \$16, %rdx
+** jnb .L6
+** cmpq \$8, %rdx
+** jnb .L7
+** cmpq \$4, %rdx
+** jnb .L8
+** cmpq \$1, %rdx
+** ja .L9
+** jb .L2
+** movzbl \(%rsi\), %edx
+** movb %dl, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** vmovdqu64 \(%rsi\), %zmm1
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm1, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** cmpq \$256, %rdx
+** jb .L12
+** vmovdqu64 \(%rsi\), %zmm7
+** vmovdqu64 64\(%rsi\), %zmm6
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm2
+** vmovdqu64 128\(%rsi\), %zmm5
+** vmovdqu64 192\(%rsi\), %zmm4
+** vmovdqu64 -192\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -256\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm7, \(%rdi\)
+** vmovdqu64 %zmm6, 64\(%rdi\)
+** vmovdqu64 %zmm5, 128\(%rdi\)
+** vmovdqu64 %zmm4, 192\(%rdi\)
+** vmovdqu64 %zmm3, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm2, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm1, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** vmovdqu \(%rsi\), %xmm1
+** vmovdqu -16\(%rsi,%rdx\), %xmm0
+** vmovdqu %xmm1, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** vmovdqu64 \(%rcx\), %zmm7
+** leaq \(%rdi,%rdx\), %rdi
+** vmovdqu64 64\(%rcx\), %zmm6
+** vmovdqu64 128\(%rcx\), %zmm5
+** vmovdqu64 192\(%rcx\), %zmm4
+** addq %rdx, %rcx
+**.L15:
+** vmovdqu64 -64\(%rcx\), %zmm3
+** vmovdqu64 -128\(%rcx\), %zmm2
+** subq \$256, %rsi
+** subq \$256, %rdi
+** vmovdqu64 -192\(%rcx\), %zmm1
+** subq \$256, %rcx
+** vmovdqu64 \(%rcx\), %zmm0
+** vmovdqu64 %zmm3, 192\(%rdi\)
+** vmovdqu64 %zmm2, 128\(%rdi\)
+** vmovdqu64 %zmm1, 64\(%rdi\)
+** vmovdqu64 %zmm0, \(%rdi\)
+** cmpq \$256, %rsi
+** ja .L15
+** vmovdqu64 %zmm7, \(%rax\)
+** vmovdqu64 %zmm6, 64\(%rax\)
+** vmovdqu64 %zmm5, 128\(%rax\)
+** vmovdqu64 %zmm4, 192\(%rax\)
+** vzeroupper
+**.L2:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** vmovdqu64 \(%rsi\), %zmm3
+** vmovdqu64 64\(%rsi\), %zmm2
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movq \(%rsi\), %rdi
+** movq -8\(%rsi,%rdx\), %rcx
+** movq %rdi, \(%rax\)
+** movq %rcx, -8\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movl \(%rsi\), %edi
+** movl -4\(%rsi,%rdx\), %ecx
+** movl %edi, \(%rax\)
+** movl %ecx, -4\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** movzwl \(%rsi\), %edi
+** movzwl -2\(%rsi,%rdx\), %ecx
+** movw %di, \(%rax\)
+** movw %cx, -2\(%rax,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_zmm
+#include "builtin-memmove-2a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only -march=x86-64 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_gpr:
+**.LFB0:
+** .cfi_startproc
+** movq %rdi, %rax
+** cmpq \$8, %rdx
+** jb .L3
+** cmpq \$16, %rdx
+** jbe .L19
+** subq \$32, %rsp
+** .cfi_def_cfa_offset 40
+** cmpq \$64, %rdx
+** jbe .L20
+** movq %rsi, %rcx
+** movq %rdx, %rsi
+** cmpq %rdi, %rcx
+** jb .L10
+** je .L2
+** movq %rbx, \(%rsp\)
+** movq %rbp, 8\(%rsp\)
+** movq %r14, 16\(%rsp\)
+** movq %r15, 24\(%rsp\)
+** .cfi_offset 3, -40
+** .cfi_offset 6, -32
+** .cfi_offset 14, -24
+** .cfi_offset 15, -16
+** movq -8\(%rcx,%rdx\), %r15
+** movq -16\(%rcx,%rdx\), %r14
+** movq -24\(%rcx,%rdx\), %rbp
+** movq -32\(%rcx,%rdx\), %r11
+**.L11:
+** movq 8\(%rcx\), %r10
+** movq 16\(%rcx\), %r9
+** subq \$32, %rsi
+** addq \$32, %rdi
+** movq 24\(%rcx\), %r8
+** movq \(%rcx\), %rbx
+** addq \$32, %rcx
+** movq %r10, -24\(%rdi\)
+** movq %rbx, -32\(%rdi\)
+** movq %r9, -16\(%rdi\)
+** movq %r8, -8\(%rdi\)
+** cmpq \$32, %rsi
+** ja .L11
+** movq %r15, -8\(%rax,%rdx\)
+** movq %r14, -16\(%rax,%rdx\)
+** movq %rbp, -24\(%rax,%rdx\)
+** movq %r11, -32\(%rax,%rdx\)
+** movq \(%rsp\), %rbx
+** .cfi_restore 3
+** movq 8\(%rsp\), %rbp
+** .cfi_restore 6
+** movq 16\(%rsp\), %r14
+** .cfi_restore 14
+** movq 24\(%rsp\), %r15
+** .cfi_restore 15
+** jmp .L2
+** .p2align 4,,10
+** .p2align 3
+**.L3:
+** .cfi_def_cfa_offset 8
+** cmpq \$4, %rdx
+** jb .L21
+** movl \(%rsi\), %edi
+** movl -4\(%rsi,%rdx\), %ecx
+** movl %edi, \(%rax\)
+** movl %ecx, -4\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L21:
+** cmpq \$1, %rdx
+** ja .L6
+** jb .L16
+** movzbl \(%rsi\), %edx
+** movb %dl, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** movq \(%rsi\), %rdi
+** movq -8\(%rsi,%rdx\), %rcx
+** movq %rdi, \(%rax\)
+** movq %rcx, -8\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** .cfi_def_cfa_offset 40
+** cmpq \$32, %rdx
+** jb .L9
+** movq %rbx, \(%rsp\)
+** movq %r14, 16\(%rsp\)
+** .cfi_offset 3, -40
+** .cfi_offset 14, -24
+** movq \(%rsi\), %rbx
+** movq 8\(%rsi\), %r14
+** movq 16\(%rsi\), %r11
+** movq 24\(%rsi\), %r10
+** movq -8\(%rsi,%rdx\), %r9
+** movq -16\(%rsi,%rdx\), %r8
+** movq -24\(%rsi,%rdx\), %rdi
+** movq -32\(%rsi,%rdx\), %rcx
+** movq %rbx, \(%rax\)
+** movq %r14, 8\(%rax\)
+** movq %r11, 16\(%rax\)
+** movq %r10, 24\(%rax\)
+** movq %r9, -8\(%rax,%rdx\)
+** movq %r8, -16\(%rax,%rdx\)
+** movq %rdi, -24\(%rax,%rdx\)
+** movq %rcx, -32\(%rax,%rdx\)
+** movq \(%rsp\), %rbx
+** .cfi_restore 3
+** movq 16\(%rsp\), %r14
+** .cfi_restore 14
+**.L2:
+** addq \$32, %rsp
+** .cfi_def_cfa_offset 8
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movzwl \(%rsi\), %edi
+** movzwl -2\(%rsi,%rdx\), %ecx
+** movw %di, \(%rax\)
+** movw %cx, -2\(%rax,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L16:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** .cfi_def_cfa_offset 40
+** movq \(%rsi\), %r9
+** movq 8\(%rsi\), %r8
+** movq -8\(%rsi,%rdx\), %rdi
+** movq -16\(%rsi,%rdx\), %rcx
+** movq %r9, \(%rax\)
+** movq %r8, 8\(%rax\)
+** movq %rdi, -8\(%rax,%rdx\)
+** movq %rcx, -16\(%rax,%rdx\)
+** jmp .L2
+** .p2align 4,,10
+** .p2align 3
+**.L10:
+** movq %rbx, \(%rsp\)
+** leaq \(%rdi,%rdx\), %rdi
+** movq %r14, 16\(%rsp\)
+** movq %r15, 24\(%rsp\)
+** .cfi_offset 3, -40
+** .cfi_offset 14, -24
+** .cfi_offset 15, -16
+** movq \(%rcx\), %r14
+** movq 8\(%rcx\), %r15
+** movq 16\(%rcx\), %r10
+** movq 24\(%rcx\), %r11
+** addq %rdx, %rcx
+**.L12:
+** movq -16\(%rcx\), %r9
+** movq -24\(%rcx\), %r8
+** subq \$32, %rsi
+** subq \$32, %rdi
+** movq -32\(%rcx\), %rdx
+** movq -8\(%rcx\), %rbx
+** subq \$32, %rcx
+** movq %r9, 16\(%rdi\)
+** movq %rbx, 24\(%rdi\)
+** movq %r8, 8\(%rdi\)
+** movq %rdx, \(%rdi\)
+** cmpq \$32, %rsi
+** ja .L12
+** movq %r14, \(%rax\)
+** movq \(%rsp\), %rbx
+** .cfi_restore 3
+** movq %r15, 8\(%rax\)
+** movq 16\(%rsp\), %r14
+** .cfi_restore 14
+** movq %r10, 16\(%rax\)
+** movq 24\(%rsp\), %r15
+** .cfi_restore 15
+** movq %r11, 24\(%rax\)
+** jmp .L2
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_gpr
+#include "builtin-memmove-2a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_xmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$16, %rdx
+** ja .L13
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$32, %rdx
+** jbe .L14
+** cmpq \$128, %rdx
+** ja .L5
+** cmpq \$64, %rdx
+** jnb .L15
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** movdqu -16\(%rsi,%rdx\), %xmm1
+** movdqu -32\(%rsi,%rdx\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** movdqu \(%rsi\), %xmm1
+** movdqu -16\(%rsi,%rdx\), %xmm0
+** movups %xmm1, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L7
+** je .L1
+** movdqu -16\(%rax,%rdx\), %xmm7
+** movdqu -32\(%rax,%rdx\), %xmm6
+** movdqu -48\(%rax,%rdx\), %xmm5
+** movdqu -64\(%rax,%rdx\), %xmm4
+**.L8:
+** movdqu \(%rax\), %xmm3
+** subq \$64, %rsi
+** addq \$64, %rcx
+** addq \$64, %rax
+** movdqu -48\(%rax\), %xmm2
+** movdqu -32\(%rax\), %xmm1
+** movdqu -16\(%rax\), %xmm0
+** movups %xmm3, -64\(%rcx\)
+** movups %xmm2, -48\(%rcx\)
+** movups %xmm1, -32\(%rcx\)
+** movups %xmm0, -16\(%rcx\)
+** cmpq \$64, %rsi
+** ja .L8
+** movups %xmm7, -16\(%rdi,%rdx\)
+** movups %xmm6, -32\(%rdi,%rdx\)
+** movups %xmm5, -48\(%rdi,%rdx\)
+** movups %xmm4, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movdqu \(%rax\), %xmm3
+** movdqu 16\(%rax\), %xmm2
+** leaq \(%rdi,%rdx\), %rcx
+** movdqu 32\(%rax\), %xmm1
+** movdqu 48\(%rax\), %xmm0
+** addq %rdx, %rax
+**.L9:
+** movdqu -16\(%rax\), %xmm7
+** movdqu -32\(%rax\), %xmm6
+** subq \$64, %rsi
+** subq \$64, %rcx
+** movdqu -48\(%rax\), %xmm5
+** movdqu -64\(%rax\), %xmm4
+** subq \$64, %rax
+** movups %xmm7, 48\(%rcx\)
+** movups %xmm6, 32\(%rcx\)
+** movups %xmm5, 16\(%rcx\)
+** movups %xmm4, \(%rcx\)
+** cmpq \$64, %rsi
+** ja .L9
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, 32\(%rdi\)
+** movups %xmm0, 48\(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L15:
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** movdqu -16\(%rsi,%rdx\), %xmm3
+** movdqu -32\(%rsi,%rdx\), %xmm2
+** movdqu -48\(%rsi,%rdx\), %xmm1
+** movdqu -64\(%rsi,%rdx\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#ifndef gcc_memmove
+#define gcc_memmove gcc_memmove_xmm
+#endif
+
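+/* The n > 16 guard gives the expander a known minimum size, so the inline
+   expansion omits the 1/2/4/8-byte tail cases and starts with a pair of
+   overlapping 16-byte moves.  */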
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n > 16)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_ymm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$16, %rdx
+** ja .L16
+**.L14:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L16:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$32, %rdx
+** jb .L6
+** cmpq \$64, %rdx
+** ja .L5
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** vmovdqu \(%rsi\), %xmm1
+** vmovdqu -16\(%rsi,%rdx\), %xmm0
+** vmovdqu %xmm1, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** cmpq \$256, %rdx
+** jbe .L17
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L9
+** je .L14
+** vmovdqu -32\(%rax,%rdx\), %ymm7
+** vmovdqu -64\(%rax,%rdx\), %ymm6
+** vmovdqu -96\(%rax,%rdx\), %ymm5
+** vmovdqu -128\(%rax,%rdx\), %ymm4
+**.L10:
+** vmovdqu \(%rax\), %ymm3
+** addq \$-128, %rsi
+** subq \$-128, %rcx
+** subq \$-128, %rax
+** vmovdqu -96\(%rax\), %ymm2
+** vmovdqu -64\(%rax\), %ymm1
+** vmovdqu -32\(%rax\), %ymm0
+** vmovdqu %ymm3, -128\(%rcx\)
+** vmovdqu %ymm2, -96\(%rcx\)
+** vmovdqu %ymm1, -64\(%rcx\)
+** vmovdqu %ymm0, -32\(%rcx\)
+** cmpq \$128, %rsi
+** ja .L10
+** vmovdqu %ymm7, -32\(%rdi,%rdx\)
+** vmovdqu %ymm6, -64\(%rdi,%rdx\)
+** vmovdqu %ymm5, -96\(%rdi,%rdx\)
+** vmovdqu %ymm4, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L17:
+** cmpq \$128, %rdx
+** jb .L8
+** vmovdqu \(%rsi\), %ymm7
+** vmovdqu 32\(%rsi\), %ymm6
+** vmovdqu 64\(%rsi\), %ymm5
+** vmovdqu 96\(%rsi\), %ymm4
+** vmovdqu -32\(%rsi,%rdx\), %ymm3
+** vmovdqu -64\(%rsi,%rdx\), %ymm2
+** vmovdqu -96\(%rsi,%rdx\), %ymm1
+** vmovdqu -128\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm7, \(%rdi\)
+** vmovdqu %ymm6, 32\(%rdi\)
+** vmovdqu %ymm5, 64\(%rdi\)
+** vmovdqu %ymm4, 96\(%rdi\)
+** vmovdqu %ymm3, -32\(%rdi,%rdx\)
+** vmovdqu %ymm2, -64\(%rdi,%rdx\)
+** vmovdqu %ymm1, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** vmovdqu \(%rsi\), %ymm3
+** vmovdqu 32\(%rsi\), %ymm2
+** vmovdqu -32\(%rsi,%rdx\), %ymm1
+** vmovdqu -64\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, -32\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** vmovdqu \(%rax\), %ymm3
+** vmovdqu 32\(%rax\), %ymm2
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu 64\(%rax\), %ymm1
+** vmovdqu 96\(%rax\), %ymm0
+** addq %rdx, %rax
+**.L11:
+** vmovdqu -32\(%rax\), %ymm7
+** vmovdqu -64\(%rax\), %ymm6
+** addq \$-128, %rsi
+** addq \$-128, %rcx
+** vmovdqu -96\(%rax\), %ymm5
+** vmovdqu -128\(%rax\), %ymm4
+** addq \$-128, %rax
+** vmovdqu %ymm7, 96\(%rcx\)
+** vmovdqu %ymm6, 64\(%rcx\)
+** vmovdqu %ymm5, 32\(%rcx\)
+** vmovdqu %ymm4, \(%rcx\)
+** cmpq \$128, %rsi
+** ja .L11
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, 64\(%rdi\)
+** vmovdqu %ymm0, 96\(%rdi\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_ymm
+#include "builtin-memmove-3a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_zmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$16, %rdx
+** ja .L18
+**.L16:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L18:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$64, %rdx
+** jnb .L19
+** cmpq \$32, %rdx
+** jb .L15
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L19:
+** cmpq \$128, %rdx
+** ja .L5
+** vmovdqu64 \(%rsi\), %zmm1
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm1, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** cmpq \$512, %rdx
+** jbe .L20
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L10
+** je .L16
+** vmovdqu64 -64\(%rax,%rdx\), %zmm7
+** vmovdqu64 -128\(%rax,%rdx\), %zmm6
+** vmovdqu64 -192\(%rax,%rdx\), %zmm5
+** vmovdqu64 -256\(%rax,%rdx\), %zmm4
+**.L11:
+** vmovdqu64 \(%rax\), %zmm3
+** addq \$256, %rax
+** vmovdqu64 -192\(%rax\), %zmm2
+** subq \$256, %rsi
+** vmovdqu64 -128\(%rax\), %zmm1
+** vmovdqu64 -64\(%rax\), %zmm0
+** addq \$256, %rcx
+** vmovdqu64 %zmm3, -256\(%rcx\)
+** vmovdqu64 %zmm2, -192\(%rcx\)
+** vmovdqu64 %zmm1, -128\(%rcx\)
+** vmovdqu64 %zmm0, -64\(%rcx\)
+** cmpq \$256, %rsi
+** ja .L11
+** vmovdqu64 %zmm7, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm6, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm5, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm4, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L15:
+** vmovdqu \(%rsi\), %xmm1
+** vmovdqu -16\(%rsi,%rdx\), %xmm0
+** vmovdqu %xmm1, \(%rdi\)
+** vmovdqu %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L20:
+** cmpq \$256, %rdx
+** jb .L9
+** vmovdqu64 \(%rsi\), %zmm7
+** vmovdqu64 64\(%rsi\), %zmm6
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm2
+** vmovdqu64 128\(%rsi\), %zmm5
+** vmovdqu64 192\(%rsi\), %zmm4
+** vmovdqu64 -192\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -256\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm7, \(%rdi\)
+** vmovdqu64 %zmm6, 64\(%rdi\)
+** vmovdqu64 %zmm5, 128\(%rdi\)
+** vmovdqu64 %zmm4, 192\(%rdi\)
+** vmovdqu64 %zmm3, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm2, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm1, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** vmovdqu64 \(%rsi\), %zmm3
+** vmovdqu64 64\(%rsi\), %zmm2
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L10:
+** vmovdqu64 \(%rax\), %zmm3
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu64 64\(%rax\), %zmm2
+** vmovdqu64 128\(%rax\), %zmm1
+** vmovdqu64 192\(%rax\), %zmm0
+** addq %rdx, %rax
+**.L12:
+** vmovdqu64 -64\(%rax\), %zmm7
+** subq \$256, %rax
+** vmovdqu64 128\(%rax\), %zmm6
+** subq \$256, %rsi
+** vmovdqu64 64\(%rax\), %zmm5
+** vmovdqu64 \(%rax\), %zmm4
+** subq \$256, %rcx
+** vmovdqu64 %zmm7, 192\(%rcx\)
+** vmovdqu64 %zmm6, 128\(%rcx\)
+** vmovdqu64 %zmm5, 64\(%rcx\)
+** vmovdqu64 %zmm4, \(%rcx\)
+** cmpq \$256, %rsi
+** ja .L12
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, 128\(%rdi\)
+** vmovdqu64 %zmm0, 192\(%rdi\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_zmm
+#include "builtin-memmove-3a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_xmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$32, %rdx
+** ja .L13
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$128, %rdx
+** jbe .L14
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L7
+** je .L1
+** movdqu -16\(%rax,%rdx\), %xmm7
+** movdqu -32\(%rax,%rdx\), %xmm6
+** movdqu -48\(%rax,%rdx\), %xmm5
+** movdqu -64\(%rax,%rdx\), %xmm4
+**.L8:
+** movdqu \(%rax\), %xmm3
+** subq \$64, %rsi
+** addq \$64, %rcx
+** addq \$64, %rax
+** movdqu -48\(%rax\), %xmm2
+** movdqu -32\(%rax\), %xmm1
+** movdqu -16\(%rax\), %xmm0
+** movups %xmm3, -64\(%rcx\)
+** movups %xmm2, -48\(%rcx\)
+** movups %xmm1, -32\(%rcx\)
+** movups %xmm0, -16\(%rcx\)
+** cmpq \$64, %rsi
+** ja .L8
+** movups %xmm7, -16\(%rdi,%rdx\)
+** movups %xmm6, -32\(%rdi,%rdx\)
+** movups %xmm5, -48\(%rdi,%rdx\)
+** movups %xmm4, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** cmpq \$64, %rdx
+** jb .L6
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** movdqu -16\(%rsi,%rdx\), %xmm3
+** movdqu -32\(%rsi,%rdx\), %xmm2
+** movdqu -48\(%rsi,%rdx\), %xmm1
+** movdqu -64\(%rsi,%rdx\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** movdqu -16\(%rsi,%rdx\), %xmm1
+** movdqu -32\(%rsi,%rdx\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movdqu \(%rax\), %xmm3
+** movdqu 16\(%rax\), %xmm2
+** leaq \(%rdi,%rdx\), %rcx
+** movdqu 32\(%rax\), %xmm1
+** movdqu 48\(%rax\), %xmm0
+** addq %rdx, %rax
+**.L9:
+** movdqu -16\(%rax\), %xmm7
+** movdqu -32\(%rax\), %xmm6
+** subq \$64, %rsi
+** subq \$64, %rcx
+** movdqu -48\(%rax\), %xmm5
+** movdqu -64\(%rax\), %xmm4
+** subq \$64, %rax
+** movups %xmm7, 48\(%rcx\)
+** movups %xmm6, 32\(%rcx\)
+** movups %xmm5, 16\(%rcx\)
+** movups %xmm4, \(%rcx\)
+** cmpq \$64, %rsi
+** ja .L9
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, 32\(%rdi\)
+** movups %xmm0, 48\(%rdi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#ifndef gcc_memmove
+#define gcc_memmove gcc_memmove_xmm
+#endif
+
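+/* Sizes of at most 32 bytes just return, so the scalar and two-XMM
+   overlapping-move cases are not emitted in the inline expansion.  */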
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n > 32)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_ymm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$32, %rdx
+** ja .L14
+**.L12:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$64, %rdx
+** jbe .L15
+** cmpq \$256, %rdx
+** ja .L5
+** cmpq \$128, %rdx
+** jnb .L16
+** vmovdqu \(%rsi\), %ymm3
+** vmovdqu 32\(%rsi\), %ymm2
+** vmovdqu -32\(%rsi,%rdx\), %ymm1
+** vmovdqu -64\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, -32\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L15:
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L7
+** je .L12
+** vmovdqu -32\(%rax,%rdx\), %ymm7
+** vmovdqu -64\(%rax,%rdx\), %ymm6
+** vmovdqu -96\(%rax,%rdx\), %ymm5
+** vmovdqu -128\(%rax,%rdx\), %ymm4
+**.L8:
+** vmovdqu \(%rax\), %ymm3
+** addq \$-128, %rsi
+** subq \$-128, %rcx
+** subq \$-128, %rax
+** vmovdqu -96\(%rax\), %ymm2
+** vmovdqu -64\(%rax\), %ymm1
+** vmovdqu -32\(%rax\), %ymm0
+** vmovdqu %ymm3, -128\(%rcx\)
+** vmovdqu %ymm2, -96\(%rcx\)
+** vmovdqu %ymm1, -64\(%rcx\)
+** vmovdqu %ymm0, -32\(%rcx\)
+** cmpq \$128, %rsi
+** ja .L8
+** vmovdqu %ymm7, -32\(%rdi,%rdx\)
+** vmovdqu %ymm6, -64\(%rdi,%rdx\)
+** vmovdqu %ymm5, -96\(%rdi,%rdx\)
+** vmovdqu %ymm4, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** vmovdqu \(%rax\), %ymm3
+** vmovdqu 32\(%rax\), %ymm2
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu 64\(%rax\), %ymm1
+** vmovdqu 96\(%rax\), %ymm0
+** addq %rdx, %rax
+**.L9:
+** vmovdqu -32\(%rax\), %ymm7
+** vmovdqu -64\(%rax\), %ymm6
+** addq \$-128, %rsi
+** addq \$-128, %rcx
+** vmovdqu -96\(%rax\), %ymm5
+** vmovdqu -128\(%rax\), %ymm4
+** addq \$-128, %rax
+** vmovdqu %ymm7, 96\(%rcx\)
+** vmovdqu %ymm6, 64\(%rcx\)
+** vmovdqu %ymm5, 32\(%rcx\)
+** vmovdqu %ymm4, \(%rcx\)
+** cmpq \$128, %rsi
+** ja .L9
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, 64\(%rdi\)
+** vmovdqu %ymm0, 96\(%rdi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L16:
+** vmovdqu \(%rsi\), %ymm7
+** vmovdqu 32\(%rsi\), %ymm6
+** vmovdqu 64\(%rsi\), %ymm5
+** vmovdqu 96\(%rsi\), %ymm4
+** vmovdqu -32\(%rsi,%rdx\), %ymm3
+** vmovdqu -64\(%rsi,%rdx\), %ymm2
+** vmovdqu -96\(%rsi,%rdx\), %ymm1
+** vmovdqu -128\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm7, \(%rdi\)
+** vmovdqu %ymm6, 32\(%rdi\)
+** vmovdqu %ymm5, 64\(%rdi\)
+** vmovdqu %ymm4, 96\(%rdi\)
+** vmovdqu %ymm3, -32\(%rdi,%rdx\)
+** vmovdqu %ymm2, -64\(%rdi,%rdx\)
+** vmovdqu %ymm1, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_ymm
+#include "builtin-memmove-4a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_zmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$32, %rdx
+** ja .L16
+**.L14:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L16:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$64, %rdx
+** jb .L6
+** cmpq \$128, %rdx
+** ja .L5
+** vmovdqu64 \(%rsi\), %zmm1
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm1, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** vmovdqu \(%rsi\), %ymm1
+** vmovdqu -32\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm1, \(%rdi\)
+** vmovdqu %ymm0, -32\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** cmpq \$512, %rdx
+** jbe .L17
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L9
+** je .L14
+** vmovdqu64 -64\(%rax,%rdx\), %zmm7
+** vmovdqu64 -128\(%rax,%rdx\), %zmm6
+** vmovdqu64 -192\(%rax,%rdx\), %zmm5
+** vmovdqu64 -256\(%rax,%rdx\), %zmm4
+**.L10:
+** vmovdqu64 \(%rax\), %zmm3
+** addq \$256, %rax
+** vmovdqu64 -192\(%rax\), %zmm2
+** subq \$256, %rsi
+** vmovdqu64 -128\(%rax\), %zmm1
+** vmovdqu64 -64\(%rax\), %zmm0
+** addq \$256, %rcx
+** vmovdqu64 %zmm3, -256\(%rcx\)
+** vmovdqu64 %zmm2, -192\(%rcx\)
+** vmovdqu64 %zmm1, -128\(%rcx\)
+** vmovdqu64 %zmm0, -64\(%rcx\)
+** cmpq \$256, %rsi
+** ja .L10
+** vmovdqu64 %zmm7, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm6, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm5, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm4, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L17:
+** cmpq \$256, %rdx
+** jb .L8
+** vmovdqu64 \(%rsi\), %zmm7
+** vmovdqu64 64\(%rsi\), %zmm6
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm2
+** vmovdqu64 128\(%rsi\), %zmm5
+** vmovdqu64 192\(%rsi\), %zmm4
+** vmovdqu64 -192\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -256\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm7, \(%rdi\)
+** vmovdqu64 %zmm6, 64\(%rdi\)
+** vmovdqu64 %zmm5, 128\(%rdi\)
+** vmovdqu64 %zmm4, 192\(%rdi\)
+** vmovdqu64 %zmm3, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm2, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm1, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** vmovdqu64 \(%rsi\), %zmm3
+** vmovdqu64 64\(%rsi\), %zmm2
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** vmovdqu64 \(%rax\), %zmm3
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu64 64\(%rax\), %zmm2
+** vmovdqu64 128\(%rax\), %zmm1
+** vmovdqu64 192\(%rax\), %zmm0
+** addq %rdx, %rax
+**.L11:
+** vmovdqu64 -64\(%rax\), %zmm7
+** subq \$256, %rax
+** vmovdqu64 128\(%rax\), %zmm6
+** subq \$256, %rsi
+** vmovdqu64 64\(%rax\), %zmm5
+** vmovdqu64 \(%rax\), %zmm4
+** subq \$256, %rcx
+** vmovdqu64 %zmm7, 192\(%rcx\)
+** vmovdqu64 %zmm6, 128\(%rcx\)
+** vmovdqu64 %zmm5, 64\(%rcx\)
+** vmovdqu64 %zmm4, \(%rcx\)
+** cmpq \$256, %rsi
+** ja .L11
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, 128\(%rdi\)
+** vmovdqu64 %zmm0, 192\(%rdi\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_zmm
+#include "builtin-memmove-4a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_xmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$67, %rdx
+** ja .L12
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$128, %rdx
+** jbe .L13
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L6
+** je .L1
+** movdqu -16\(%rax,%rdx\), %xmm7
+** movdqu -32\(%rax,%rdx\), %xmm6
+** movdqu -48\(%rax,%rdx\), %xmm5
+** movdqu -64\(%rax,%rdx\), %xmm4
+**.L7:
+** movdqu \(%rax\), %xmm3
+** subq \$64, %rsi
+** addq \$64, %rcx
+** addq \$64, %rax
+** movdqu -48\(%rax\), %xmm2
+** movdqu -32\(%rax\), %xmm1
+** movdqu -16\(%rax\), %xmm0
+** movups %xmm3, -64\(%rcx\)
+** movups %xmm2, -48\(%rcx\)
+** movups %xmm1, -32\(%rcx\)
+** movups %xmm0, -16\(%rcx\)
+** cmpq \$64, %rsi
+** ja .L7
+** movups %xmm7, -16\(%rdi,%rdx\)
+** movups %xmm6, -32\(%rdi,%rdx\)
+** movups %xmm5, -48\(%rdi,%rdx\)
+** movups %xmm4, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** movdqu \(%rsi\), %xmm7
+** movdqu 16\(%rsi\), %xmm6
+** movdqu 32\(%rsi\), %xmm5
+** movdqu 48\(%rsi\), %xmm4
+** movdqu -16\(%rsi,%rdx\), %xmm3
+** movdqu -32\(%rsi,%rdx\), %xmm2
+** movdqu -48\(%rsi,%rdx\), %xmm1
+** movdqu -64\(%rsi,%rdx\), %xmm0
+** movups %xmm7, \(%rdi\)
+** movups %xmm6, 16\(%rdi\)
+** movups %xmm5, 32\(%rdi\)
+** movups %xmm4, 48\(%rdi\)
+** movups %xmm3, -16\(%rdi,%rdx\)
+** movups %xmm2, -32\(%rdi,%rdx\)
+** movups %xmm1, -48\(%rdi,%rdx\)
+** movups %xmm0, -64\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movdqu \(%rax\), %xmm3
+** movdqu 16\(%rax\), %xmm2
+** leaq \(%rdi,%rdx\), %rcx
+** movdqu 32\(%rax\), %xmm1
+** movdqu 48\(%rax\), %xmm0
+** addq %rdx, %rax
+**.L8:
+** movdqu -16\(%rax\), %xmm7
+** movdqu -32\(%rax\), %xmm6
+** subq \$64, %rsi
+** subq \$64, %rcx
+** movdqu -48\(%rax\), %xmm5
+** movdqu -64\(%rax\), %xmm4
+** subq \$64, %rax
+** movups %xmm7, 48\(%rcx\)
+** movups %xmm6, 32\(%rcx\)
+** movups %xmm5, 16\(%rcx\)
+** movups %xmm4, \(%rcx\)
+** cmpq \$64, %rsi
+** ja .L8
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, 32\(%rdi\)
+** movups %xmm0, 48\(%rdi\)
+** ret
+** .cfi_endproc
+**...
+*/
+
+#ifndef gcc_memmove
+#define gcc_memmove gcc_memmove_xmm
+#endif
+
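+/* The lower bound of 68 bytes is not a power of two; sizes up to 67 just
+   return, and only the wider overlapping-move cases and the copy loops
+   remain in the inline expansion.  */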
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n > 67)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -march=x86-64-v3 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_ymm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$67, %rdx
+** ja .L14
+**.L12:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$256, %rdx
+** jbe .L15
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L7
+** je .L12
+** vmovdqu -32\(%rax,%rdx\), %ymm7
+** vmovdqu -64\(%rax,%rdx\), %ymm6
+** vmovdqu -96\(%rax,%rdx\), %ymm5
+** vmovdqu -128\(%rax,%rdx\), %ymm4
+**.L8:
+** vmovdqu \(%rax\), %ymm3
+** addq \$-128, %rsi
+** subq \$-128, %rcx
+** subq \$-128, %rax
+** vmovdqu -96\(%rax\), %ymm2
+** vmovdqu -64\(%rax\), %ymm1
+** vmovdqu -32\(%rax\), %ymm0
+** vmovdqu %ymm3, -128\(%rcx\)
+** vmovdqu %ymm2, -96\(%rcx\)
+** vmovdqu %ymm1, -64\(%rcx\)
+** vmovdqu %ymm0, -32\(%rcx\)
+** cmpq \$128, %rsi
+** ja .L8
+** vmovdqu %ymm7, -32\(%rdi,%rdx\)
+** vmovdqu %ymm6, -64\(%rdi,%rdx\)
+** vmovdqu %ymm5, -96\(%rdi,%rdx\)
+** vmovdqu %ymm4, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L15:
+** cmpq \$128, %rdx
+** jb .L6
+** vmovdqu \(%rsi\), %ymm7
+** vmovdqu 32\(%rsi\), %ymm6
+** vmovdqu 64\(%rsi\), %ymm5
+** vmovdqu 96\(%rsi\), %ymm4
+** vmovdqu -32\(%rsi,%rdx\), %ymm3
+** vmovdqu -64\(%rsi,%rdx\), %ymm2
+** vmovdqu -96\(%rsi,%rdx\), %ymm1
+** vmovdqu -128\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm7, \(%rdi\)
+** vmovdqu %ymm6, 32\(%rdi\)
+** vmovdqu %ymm5, 64\(%rdi\)
+** vmovdqu %ymm4, 96\(%rdi\)
+** vmovdqu %ymm3, -32\(%rdi,%rdx\)
+** vmovdqu %ymm2, -64\(%rdi,%rdx\)
+** vmovdqu %ymm1, -96\(%rdi,%rdx\)
+** vmovdqu %ymm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** vmovdqu \(%rsi\), %ymm3
+** vmovdqu 32\(%rsi\), %ymm2
+** vmovdqu -32\(%rsi,%rdx\), %ymm1
+** vmovdqu -64\(%rsi,%rdx\), %ymm0
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, -32\(%rdi,%rdx\)
+** vmovdqu %ymm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** vmovdqu \(%rax\), %ymm3
+** vmovdqu 32\(%rax\), %ymm2
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu 64\(%rax\), %ymm1
+** vmovdqu 96\(%rax\), %ymm0
+** addq %rdx, %rax
+**.L9:
+** vmovdqu -32\(%rax\), %ymm7
+** vmovdqu -64\(%rax\), %ymm6
+** addq \$-128, %rsi
+** addq \$-128, %rcx
+** vmovdqu -96\(%rax\), %ymm5
+** vmovdqu -128\(%rax\), %ymm4
+** addq \$-128, %rax
+** vmovdqu %ymm7, 96\(%rcx\)
+** vmovdqu %ymm6, 64\(%rcx\)
+** vmovdqu %ymm5, 32\(%rcx\)
+** vmovdqu %ymm4, \(%rcx\)
+** cmpq \$128, %rsi
+** ja .L9
+** vmovdqu %ymm3, \(%rdi\)
+** vmovdqu %ymm2, 32\(%rdi\)
+** vmovdqu %ymm1, 64\(%rdi\)
+** vmovdqu %ymm0, 96\(%rdi\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_ymm
+#include "builtin-memmove-5a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmove-max=512 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove_zmm:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$67, %rdx
+** ja .L14
+**.L12:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L14:
+** movq %rdi, %rcx
+** movq %rsi, %rax
+** cmpq \$128, %rdx
+** jbe .L15
+** cmpq \$512, %rdx
+** ja .L5
+** cmpq \$256, %rdx
+** jnb .L16
+** vmovdqu64 \(%rsi\), %zmm3
+** vmovdqu64 64\(%rsi\), %zmm2
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -128\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L15:
+** vmovdqu64 \(%rsi\), %zmm1
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm1, \(%rdi\)
+** vmovdqu64 %zmm0, -64\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movq %rdx, %rsi
+** cmpq %rdi, %rax
+** jb .L7
+** je .L12
+** vmovdqu64 -64\(%rax,%rdx\), %zmm7
+** vmovdqu64 -128\(%rax,%rdx\), %zmm6
+** vmovdqu64 -192\(%rax,%rdx\), %zmm5
+** vmovdqu64 -256\(%rax,%rdx\), %zmm4
+**.L8:
+** vmovdqu64 \(%rax\), %zmm3
+** addq \$256, %rax
+** vmovdqu64 -192\(%rax\), %zmm2
+** subq \$256, %rsi
+** vmovdqu64 -128\(%rax\), %zmm1
+** vmovdqu64 -64\(%rax\), %zmm0
+** addq \$256, %rcx
+** vmovdqu64 %zmm3, -256\(%rcx\)
+** vmovdqu64 %zmm2, -192\(%rcx\)
+** vmovdqu64 %zmm1, -128\(%rcx\)
+** vmovdqu64 %zmm0, -64\(%rcx\)
+** cmpq \$256, %rsi
+** ja .L8
+** vmovdqu64 %zmm7, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm6, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm5, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm4, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** vmovdqu64 \(%rax\), %zmm3
+** leaq \(%rdi,%rdx\), %rcx
+** vmovdqu64 64\(%rax\), %zmm2
+** vmovdqu64 128\(%rax\), %zmm1
+** vmovdqu64 192\(%rax\), %zmm0
+** addq %rdx, %rax
+**.L9:
+** vmovdqu64 -64\(%rax\), %zmm7
+** subq \$256, %rax
+** vmovdqu64 128\(%rax\), %zmm6
+** subq \$256, %rsi
+** vmovdqu64 64\(%rax\), %zmm5
+** vmovdqu64 \(%rax\), %zmm4
+** subq \$256, %rcx
+** vmovdqu64 %zmm7, 192\(%rcx\)
+** vmovdqu64 %zmm6, 128\(%rcx\)
+** vmovdqu64 %zmm5, 64\(%rcx\)
+** vmovdqu64 %zmm4, \(%rcx\)
+** cmpq \$256, %rsi
+** ja .L9
+** vmovdqu64 %zmm3, \(%rdi\)
+** vmovdqu64 %zmm2, 64\(%rdi\)
+** vmovdqu64 %zmm1, 128\(%rdi\)
+** vmovdqu64 %zmm0, 192\(%rdi\)
+** vzeroupper
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L16:
+** vmovdqu64 \(%rsi\), %zmm7
+** vmovdqu64 64\(%rsi\), %zmm6
+** vmovdqu64 -64\(%rsi,%rdx\), %zmm3
+** vmovdqu64 -128\(%rsi,%rdx\), %zmm2
+** vmovdqu64 128\(%rsi\), %zmm5
+** vmovdqu64 192\(%rsi\), %zmm4
+** vmovdqu64 -192\(%rsi,%rdx\), %zmm1
+** vmovdqu64 -256\(%rsi,%rdx\), %zmm0
+** vmovdqu64 %zmm7, \(%rdi\)
+** vmovdqu64 %zmm6, 64\(%rdi\)
+** vmovdqu64 %zmm5, 128\(%rdi\)
+** vmovdqu64 %zmm4, 192\(%rdi\)
+** vmovdqu64 %zmm3, -64\(%rdi,%rdx\)
+** vmovdqu64 %zmm2, -128\(%rdi,%rdx\)
+** vmovdqu64 %zmm1, -192\(%rdi,%rdx\)
+** vmovdqu64 %zmm0, -256\(%rdi,%rdx\)
+** vzeroupper
+** ret
+** .cfi_endproc
+**...
+*/
+
+#define gcc_memmove gcc_memmove_zmm
+#include "builtin-memmove-5a.c"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$7, %rdx
+** jbe .L8
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** cmpl \$4, %edx
+** jnb .L9
+** cmpl \$1, %edx
+** ja .L5
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** movl %edx, %edx
+** movl \(%rsi\), %ecx
+** movl -4\(%rsi,%rdx\), %eax
+** movl %ecx, \(%rdi\)
+** movl %eax, -4\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
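+/* Only sizes below 8 bytes reach __builtin_memmove, so the expansion is
+   limited to the byte, word, and dword overlapping moves.  */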
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n < 8)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$3, %rdx
+** jbe .L7
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** cmpl \$2, %edx
+** jnb .L8
+** cmpl \$1, %edx
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
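+/* Sizes below 4 bytes need only the byte and word overlapping moves.  */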
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n < 4)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$33, %rdx
+** jbe .L12
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L12:
+** cmpl \$16, %edx
+** jnb .L13
+** cmpl \$8, %edx
+** jnb .L6
+** cmpl \$4, %edx
+** jnb .L7
+** cmpl \$1, %edx
+** ja .L8
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L13:
+** cmpl \$32, %edx
+** ja .L5
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm1
+** movdqu -16\(%rsi,%rdx\), %xmm0
+** movups %xmm1, \(%rdi\)
+** movups %xmm0, -16\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movl %edx, %edx
+** movdqu \(%rsi\), %xmm3
+** movdqu 16\(%rsi\), %xmm2
+** addq %rdx, %rsi
+** movdqu -16\(%rsi\), %xmm1
+** movdqu -32\(%rsi\), %xmm0
+** movups %xmm3, \(%rdi\)
+** movups %xmm2, 16\(%rdi\)
+** movups %xmm1, -16\(%rdi,%rdx\)
+** movups %xmm0, -32\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movl %edx, %edx
+** movq \(%rsi\), %rcx
+** movq -8\(%rsi,%rdx\), %rax
+** movq %rcx, \(%rdi\)
+** movq %rax, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L7:
+** movl %edx, %edx
+** movl \(%rsi\), %ecx
+** movl -4\(%rsi,%rdx\), %eax
+** movl %ecx, \(%rdi\)
+** movl %eax, -4\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L8:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
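+/* An upper bound of 33 bytes exercises the scalar cases plus the two- and
+   four-XMM overlapping moves, without generating a copy loop.  */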
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n < 34)
+ __builtin_memmove (a, b, n);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */
+/* { dg-final { check-function-bodies "**" "" "" { target { lp64 } } {^\t?\.} } } */
+
+/*
+**gcc_memmove:
+**.LFB0:
+** .cfi_startproc
+** cmpq \$15, %rdx
+** jbe .L9
+**.L1:
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L9:
+** cmpl \$8, %edx
+** jnb .L10
+** cmpl \$4, %edx
+** jnb .L5
+** cmpl \$1, %edx
+** ja .L6
+** jb .L1
+** movzbl \(%rsi\), %eax
+** movb %al, \(%rdi\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L10:
+** movl %edx, %edx
+** movq \(%rsi\), %rcx
+** movq -8\(%rsi,%rdx\), %rax
+** movq %rcx, \(%rdi\)
+** movq %rax, -8\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L5:
+** movl %edx, %edx
+** movl \(%rsi\), %ecx
+** movl -4\(%rsi,%rdx\), %eax
+** movl %ecx, \(%rdi\)
+** movl %eax, -4\(%rdi,%rdx\)
+** ret
+** .p2align 4,,10
+** .p2align 3
+**.L6:
+** movl %edx, %edx
+** movzwl \(%rsi\), %ecx
+** movzwl -2\(%rsi,%rdx\), %eax
+** movw %cx, \(%rdi\)
+** movw %ax, -2\(%rdi,%rdx\)
+** ret
+** .cfi_endproc
+**...
+*/
+
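+/* Sizes below 16 bytes use only scalar overlapping moves, up to a pair of
+   8-byte loads and stores.  */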
+void
+gcc_memmove (void *a, void *b, __SIZE_TYPE__ n)
+{
+ if (n < 16)
+ __builtin_memmove (a, b, n);
+}