LoongArch: Improve cpymemsi expansion [PR109465]

author Xi Ruoyao <xry111@xry111.site>

Wed, 12 Apr 2023 11:45:48 +0000 (11:45 +0000)

committer Xi Ruoyao <xry111@xry111.site>

Wed, 19 Apr 2023 10:13:50 +0000 (18:13 +0800)
author Xi Ruoyao <xry111@xry111.site>
Wed, 12 Apr 2023 11:45:48 +0000 (11:45 +0000)
committer Xi Ruoyao <xry111@xry111.site>
Wed, 19 Apr 2023 10:13:50 +0000 (18:13 +0800)
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h

index 83df489c7a5f4d14b498205aaf5ec6e40b5d4a72..b71b188507a6661580cb3988d7fc6138ae823587 100644 (file)
--- a/gcc/config/loongarch/loongarch-protos.h
+++ b/gcc/config/loongarch/loongarch-protos.h
@@ -95,7 +95,7 @@ extern void loongarch_expand_conditional_trap (rtx);
  #endif
  extern void loongarch_set_return_address (rtx, rtx);
  extern bool loongarch_move_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int);
-extern bool loongarch_expand_block_move (rtx, rtx, rtx);
+extern bool loongarch_expand_block_move (rtx, rtx, rtx, rtx);
  extern bool loongarch_do_optimize_block_move_p (void);
  
  extern bool loongarch_expand_ext_as_unaligned_load (rtx, rtx, HOST_WIDE_INT,
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc

index dfb731fca9d3ecf5209bb20cb995a5ee366c96b4..d808cb3a5aec5ec97044cb2daee221d52a25f54f 100644 (file)
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -4459,41 +4459,46 @@ loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
     Assume that the areas do not overlap.  */
  
  static void
-loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length)
+loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length,
+                              HOST_WIDE_INT delta)
  {
-  HOST_WIDE_INT offset, delta;
-  unsigned HOST_WIDE_INT bits;
+  HOST_WIDE_INT offs, delta_cur;
    int i;
    machine_mode mode;
    rtx *regs;
  
-  bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest)));
-
-  mode = int_mode_for_size (bits, 0).require ();
-  delta = bits / BITS_PER_UNIT;
+  /* Calculate how many registers we'll need for the block move.
+     We'll emit length / delta move operations with delta as the size
+     first.  Then we may still have length % delta bytes not copied.
+     We handle these remaining bytes by move operations with smaller
+     (halfed) sizes.  For example, if length = 21 and delta = 8, we'll
+     emit two ld.d/st.d pairs, one ld.w/st.w pair, and one ld.b/st.b
+     pair.  For each load/store pair we use a dedicated register to keep
+     the pipeline as populated as possible.  */
+  HOST_WIDE_INT num_reg = length / delta;
+  for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2)
+    num_reg += !!(length & delta_cur);
  
    /* Allocate a buffer for the temporary registers.  */
-  regs = XALLOCAVEC (rtx, length / delta);
+  regs = XALLOCAVEC (rtx, num_reg);
  
-  /* Load as many BITS-sized chunks as possible.  Use a normal load if
-     the source has enough alignment, otherwise use left/right pairs.  */
-  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
+  for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
      {
-      regs[i] = gen_reg_rtx (mode);
-      loongarch_emit_move (regs[i], adjust_address (src, mode, offset));
-    }
+      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
  
-  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
-    loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]);
+      for (; offs + delta_cur <= length; offs += delta_cur, i++)
+       {
+         regs[i] = gen_reg_rtx (mode);
+         loongarch_emit_move (regs[i], adjust_address (src, mode, offs));
+       }
+    }
  
-  /* Mop up any left-over bytes.  */
-  if (offset < length)
+  for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
      {
-      src = adjust_address (src, BLKmode, offset);
-      dest = adjust_address (dest, BLKmode, offset);
-      move_by_pieces (dest, src, length - offset,
-                     MIN (MEM_ALIGN (src), MEM_ALIGN (dest)),
-                     (enum memop_ret) 0);
+      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
+
+      for (; offs + delta_cur <= length; offs += delta_cur, i++)
+       loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]);
      }
  }
  
@@ -4523,10 +4528,11 @@ loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
  
  static void
  loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
-                          HOST_WIDE_INT bytes_per_iter)
+                          HOST_WIDE_INT align)
  {
    rtx_code_label *label;
    rtx src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT bytes_per_iter = align * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER;
    HOST_WIDE_INT leftover;
  
    leftover = length % bytes_per_iter;
@@ -4546,7 +4552,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
    emit_label (label);
  
    /* Emit the loop body.  */
-  loongarch_block_move_straight (dest, src, bytes_per_iter);
+  loongarch_block_move_straight (dest, src, bytes_per_iter, align);
  
    /* Move on to the next block.  */
    loongarch_emit_move (src_reg,
@@ -4563,7 +4569,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
  
    /* Mop up any left-over bytes.  */
    if (leftover)
-    loongarch_block_move_straight (dest, src, leftover);
+    loongarch_block_move_straight (dest, src, leftover, align);
    else
      /* Temporary fix for PR79150.  */
      emit_insn (gen_nop ());
@@ -4573,25 +4579,32 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length,
     memory reference SRC to memory reference DEST.  */
  
  bool
-loongarch_expand_block_move (rtx dest, rtx src, rtx length)
+loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align)
  {
-  int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT;
+  if (!CONST_INT_P (r_length))
+    return false;
+
+  HOST_WIDE_INT length = INTVAL (r_length);
+  if (length > loongarch_max_inline_memcpy_size)
+    return false;
+
+  HOST_WIDE_INT align = INTVAL (r_align);
+
+  if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD)
+    align = UNITS_PER_WORD;
  
-  if (CONST_INT_P (length)
-      && INTVAL (length) <= loongarch_max_inline_memcpy_size)
+  if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT)
      {
-      if (INTVAL (length) <= max_move_bytes)
-       {
-         loongarch_block_move_straight (dest, src, INTVAL (length));
-         return true;
-       }
-      else if (optimize)
-       {
-         loongarch_block_move_loop (dest, src, INTVAL (length),
-                                    LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER);
-         return true;
-       }
+      loongarch_block_move_straight (dest, src, length, align);
+      return true;
+    }
+
+  if (optimize)
+    {
+      loongarch_block_move_loop (dest, src, length, align);
+      return true;
      }
+
    return false;
  }
  
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h

index 277facbd98cfcb638e70f6b2a680eb2486a8b33e..a9eff6a81bd4473187356db65dd0fa38e34bbb62 100644 (file)
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1062,13 +1062,13 @@ typedef struct {
  
  /* The maximum number of bytes that can be copied by one iteration of
     a cpymemsi loop; see loongarch_block_move_loop.  */
-#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4)
+#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4
  
  /* The maximum number of bytes that can be copied by a straight-line
     implementation of cpymemsi; see loongarch_block_move_straight.  We want
     to make sure that any loop-based implementation will iterate at
     least twice.  */
-#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2)
+#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
  
  /* The base cost of a memcpy call, for MOVE_RATIO and friends.  These
     values were determined experimentally by benchmarking with CSiBE.
@@ -1076,7 +1076,7 @@ typedef struct {
  #define LARCH_CALL_RATIO 8
  
  /* Any loop-based implementation of cpymemsi will have at least
-   LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory
+   LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory
     moves, so allow individual copies of fewer elements.
  
     When cpymemsi is not available, use a value approximating
@@ -1087,9 +1087,7 @@ typedef struct {
     value of LARCH_CALL_RATIO to take that into account.  */
  
  #define MOVE_RATIO(speed) \
-  (HAVE_cpymemsi \
-   ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \
-   : CLEAR_RATIO (speed) / 2)
+  (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2)
  
  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
     of the length of a memset call, but use the default otherwise.  */
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md

index 628ecc7808893c58a8a503daa6ca2097b47f3139..816a943d155b54146af916889c349067705055b4 100644 (file)
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -2488,7 +2488,8 @@
    ""
  {
    if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P
-      && loongarch_expand_block_move (operands[0], operands[1], operands[2]))
+      && loongarch_expand_block_move (operands[0], operands[1],
+                                     operands[2], operands[3]))
      DONE;
    else
      FAIL;
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c

new file mode 100644 (file)

index 0000000..4cd35d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 15); }
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c

new file mode 100644 (file)

index 0000000..703eb95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+extern long a[], b[];
+void test() { __builtin_memcpy(a, b, 15); }
diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c

new file mode 100644 (file)

index 0000000..d6a8065
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */
+
+/* Three loop iterations each contains 4 st.b, and 3 st.b after the loop */
+/* { dg-final { scan-assembler-times "st\\.b" 7 } } */
+
+/* { dg-final { scan-assembler-not "st\\.h" } } */
+/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */
+/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 15); }
author	Xi Ruoyao <xry111@xry111.site>
	Wed, 12 Apr 2023 11:45:48 +0000 (11:45 +0000)
committer	Xi Ruoyao <xry111@xry111.site>
	Wed, 19 Apr 2023 10:13:50 +0000 (18:13 +0800)
gcc/config/loongarch/loongarch-protos.h		patch \| blob \| blame \| history
gcc/config/loongarch/loongarch.cc		patch \| blob \| blame \| history
gcc/config/loongarch/loongarch.h		patch \| blob \| blame \| history
gcc/config/loongarch/loongarch.md		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/loongarch/pr109465-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/loongarch/pr109465-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/loongarch/pr109465-3.c	[new file with mode: 0644]	patch \| blob