AArch64: Fix strict-align cpymem/setmem [PR103100]

author Wilco Dijkstra <wilco.dijkstra@arm.com>

Wed, 25 Oct 2023 15:28:04 +0000 (16:28 +0100)

committer Wilco Dijkstra <wilco.dijkstra@arm.com>

Thu, 30 Nov 2023 13:27:11 +0000 (13:27 +0000)
author Wilco Dijkstra <wilco.dijkstra@arm.com>
Wed, 25 Oct 2023 15:28:04 +0000 (16:28 +0100)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
Thu, 30 Nov 2023 13:27:11 +0000 (13:27 +0000)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 64684258b7bf4a95e8092b06e10eb43208850c3d..bc126cc60fc3cf9556b0f83e725f27df08f4ae7f 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23105,27 +23105,23 @@ aarch64_expand_cpymem (rtx *operands)
    int mode_bits;
    rtx dst = operands[0];
    rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
    rtx base;
    machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
  
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
      return aarch64_expand_cpymem_mops (operands);
  
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
  
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
  
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
      return aarch64_expand_cpymem_mops (operands);
  
    int copy_bits = 256;
@@ -23289,12 +23285,13 @@ aarch64_expand_setmem (rtx *operands)
    unsigned HOST_WIDE_INT len;
    rtx dst = operands[0];
    rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
    rtx base;
    machine_mode cur_mode = BLKmode, next_mode;
  
-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
      return aarch64_expand_setmem_mops (operands);
  
    bool size_p = optimize_function_for_size_p (cfun);
@@ -23302,10 +23299,13 @@ aarch64_expand_setmem (rtx *operands)
    /* Default the maximum to 256-bytes when considering only libcall vs
       SIMD broadcast sequence.  */
    unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
  
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);
  
    int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
    /* The MOPS sequence takes:
@@ -23318,12 +23318,6 @@ aarch64_expand_setmem (rtx *operands)
       the arguments + 1 for the call.  */
    unsigned libcall_cost = 4;
  
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
    /* Attempt a sequence with a vector broadcast followed by stores.
       Count the number of operations involved to see if it's worth it
       against the alternatives.  A simple counter simd_ops on the
@@ -23365,10 +23359,8 @@ aarch64_expand_setmem (rtx *operands)
        simd_ops++;
        n -= mode_bits;
  
-      /* Do certain trailing copies as overlapping if it's going to be
-        cheaper.  i.e. less instructions to do so.  For instance doing a 15
-        byte copy it's more efficient to do two overlapping 8 byte copies than
-        8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
        if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
         {
           next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md

index 7be1de38b1c3c19d037ca2b3722812e92704bda9..4a3af6df7e7caf1fab9483239ce41845a92e23b7 100644 (file)
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1630,7 +1630,7 @@
     (match_operand:BLK 1 "memory_operand")
     (match_operand:DI 2 "general_operand")
     (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
  {
    if (aarch64_expand_cpymem (operands))
      DONE;
@@ -1727,7 +1727,7 @@
          (match_operand:QI  2 "nonmemory_operand")) ;; Value
     (use (match_operand:DI  1 "general_operand")) ;; Length
     (match_operand          3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
   {
    if (aarch64_expand_setmem (operands))
      DONE;
author	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Wed, 25 Oct 2023 15:28:04 +0000 (16:28 +0100)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Thu, 30 Nov 2023 13:27:11 +0000 (13:27 +0000)
gcc/config/aarch64/aarch64.cc		patch | blob | blame | history
gcc/config/aarch64/aarch64.md		patch | blob | blame | history