vect: Extend peeling and versioning for alignment to VLA modes
author     Pengfei Li <Pengfei.Li2@arm.com>
           Thu, 7 Aug 2025 11:08:35 +0000 (11:08 +0000)
committer  Pengfei Li <Pengfei.Li2@arm.com>
           Thu, 7 Aug 2025 11:10:10 +0000 (11:10 +0000)
This patch extends the support for peeling and versioning for alignment
from VLS modes to VLA modes. The key change is allowing the DR target
alignment to be set to a non-constant poly_int. Since the value must be
a power of two, the power-of-two check is deferred to runtime through
loop versioning when the VF is variable. The vectorizable check for speculative
loads is also refactored in this patch to handle both constant and
variable target alignment values.
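
As a rough illustration, here is a minimal sketch (not GCC code; all names
are illustrative) of how the target alignment splits into a compile-time
constant factor and the runtime VF:

#include <stdbool.h>
#include <stdint.h>

/* The target alignment is elem_size * group_size * VF.  When the VF is
   variable, only the constant factor (elem_size * group_size) can be
   checked for being a power of two at compile time; "VF is a power of
   two" is left to a runtime check added by loop versioning.  */
static bool
const_align_factor_is_pow2 (uint64_t elem_size, uint64_t group_size)
{
  uint64_t c = elem_size * group_size;
  return c != 0 && (c & (c - 1)) == 0;
}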

Additional changes for VLA modes include:

1) Peeling

In VLA modes, we use peeling with masking: a partial vector is used in
the first iteration of the vectorized loop so that DRs are aligned in
subsequent iterations. Peeling with masking was already enabled for VLS
modes to avoid scalar peeling. This patch reuses most of the existing
logic, fixes an incorrect IV offset in the VLA code path, and removes a
power-of-two rounding when computing the number of iterations to peel,
since a power-of-two VF is now guaranteed by a new runtime check.
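
For reference, a minimal sketch (not GCC code; assumes a power-of-two
target alignment and an element-aligned address) of how the number of
active lanes in the first, partial iteration can be derived:

#include <stddef.h>
#include <stdint.h>

/* Elements to process in the first (masked) iteration so that the next
   iteration starts on a TARGET_ALIGN boundary.  A full iteration of VF
   lanes is used when the address is already aligned.  */
static size_t
first_iter_active_lanes (const void *addr, size_t elem_size,
                         size_t target_align, size_t vf)
{
  size_t misalign_elems
    = ((-(uintptr_t) addr) & (target_align - 1)) / elem_size;
  return misalign_elems ? misalign_elems : vf;
}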

2) Versioning

The type of the mask for the runtime alignment check is updated to
poly_int to support variable VFs. After this change, both standalone
versioning and peeling with versioning are available in VLA modes. This
patch also introduces another runtime check on the speculative read
amount, to ensure that all speculative loads remain within the current
valid memory page. We plan to remove these runtime checks in the future
by introducing a capped VF, i.e. using partial vectors to limit the
actual VF value at runtime.
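
A minimal sketch (not GCC code; names are illustrative) of the combined
runtime condition that selects the vectorized version: the existing
alignment check with a poly-valued mask, plus the new checks that the
maximum speculative read amount is a power of two and fits within the
minimum page size (standing in for GCC's --param min-pagesize):

#include <stdbool.h>
#include <stdint.h>

static bool
take_vector_version (uintptr_t dr_addr, uint64_t align_mask,
                     uint64_t max_spec_read, uint64_t min_pagesize)
{
  bool aligned = (dr_addr & align_mask) == 0;                 /* alignment */
  bool vf_pow2 = (max_spec_read & (max_spec_read - 1)) == 0;  /* power of two */
  bool in_page = max_spec_read <= min_pagesize;               /* page bound */
  return aligned && vf_pow2 && in_page;
}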

3) Speculative read flag

With a variable target alignment, DRs whose scalar accesses are known
to be in bounds would otherwise be treated as unsupported unaligned
accesses. In fact, speculative reads can be avoided naturally for
in-bounds DRs as long as partial vectors are used. Therefore, this
patch clears the speculative-read flag and sets the "must use partial
vectors" flag for these cases.
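
A minimal sketch (not GCC code; struct and parameter names are
illustrative) of the decision described above:

#include <stdbool.h>

struct dr_flags
{
  bool safe_spec_read_required;   /* DR needs speculative-read alignment.  */
  bool scalar_known_in_bounds;    /* Scalar accesses known to be in bounds.  */
};

/* With a variable VF, an in-bounds DR does not need speculative reads as
   long as partial vectors are used, so relax the flag and require partial
   vectors instead.  */
static void
relax_in_bounds_dr (struct dr_flags *dr, bool vf_is_constant,
                    bool *must_use_partial_vectors)
{
  if (!vf_is_constant
      && dr->safe_spec_read_required
      && dr->scalar_known_in_bounds)
    {
      dr->safe_spec_read_required = false;
      *must_use_partial_vectors = true;
    }
}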

Bootstrapped and regression-tested on x86_64-linux-gnu,
arm-linux-gnueabihf and aarch64-linux-gnu with bootstrap-O3.

gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_compute_data_ref_alignment):
Allow DR target alignment to be a poly_int.
(vect_enhance_data_refs_alignment): Support peeling and
versioning for VLA modes.
* tree-vect-loop-manip.cc (get_misalign_in_elems): Remove
power-of-two rounding in peeling.
(vect_create_cond_for_align_checks): Update alignment check
logic for poly_int mask.
(vect_create_cond_for_vla_spec_read): New runtime checks.
(vect_loop_versioning): Support new runtime checks.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new
loop_vinfo field.
(vectorizable_induction): Fix wrong IV offset issue.
* tree-vect-stmts.cc (get_load_store_type): Refactor
vectorizable checks for speculative loads.
* tree-vectorizer.h (LOOP_VINFO_MAX_SPEC_READ_AMOUNT): New
macro for new runtime checks.
(LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ): Likewise.
(LOOP_REQUIRES_VERSIONING): Update macro for new runtime checks.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/peel_ind_11.c: New test.
* gcc.target/aarch64/sve/peel_ind_11_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_12.c: New test.
* gcc.target/aarch64/sve/peel_ind_12_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_13.c: New test.
* gcc.target/aarch64/sve/peel_ind_13_run.c: New test.

gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c [new file with mode: 0644]
gcc/tree-vect-data-refs.cc
gcc/tree-vect-loop-manip.cc
gcc/tree-vect-loop.cc
gcc/tree-vect-stmts.cc
gcc/tree-vectorizer.h

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
new file mode 100644 (file)
index 0000000..feb7ee7
--- /dev/null
@@ -0,0 +1,20 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 3
+#define END 510
+
+int __attribute__((noipa))
+foo (int *a) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != 0)
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
new file mode 100644 (file)
index 0000000..b4c267f
--- /dev/null
@@ -0,0 +1,27 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_11.c"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  for (int k = 5; k < 30; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one non-zero element for test.  */
+    for (int i = 5; i < 30; i++)
+      a[i] = (i == k ? 1 : 0);
+
+    int res = foo (a);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
new file mode 100644 (file)
index 0000000..260482a
--- /dev/null
@@ -0,0 +1,21 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 5
+#define END 509
+
+int __attribute__((noipa))
+foo (int *restrict a, int * restrict b) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != b[i])
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Both peeling and versioning will be applied" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
+/* { dg-final { scan-assembler {\teor\t.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
new file mode 100644 (file)
index 0000000..ba978fe
--- /dev/null
@@ -0,0 +1,29 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_12.c"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void) {
+  for (int k = 5; k < 50; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+    int *b = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one place of different values for test.  */
+    for (int i = 5; i < 50; i++) {
+      a[i] = (i == k ? 1 : 0);
+      b[i] = 0;
+    }
+
+    int res = foo (a, b);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
new file mode 100644 (file)
index 0000000..730e33e
--- /dev/null
@@ -0,0 +1,24 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 5
+#define END 509
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (signed int i = START; i < END; ++i)
+    {
+      if (x[i] == 0)
+        return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c
new file mode 100644 (file)
index 0000000..83352a8
--- /dev/null
@@ -0,0 +1,15 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_13.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index a9d4aaee7718b397d7e94a424256b7beb6f77dfb..a3d3b3e7f433ba88f821280801ca33441c3e5e11 100644 (file)
@@ -1448,17 +1448,20 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   if (loop_vinfo
       && dr_safe_speculative_read_required (stmt_info))
     {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      auto vectype_size
+      /* The required target alignment must be a power-of-2 value and is
+        computed as the product of vector element size, VF and group size.
+        We compute the constant part first as VF may be a variable.  For
+        variable VF, the power-of-2 check of VF is deferred to runtime.  */
+      auto align_factor_c
        = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-      poly_uint64 new_alignment = vf * vectype_size;
-      /* If we have a grouped access we require that the alignment be N * elem.  */
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-       new_alignment *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+       align_factor_c *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      poly_uint64 new_alignment = vf * align_factor_c;
 
-      unsigned HOST_WIDE_INT target_alignment;
-      if (new_alignment.is_constant (&target_alignment)
-         && pow2p_hwi (target_alignment))
+      if ((vf.is_constant () && pow2p_hwi (new_alignment.to_constant ()))
+         || (!vf.is_constant () && pow2p_hwi (align_factor_c)))
        {
          if (dump_enabled_p ())
            {
@@ -1467,7 +1470,7 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
              dump_dec (MSG_NOTE, new_alignment);
              dump_printf (MSG_NOTE, " bytes.\n");
            }
-         vector_alignment = target_alignment;
+         vector_alignment = new_alignment;
        }
     }
 
@@ -2438,6 +2441,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      - The cost of peeling (the extra runtime checks, the increase
        in code size).  */
 
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
@@ -2446,9 +2450,18 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
       stmt_vec_info stmt_info = dr_info->stmt;
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-      do_peeling
-       = vector_alignment_reachable_p (dr_info,
-                                       LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+
+      /* With variable VF, unsafe speculative read can be avoided for known
+        inbounds DRs as long as partial vectors are used.  */
+      if (!vf.is_constant ()
+         && dr_safe_speculative_read_required (stmt_info)
+         && DR_SCALAR_KNOWN_BOUNDS (dr_info))
+       {
+         dr_set_safe_speculative_read_required (stmt_info, false);
+         LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+       }
+
+      do_peeling = vector_alignment_reachable_p (dr_info, vf);
       if (do_peeling)
         {
          if (known_alignment_for_access_p (dr_info, vectype))
@@ -2488,7 +2501,6 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
              poly_uint64 nscalars = npeel_tmp;
               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                {
-                 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
                  unsigned group_size = 1;
                  if (STMT_SLP_TYPE (stmt_info)
                      && STMT_VINFO_GROUPED_ACCESS (stmt_info))
@@ -2911,14 +2923,12 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      2) there is at least one unsupported misaligned data ref with an unknown
         misalignment, and
      3) all misaligned data refs with a known misalignment are supported, and
-     4) the number of runtime alignment checks is within reason.
-     5) the vectorization factor is a constant.  */
+     4) the number of runtime alignment checks is within reason.  */
 
   do_versioning
     = (optimize_loop_nest_for_speed_p (loop)
        && !loop->inner /* FORNOW */
-       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP)
-       && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ();
+       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
 
   if (do_versioning)
     {
@@ -2965,25 +2975,22 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
                 ?? We could actually unroll the loop to achieve the required
                 overall step alignment, and forcing the alignment could be
                 done by doing some iterations of the non-vectorized loop.  */
-             if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-                              * DR_STEP_ALIGNMENT (dr),
+             if (!multiple_p (vf * DR_STEP_ALIGNMENT (dr),
                               DR_TARGET_ALIGNMENT (dr_info)))
                {
                  do_versioning = false;
                  break;
                }
 
-              /* The rightmost bits of an aligned address must be zeros.
-                 Construct the mask needed for this test.  For example,
-                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
-                 mask must be 15 = 0xf. */
-             gcc_assert (DR_TARGET_ALIGNMENT (dr_info).is_constant ());
-             int mask = DR_TARGET_ALIGNMENT (dr_info).to_constant () - 1;
+             /* Use "mask = DR_TARGET_ALIGNMENT - 1" to test rightmost address
+                bits for runtime alignment check.  For example, for 16 bytes
+                target alignment the mask is 15 = 0xf.  */
+             poly_uint64 mask = DR_TARGET_ALIGNMENT (dr_info) - 1;
 
              /* FORNOW: use the same mask to test all potentially unaligned
                 references in the loop.  */
-             if (LOOP_VINFO_PTR_MASK (loop_vinfo)
-                 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
+             if (maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), 0U)
+                 && maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), mask))
                {
                  do_versioning = false;
                  break;
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 6c1b26adda3b778ef1e1bdb0c1c0a34e0e89cae9..566308f4fe5dbe3309c17fb48ae1fb33ffb57572 100644 (file)
@@ -2454,10 +2454,7 @@ get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
   else
     {
       tree vla = build_int_cst (type, target_align);
-      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
-                                   fold_build2 (MINUS_EXPR, type,
-                                                build_int_cst (type, 0), vla));
-      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
+      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
                                          build_int_cst (type, 1));
     }
 
@@ -3840,7 +3837,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   const vec<stmt_vec_info> &may_misalign_stmts
     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
   stmt_vec_info stmt_info;
-  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
+  poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
   tree mask_cst;
   unsigned int i;
   tree int_ptrsize_type;
@@ -3852,9 +3849,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   tree ptrsize_zero;
   tree part_cond_expr;
 
-  /* Check that mask is one less than a power of 2, i.e., mask is
-     all zeros followed by all ones.  */
-  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
+  gcc_assert (known_ne (mask, 0U));
 
   int_ptrsize_type = signed_type_for (ptr_type_node);
 
@@ -3962,6 +3957,62 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   chain_cond_expr (cond_expr, part_cond_expr);
 }
 
+/* Function vect_create_cond_for_vla_spec_read.
+
+   Create a conditional expression that represents the run-time checks with
+   max speculative read amount in VLA modes.  We check two things:
+     1) if the max speculative read amount exceeds the min page size
+     2) if the VF is power-of-2 - done by checking the max read amount instead
+
+   Input:
+   COND_EXPR  - input conditional expression.  New conditions will be chained
+               with logical AND operation.
+   LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
+               possible speculative read amount in VLA modes.
+
+   Output:
+   COND_EXPR - conditional expression.
+
+   The returned COND_EXPR is the conditional expression to be used in the
+   if statement that controls which version of the loop gets executed at
+   runtime.  */
+
+static void
+vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+  poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
+  tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
+
+  /* Both the read amount and the VF must be non-constant, and the read amount must
+     be a constant power-of-2 multiple of the VF.  */
+  unsigned HOST_WIDE_INT multiple;
+  gcc_assert (!read_amount_poly.is_constant ()
+             && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+             && constant_multiple_p (read_amount_poly,
+                                     LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                                     &multiple)
+             && pow2p_hwi (multiple));
+
+  tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
+  tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
+  tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
+                                        (unsigned long) param_min_pagesize);
+
+  /* Create an expression of "amount & (amount - 1) == 0".  */
+  tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
+                               amount, cst_ul_one);
+  tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
+                                     amount, amount_m1);
+  tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+                                      amount_and_expr, cst_ul_zero);
+  chain_cond_expr (cond_expr, powof2_cond_expr);
+
+  /* Create an expression of "amount <= cst_ul_pagesize".  */
+  tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
+                                        amount, cst_ul_pagesize);
+  chain_cond_expr (cond_expr, pagesize_cond_expr);
+}
+
 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
@@ -4087,6 +4138,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   gimple_seq gimplify_stmt_list = NULL;
   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
+  bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
   poly_uint64 versioning_threshold
@@ -4145,6 +4197,9 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
                                       &cond_expr_stmt_list);
 
+  if (version_spec_read)
+    vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
+
   if (version_alias)
     {
       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 85f3e90c124c2265e3a8fa61888c1db0e7b47ac3..55a849561e3a0c864e53a5e4997736ca33f274ba 100644 (file)
@@ -1009,6 +1009,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
+    max_spec_read_amount (0),
     nonlinear_iv (false),
     ivexpr_map (NULL),
     scan_map (NULL),
@@ -10141,7 +10142,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       if (peel_mul)
        {
          if (!step_mul)
-           step_mul = peel_mul;
+           {
+             gcc_assert (!nunits.is_constant ());
+             step_mul = gimple_build (&init_stmts,
+                                      MINUS_EXPR, step_vectype,
+                                      build_zero_cst (step_vectype), peel_mul);
+           }
          else
            step_mul = gimple_build (&init_stmts,
                                     MINUS_EXPR, step_vectype,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a6f4db43a495bf728bb77107ad70d9178b6f4eac..dbeb8bdbf4f642b51668d220e21f33516e329833 100644 (file)
@@ -2400,70 +2400,26 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
       /* We can only peel for loops, of course.  */
       gcc_checking_assert (loop_vinfo);
 
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      poly_uint64 read_amount
+       = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+       read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
       auto target_alignment
        = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
-      unsigned HOST_WIDE_INT target_align;
-
-      bool group_aligned = false;
-      if (target_alignment.is_constant (&target_align)
-         && nunits.is_constant ())
-       {
-         poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-         auto vectype_size
-           = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-         poly_uint64 required_alignment = vf * vectype_size;
-         /* If we have a grouped access we require that the alignment be N * elem.  */
-         if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-           required_alignment *=
-               DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
-         if (!multiple_p (target_alignment, required_alignment))
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "desired alignment %wu not met. Instead got %wu "
-                            "for DR alignment at %G",
-                            required_alignment.to_constant (),
-                            target_align, STMT_VINFO_STMT (stmt_info));
-             return false;
-           }
-
-         if (!pow2p_hwi (target_align))
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "non-power-of-two vector alignment %wd "
-                            "for DR alignment at %G",
-                            target_align, STMT_VINFO_STMT (stmt_info));
-             return false;
-           }
-
-         /* For VLA we have to insert a runtime check that the vector loads
-            per iterations don't exceed a page size.  For now we can use
-            POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
-         if (known_gt (required_alignment, (unsigned)param_min_pagesize))
+      if (!multiple_p (target_alignment, read_amount))
+       {
+         if (dump_enabled_p ())
            {
-             if (dump_enabled_p ())
-               {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "alignment required for correctness (");
-                 dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
-                 dump_printf (MSG_NOTE, ") may exceed page size\n");
-               }
-             return false;
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "desired alignment not met, target was ");
+             dump_dec (MSG_NOTE, target_alignment);
+             dump_printf (MSG_NOTE, " previously, but read amount is ");
+             dump_dec (MSG_NOTE, read_amount);
+             dump_printf (MSG_NOTE, " at %G.\n", STMT_VINFO_STMT (stmt_info));
            }
-
-         group_aligned = true;
-       }
-
-      /* There are multiple loads that have a misalignment that we couldn't
-        align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
-        vectorize. */
-      if (!group_aligned)
-       {
-         if (inbounds)
-           LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
-         else
-           return false;
+         return false;
        }
 
       /* When using a group access the first element may be aligned but the
@@ -2485,6 +2441,33 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
                             STMT_VINFO_STMT (stmt_info));
          return false;
        }
+
+      /* Reject vectorization if we know the read amount per vector iteration
+        exceeds the min page size.  */
+      if (known_gt (read_amount, (unsigned) param_min_pagesize))
+       {
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "alignment required for correctness (");
+             dump_dec (MSG_MISSED_OPTIMIZATION, read_amount);
+             dump_printf (MSG_NOTE, ") may exceed page size.\n");
+           }
+         return false;
+       }
+
+      if (!vf.is_constant ())
+       {
+         /* For VLA modes, we need a runtime check to ensure any speculative
+            read amount does not exceed the page size.  Here we record the max
+            possible read amount for the check.  */
+         if (maybe_gt (read_amount,
+                       LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo)))
+           LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo) = read_amount;
+
+         /* For VLA modes, we must use partial vectors.  */
+         LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+       }
     }
 
   if (*alignment_support_scheme == dr_unaligned_unsupported)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9653496362ff82e96b73f037a636b24e01ee9a57..041cff80286f5c2d4b040db65eece12b07761f1e 100644 (file)
@@ -919,7 +919,10 @@ public:
   int peeling_for_alignment;
 
   /* The mask used to check the alignment of pointers or arrays.  */
-  int ptr_mask;
+  poly_uint64 ptr_mask;
+
+  /* The maximum speculative read amount in VLA modes for runtime check.  */
+  poly_uint64 max_spec_read_amount;
 
   /* Indicates whether the loop has any non-linear IV.  */
   bool nonlinear_iv;
@@ -1155,6 +1158,7 @@ public:
 #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
 #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
+#define LOOP_VINFO_MAX_SPEC_READ_AMOUNT(L) (L)->max_spec_read_amount
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
 #define LOOP_VINFO_DDRS(L)                 (L)->shared->ddrs
@@ -1209,6 +1213,8 @@ public:
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)      \
   ((L)->may_misalign_stmts.length () > 0)
+#define LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ(L)      \
+  (maybe_gt ((L)->max_spec_read_amount, 0U))
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L)          \
   ((L)->comp_alias_ddrs.length () > 0 \
    || (L)->check_unequal_addrs.length () > 0 \
@@ -1219,6 +1225,7 @@ public:
   (LOOP_VINFO_SIMD_IF_COND (L))
 #define LOOP_REQUIRES_VERSIONING(L)                    \
   (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (L)          \
+   || LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (L)       \
    || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (L)           \
    || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)          \
    || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L))