From eee51f9a4b6e584230f75e4616438bb5ad5935a9 Mon Sep 17 00:00:00 2001
From: Pengfei Li
Date: Thu, 7 Aug 2025 11:08:35 +0000
Subject: [PATCH] vect: Extend peeling and versioning for alignment to VLA
 modes

This patch extends support for peeling and versioning for alignment
from VLS modes to VLA modes.  The key change is to allow the DR target
alignment to be a non-constant poly_int.  Since the value must be a
power of two, the power-of-two check is deferred to runtime through
loop versioning when the VF is variable.  The vectorizable check for
speculative loads is also refactored in this patch to handle both
constant and variable target alignment values.

Additional changes for VLA modes include:

1) Peeling
   In VLA modes, we use peeling with masking - using a partial vector
   in the first iteration of the vectorized loop to ensure aligned DRs
   in subsequent iterations.  This was already enabled for VLS modes to
   avoid scalar peeling.  This patch reuses most of the existing logic
   and fixes a small issue of an incorrect IV offset in the VLA code
   path.  It also removes a power-of-two rounding when computing the
   number of iterations to peel, as a power-of-two VF is now guaranteed
   by a new runtime check.

2) Versioning
   The type of the mask for the runtime alignment check is updated to
   poly_int to support variable VFs.  After this change, both
   standalone versioning and peeling with versioning are available in
   VLA modes.  This patch also introduces another runtime check on the
   speculative read amount, to ensure that all speculative loads remain
   within the current valid memory page (see the C sketch after the
   ChangeLog below).  We plan to remove these runtime checks in the
   future by introducing a capped VF - using partial vectors to limit
   the actual VF value at runtime.

3) Speculative read flag
   With a variable target alignment, DRs whose scalar accesses are
   known to be in-bounds would be considered unsupported for unaligned
   accesses.  But in fact, speculative reads can be naturally avoided
   for in-bounds DRs as long as partial vectors are used.  Therefore,
   this patch clears the speculative flags and sets the "must use
   partial vectors" flag for these cases.

This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
arm-linux-gnueabihf and aarch64-linux-gnu with bootstrap-O3.

gcc/ChangeLog:

	* tree-vect-data-refs.cc (vect_compute_data_ref_alignment):
	Allow DR target alignment to be a poly_int.
	(vect_enhance_data_refs_alignment): Support peeling and
	versioning for VLA modes.
	* tree-vect-loop-manip.cc (get_misalign_in_elems): Remove
	power-of-two rounding in peeling.
	(vect_create_cond_for_align_checks): Update alignment check
	logic for poly_int mask.
	(vect_create_cond_for_vla_spec_read): New runtime checks.
	(vect_loop_versioning): Support new runtime checks.
	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new
	loop_vinfo field.
	(vectorizable_induction): Fix wrong IV offset issue.
	* tree-vect-stmts.cc (get_load_store_type): Refactor
	vectorizable checks for speculative loads.
	* tree-vectorizer.h (LOOP_VINFO_MAX_SPEC_READ_AMOUNT): New macro
	for new runtime checks.
	(LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ): Likewise.
	(LOOP_REQUIRES_VERSIONING): Update macro for new runtime checks.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/peel_ind_11.c: New test.
	* gcc.target/aarch64/sve/peel_ind_11_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_12.c: New test.
	* gcc.target/aarch64/sve/peel_ind_12_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_13.c: New test.
	* gcc.target/aarch64/sve/peel_ind_13_run.c: New test.
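For reference, the two runtime checks added for VLA speculative reads
are roughly equivalent to the following C sketch.  This is illustrative
only, not the emitted GIMPLE; "read_amount" stands for the runtime
value of LOOP_VINFO_MAX_SPEC_READ_AMOUNT and "min_pagesize" for
param_min_pagesize:

    /* Illustrative sketch; the real conditions are built as trees in
       vect_create_cond_for_vla_spec_read and chained into the loop
       versioning condition.  */
    static inline bool
    vla_spec_read_checks_ok (unsigned long read_amount,
                             unsigned long min_pagesize)
    {
      /* read_amount is a constant power-of-two multiple of the VF, so
         this also proves the runtime VF is a power of two.  */
      bool vf_pow2_ok = (read_amount & (read_amount - 1)) == 0;
      /* All speculative loads of one vector iteration must stay
         within one minimum-size page.  */
      bool fits_in_page = read_amount <= min_pagesize;
      return vf_pow2_ok && fits_in_page;
    }

If either check fails at runtime, the scalar version of the loop is
executed instead.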
---
 .../gcc.target/aarch64/sve/peel_ind_11.c      |  20 ++++
 .../gcc.target/aarch64/sve/peel_ind_11_run.c  |  27 +++++
 .../gcc.target/aarch64/sve/peel_ind_12.c      |  21 ++++
 .../gcc.target/aarch64/sve/peel_ind_12_run.c  |  29 +++++
 .../gcc.target/aarch64/sve/peel_ind_13.c      |  24 ++++
 .../gcc.target/aarch64/sve/peel_ind_13_run.c  |  15 +++
 gcc/tree-vect-data-refs.cc                    |  61 ++++++-----
 gcc/tree-vect-loop-manip.cc                   |  71 ++++++++++--
 gcc/tree-vect-loop.cc                         |   8 +-
 gcc/tree-vect-stmts.cc                        | 103 ++++++++----------
 gcc/tree-vectorizer.h                         |   9 +-
 11 files changed, 291 insertions(+), 97 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
new file mode 100644
index 00000000000..feb7ee7d61c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11.c
@@ -0,0 +1,20 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 3
+#define END 510
+
+int __attribute__((noipa))
+foo (int *a) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != 0)
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
new file mode 100644
index 00000000000..b4c267f2845
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_11_run.c
@@ -0,0 +1,27 @@
+/* Peeling for alignment with masking in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_11.c"
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  for (int k = 5; k < 30; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one non-zero element for test.  */
+    for (int i = 5; i < 30; i++)
+      a[i] = (i == k ? 1 : 0);
+
+    int res = foo (a);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
new file mode 100644
index 00000000000..260482a94df
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12.c
@@ -0,0 +1,21 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define START 5
+#define END 509
+
+int __attribute__((noipa))
+foo (int *restrict a, int * restrict b) {
+  for (signed int i = START; i < END; ++i) {
+    if (a[i] != b[i])
+      return i;
+  }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Both peeling and versioning will be applied" "vect" } } */
+/* { dg-final { scan-assembler {\tnot\tp[0-7]\.b, p[0-7]/z, p.*\n} } } */
+/* { dg-final { scan-assembler {\teor\t.*\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
new file mode 100644
index 00000000000..ba978fe01ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_12_run.c
@@ -0,0 +1,29 @@
+/* Peeling for alignment with masking together with versioning in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_12.c"
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N 512
+
+int __attribute__ ((optimize (1)))
+main (void) {
+  for (int k = 5; k < 50; k++) {
+    int *a = (int *) malloc (sizeof(int) * N);
+    int *b = (int *) malloc (sizeof(int) * N);
+
+    /* Set only one place of different values for test.  */
+    for (int i = 5; i < 50; i++) {
+      a[i] = (i == k ? 1 : 0);
+      b[i] = 0;
+    }
+
+    int res = foo (a, b);
+    asm volatile ("");
+    if (res != k) {
+      __builtin_abort ();
+    }
+  }
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
new file mode 100644
index 00000000000..730e33ed8f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13.c
@@ -0,0 +1,24 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 5
+#define END 509
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+  for (signed int i = START; i < END; ++i)
+    {
+      if (x[i] == 0)
+	return i;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c
new file mode 100644
index 00000000000..83352a83e50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_13_run.c
@@ -0,0 +1,15 @@
+/* Known inbounds DR in VLA modes.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast -msve-vector-bits=scalable --param aarch64-autovec-preference=sve-only" } */
+
+#include "peel_ind_13.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  int res = foo ();
+  asm volatile ("");
+  if (res != START)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index a9d4aaee771..a3d3b3e7f43 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1448,17 +1448,20 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   if (loop_vinfo
       && dr_safe_speculative_read_required (stmt_info))
     {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      auto vectype_size
+      /* The required target alignment must be a power-of-2 value and is
+	 computed as the product of vector element size, VF and group size.
+	 We compute the constant part first as VF may be a variable.  For
+	 variable VF, the power-of-2 check of VF is deferred to runtime.  */
+      auto align_factor_c
 	= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-      poly_uint64 new_alignment = vf * vectype_size;
-      /* If we have a grouped access we require that the alignment be N * elem.  */
       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-	new_alignment *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+	align_factor_c *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      poly_uint64 new_alignment = vf * align_factor_c;
 
-      unsigned HOST_WIDE_INT target_alignment;
-      if (new_alignment.is_constant (&target_alignment)
-	  && pow2p_hwi (target_alignment))
+      if ((vf.is_constant () && pow2p_hwi (new_alignment.to_constant ()))
+	  || (!vf.is_constant () && pow2p_hwi (align_factor_c)))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -1467,7 +1470,7 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
 	      dump_dec (MSG_NOTE, new_alignment);
 	      dump_printf (MSG_NOTE, " bytes.\n");
 	    }
-	  vector_alignment = target_alignment;
+	  vector_alignment = new_alignment;
 	}
     }
 
@@ -2438,6 +2441,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      - The cost of peeling (the extra runtime checks, the increase
        in code size).  */
 
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
@@ -2446,9 +2450,18 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       stmt_vec_info stmt_info = dr_info->stmt;
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-      do_peeling
-	= vector_alignment_reachable_p (dr_info,
-					LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+
+      /* With variable VF, unsafe speculative read can be avoided for known
+	 inbounds DRs as long as partial vectors are used.  */
+      if (!vf.is_constant ()
+	  && dr_safe_speculative_read_required (stmt_info)
+	  && DR_SCALAR_KNOWN_BOUNDS (dr_info))
+	{
+	  dr_set_safe_speculative_read_required (stmt_info, false);
+	  LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+	}
+
+      do_peeling = vector_alignment_reachable_p (dr_info, vf);
       if (do_peeling)
 	{
 	  if (known_alignment_for_access_p (dr_info, vectype))
@@ -2488,7 +2501,6 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 	      poly_uint64 nscalars = npeel_tmp;
 	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
 		{
-		  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 		  unsigned group_size = 1;
 		  if (STMT_SLP_TYPE (stmt_info)
 		      && STMT_VINFO_GROUPED_ACCESS (stmt_info))
@@ -2911,14 +2923,12 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
      2) there is at least one unsupported misaligned data ref with an unknown
         misalignment, and
      3) all misaligned data refs with a known misalignment are supported, and
-     4) the number of runtime alignment checks is within reason.
-     5) the vectorization factor is a constant.  */
+     4) the number of runtime alignment checks is within reason.  */
 
   do_versioning
     = (optimize_loop_nest_for_speed_p (loop)
       && !loop->inner /* FORNOW */
-       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP)
-      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ();
+       && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
 
   if (do_versioning)
     {
@@ -2965,25 +2975,22 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 	     ?? We could actually unroll the loop to achieve the required
 	     overall step alignment, and forcing the alignment could be
 	     done by doing some iterations of the non-vectorized loop.  */
-	  if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-			   * DR_STEP_ALIGNMENT (dr),
+	  if (!multiple_p (vf * DR_STEP_ALIGNMENT (dr),
 			   DR_TARGET_ALIGNMENT (dr_info)))
 	    {
 	      do_versioning = false;
 	      break;
 	    }
 
-	  /* The rightmost bits of an aligned address must be zeros.
-	     Construct the mask needed for this test.  For example,
-	     GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
-	     mask must be 15 = 0xf.  */
-	  gcc_assert (DR_TARGET_ALIGNMENT (dr_info).is_constant ());
-	  int mask = DR_TARGET_ALIGNMENT (dr_info).to_constant () - 1;
+	  /* Use "mask = DR_TARGET_ALIGNMENT - 1" to test rightmost address
+	     bits for runtime alignment check.  For example, for 16 bytes
+	     target alignment the mask is 15 = 0xf.  */
+	  poly_uint64 mask = DR_TARGET_ALIGNMENT (dr_info) - 1;
 
 	  /* FORNOW: use the same mask to test all potentially unaligned
 	     references in the loop.  */
-	  if (LOOP_VINFO_PTR_MASK (loop_vinfo)
-	      && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
+	  if (maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), 0U)
+	      && maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), mask))
 	    {
 	      do_versioning = false;
 	      break;
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 6c1b26adda3..566308f4fe5 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2454,10 +2454,7 @@ get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
   else
     {
       tree vla = build_int_cst (type, target_align);
-      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
-				    fold_build2 (MINUS_EXPR, type,
-						 build_int_cst (type, 0), vla));
-      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
+      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
 					  build_int_cst (type, 1));
     }
 
@@ -3840,7 +3837,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   const vec<stmt_vec_info> &may_misalign_stmts
     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
   stmt_vec_info stmt_info;
-  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
+  poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
   tree mask_cst;
   unsigned int i;
   tree int_ptrsize_type;
@@ -3852,9 +3849,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   tree ptrsize_zero;
   tree part_cond_expr;
 
-  /* Check that mask is one less than a power of 2, i.e., mask is
-     all zeros followed by all ones.  */
-  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
+  gcc_assert (known_ne (mask, 0U));
 
   int_ptrsize_type = signed_type_for (ptr_type_node);
 
@@ -3962,6 +3957,62 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
   chain_cond_expr (cond_expr, part_cond_expr);
 }
 
+/* Function vect_create_cond_for_vla_spec_read.
+
+   Create a conditional expression that represents the run-time checks on
+   the max speculative read amount in VLA modes.  We check two things:
+     1) whether the max speculative read amount exceeds the min page size
+     2) whether the VF is a power of two - done by checking the max read
+	amount instead
+
+   Input:
+   COND_EXPR  - input conditional expression.  New conditions will be chained
+		with logical AND operation.
+   LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
+		possible speculative read amount in VLA modes.
+
+   Output:
+   COND_EXPR - conditional expression.
+
+   The returned COND_EXPR is the conditional expression to be used in the
+   if statement that controls which version of the loop gets executed at
+   runtime.  */
+
+static void
+vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+  poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
+  tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
+
+  /* Both the read amount and the VF must be variable, and the read amount
+     must be a constant power-of-2 multiple of the VF.  */
+  unsigned HOST_WIDE_INT multiple;
+  gcc_assert (!read_amount_poly.is_constant ()
+	      && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+	      && constant_multiple_p (read_amount_poly,
+				      LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+				      &multiple)
+	      && pow2p_hwi (multiple));
+
+  tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
+  tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
+  tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
+					 (unsigned long) param_min_pagesize);
+
+  /* Create an expression of "amount & (amount - 1) == 0".  */
+  tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
+				amount, cst_ul_one);
+  tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
+				      amount, amount_m1);
+  tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+				       amount_and_expr, cst_ul_zero);
+  chain_cond_expr (cond_expr, powof2_cond_expr);
+
+  /* Create an expression of "amount <= cst_ul_pagesize".  */
+  tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
+					 amount, cst_ul_pagesize);
+  chain_cond_expr (cond_expr, pagesize_cond_expr);
+}
+
 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
@@ -4087,6 +4138,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
   gimple_seq gimplify_stmt_list = NULL;
   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
+  bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
   poly_uint64 versioning_threshold
@@ -4145,6 +4197,9 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
 				       &cond_expr_stmt_list);
 
+  if (version_spec_read)
+    vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
+
   if (version_alias)
     {
       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 85f3e90c124..55a849561e3 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1009,6 +1009,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     unaligned_dr (NULL),
     peeling_for_alignment (0),
     ptr_mask (0),
+    max_spec_read_amount (0),
     nonlinear_iv (false),
     ivexpr_map (NULL),
     scan_map (NULL),
@@ -10141,7 +10142,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       if (peel_mul)
 	{
 	  if (!step_mul)
-	    step_mul = peel_mul;
+	    {
+	      gcc_assert (!nunits.is_constant ());
+	      step_mul = gimple_build (&init_stmts,
+				       MINUS_EXPR, step_vectype,
+				       build_zero_cst (step_vectype), peel_mul);
+	    }
 	  else
 	    step_mul = gimple_build (&init_stmts, MINUS_EXPR, step_vectype,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a6f4db43a49..dbeb8bdbf4f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2400,70 +2400,26 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 	  /* We can only peel for loops, of course.  */
 	  gcc_checking_assert (loop_vinfo);
 
+	  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+	  poly_uint64 read_amount
+	    = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+	    read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+
 	  auto target_alignment
 	    = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
-	  unsigned HOST_WIDE_INT target_align;
-
-	  bool group_aligned = false;
-	  if (target_alignment.is_constant (&target_align)
-	      && nunits.is_constant ())
-	    {
-	      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-	      auto vectype_size
-		= TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
-	      poly_uint64 required_alignment = vf * vectype_size;
-	      /* If we have a grouped access we require that the alignment be N * elem.  */
-	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-		required_alignment *=
-		  DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
-	      if (!multiple_p (target_alignment, required_alignment))
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "desired alignment %wu not met. Instead got %wu "
-				     "for DR alignment at %G",
-				     required_alignment.to_constant (),
-				     target_align, STMT_VINFO_STMT (stmt_info));
-		  return false;
-		}
-
-	      if (!pow2p_hwi (target_align))
-		{
-		  if (dump_enabled_p ())
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "non-power-of-two vector alignment %wd "
-				     "for DR alignment at %G",
-				     target_align, STMT_VINFO_STMT (stmt_info));
-		  return false;
-		}
-
-	      /* For VLA we have to insert a runtime check that the vector loads
-		 per iterations don't exceed a page size.  For now we can use
-		 POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
-	      if (known_gt (required_alignment, (unsigned)param_min_pagesize))
+	  if (!multiple_p (target_alignment, read_amount))
+	    {
+	      if (dump_enabled_p ())
 		{
-		  if (dump_enabled_p ())
-		    {
-		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				       "alignment required for correctness (");
-		      dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
-		      dump_printf (MSG_NOTE, ") may exceed page size\n");
-		    }
-		  return false;
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "desired alignment not met, target was ");
+		  dump_dec (MSG_NOTE, target_alignment);
+		  dump_printf (MSG_NOTE, " previously, but read amount is ");
+		  dump_dec (MSG_NOTE, read_amount);
+		  dump_printf (MSG_NOTE, " at %G.\n",
+			       STMT_VINFO_STMT (stmt_info));
 		}
-
-	      group_aligned = true;
-	    }
-
-	  /* There are multiple loads that have a misalignment that we couldn't
-	     align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
-	     vectorize.  */
-	  if (!group_aligned)
-	    {
-	      if (inbounds)
-		LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
-	      else
-		return false;
+	      return false;
 	    }
 
 	  /* When using a group access the first element may be aligned but the
@@ -2485,6 +2441,33 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 		     STMT_VINFO_STMT (stmt_info));
 	      return false;
 	    }
+
+	  /* Reject vectorization if we know the read amount per vector
+	     iteration exceeds the min page size.  */
+	  if (known_gt (read_amount, (unsigned) param_min_pagesize))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				   "alignment required for correctness (");
+		  dump_dec (MSG_MISSED_OPTIMIZATION, read_amount);
+		  dump_printf (MSG_NOTE, ") may exceed page size.\n");
+		}
+	      return false;
+	    }
+
+	  if (!vf.is_constant ())
+	    {
+	      /* For VLA modes, we need a runtime check to ensure any
+		 speculative read amount does not exceed the page size.  Here
+		 we record the max possible read amount for the check.  */
+	      if (maybe_gt (read_amount,
+			    LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo)))
+		LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo) = read_amount;
+
+	      /* For VLA modes, we must use partial vectors.  */
+	      LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+	    }
 	}
 
       if (*alignment_support_scheme == dr_unaligned_unsupported)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9653496362f..041cff80286 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -919,7 +919,10 @@ public:
   int peeling_for_alignment;
 
   /* The mask used to check the alignment of pointers or arrays.  */
-  int ptr_mask;
+  poly_uint64 ptr_mask;
+
+  /* The maximum speculative read amount in VLA modes for runtime check.  */
+  poly_uint64 max_spec_read_amount;
 
   /* Indicates whether the loop has any non-linear IV.  */
   bool nonlinear_iv;
@@ -1155,6 +1158,7 @@ public:
 #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
 #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
 #define LOOP_VINFO_PTR_MASK(L)             (L)->ptr_mask
+#define LOOP_VINFO_MAX_SPEC_READ_AMOUNT(L) (L)->max_spec_read_amount
 #define LOOP_VINFO_LOOP_NEST(L)            (L)->shared->loop_nest
 #define LOOP_VINFO_DATAREFS(L)             (L)->shared->datarefs
 #define LOOP_VINFO_DDRS(L)                 (L)->shared->ddrs
@@ -1209,6 +1213,8 @@ public:
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L)	\
   ((L)->may_misalign_stmts.length () > 0)
+#define LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ(L)	\
+  (maybe_gt ((L)->max_spec_read_amount, 0U))
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIAS(L)		\
   ((L)->comp_alias_ddrs.length () > 0			\
    || (L)->check_unequal_addrs.length () > 0		\
@@ -1219,6 +1225,7 @@ public:
   (LOOP_VINFO_SIMD_IF_COND (L))
 #define LOOP_REQUIRES_VERSIONING(L)			\
   (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (L)		\
+   || LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (L)	\
    || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (L)		\
    || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L)		\
    || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L))
-- 
2.47.2