--- /dev/null
+/* Loop vectorization using unified representation for permute
+   instructions.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+/* Loop autovectorization using unified representation for permute
+ instructions. */
+#if 1
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "predict.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "cgraph.h"
+#include "fold-const.h"
+#include "stor-layout.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-cfg.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
+#include "tree-ssa-propagate.h"
+#include "dbgcnt.h"
+#include "tree-scalar-evolution.h"
+#include "tree-vect-unified.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "target.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "optabs-tree.h"
+#include "dumpfile.h"
+#include "alias.h"
+#include "tree-eh.h"
+#include "gimplify.h"
+#include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-ssa-loop.h"
+#include "expr.h"
+#include "builtins.h"
+#include "params.h"
+#include "pretty-print.h"
+
+/* Entry point to the autovectorizer, which selects permute instructions
+   with a tree-tiling algorithm over the unified representation. */
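+
+/* For example (illustrative), a loop with strided accesses such as
+
+     for (i = 0; i < n; i++)
+       a[2*i] = b[3*i];
+
+   is modeled with interleave (ILV) and extract (EXTR) nodes over memory
+   references, and permute instructions are selected by tiling that tree.  */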
+
+namespace {
+
+const pass_data pass_data_unified_vectorize =
+{
+ GIMPLE_PASS, /* type */
+ "unified-vect", /* name */
+ OPTGROUP_LOOP | OPTGROUP_VEC, /* optinfo_flags */
+ TV_TREE_VECTORIZATION, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_unified_vectorize : public gimple_opt_pass
+{
+public:
+ pass_unified_vectorize (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_unified_vectorize, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *fun)
+ {
+ return (flag_tree_loop_vectorize && flag_tree_loop_vectorize_unified)
+ || fun->has_force_vectorize_loops;
+ }
+
+ virtual unsigned int execute (function *);
+
+}; // class pass_unified_vectorize
+
+unsigned int
+pass_unified_vectorize::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ return vectorize_loops_using_uniop ();
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_unified_vectorize (gcc::context *ctxt)
+{
+ return new pass_unified_vectorize (ctxt);
+}
+
+/* Function new_iter_node.
+
+ Create new ITER_node for the loop LOOP, and return the pointer. */
+
+static struct ITER_node *
+new_iter_node (struct loop *loop)
+{
+ struct ITER_node *res;
+ basic_block *bbs;
+ gimple_stmt_iterator si;
+  unsigned int i;
+
+ res = (struct ITER_node *) xcalloc (1, sizeof (struct ITER_node));
+ ITER_NODE_LOOP (res) = loop;
+ ITER_NODE_NITERS (res) = NULL;
+ ITER_NODE_PROLOGUE (res) = vNULL;
+ ITER_NODE_LOOP_PEEL_NEEDED (res) = 0;
+ ITER_NODE_LOOP_BODY (res) = vNULL;
+ ITER_NODE_PEELING_FOR_GAPS (res) = false;
+ ITER_NODE_PEELING_FOR_NITER (res) = false;
+ ITER_NODE_EPILOGUE (res) = vNULL;
+
+ bbs = get_loop_body (loop);
+
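+  /* Clear the uid of every phi and stmt in the loop body; per-statement
+     attribute info is attached later during analysis.  */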
+ for (i = 0; i < loop->num_nodes; i++)
+ {
+ basic_block bb = bbs[i];
+
+ for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *phi = gsi_stmt (si);
+ gimple_set_uid (phi, 0);
+ }
+
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *stmt = gsi_stmt (si);
+ gimple_set_uid (stmt, 0);
+ }
+ }
+
+  free (bbs);
+  return res;
+}
+
+/* Function destroy_iter_node_info.
+
+   Clear the statements within the loop corresponding to the ITER_node INODE,
+   and destroy the ITER_node. */
+
+static void
+destroy_iter_node_info (struct ITER_node *inode)
+{
+  unsigned int i;
+ basic_block *bbs;
+ gimple_stmt_iterator si;
+
+ if (ITER_NODE_PROLOGUE (inode) != vNULL)
+ ITER_NODE_PROLOGUE (inode).release ();
+ if (ITER_NODE_LOOP_BODY (inode) != vNULL)
+ ITER_NODE_LOOP_BODY (inode).release ();
+ if (ITER_NODE_EPILOGUE (inode) != vNULL)
+ ITER_NODE_EPILOGUE (inode).release ();
+
+ bbs = get_loop_body (ITER_NODE_LOOP (inode));
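+  /* Clear the uids so that no stale references to the per-statement info
+     remain once the ITER_node is destroyed.  */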
+ for (i = 0; i < ITER_NODE_LOOP (inode)->num_nodes; i++)
+ {
+ basic_block bb = bbs[i];
+
+ for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *phi = gsi_stmt (si);
+ gimple_set_uid (phi, 0);
+ }
+
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *stmt = gsi_stmt (si);
+ gimple_set_uid (stmt, 0);
+ }
+ }
+
+  free (bbs);
+  free (inode);
+}
+
+/* Function new_stmt_attr.
+
+   Create a statement attribute structure, initialize its fields, and return
+   a pointer to it. */
+
+static struct stmt_attr *
+new_stmt_attr ()
+{
+ struct stmt_attr *info;
+ info = (struct stmt_attr *) xcalloc (1, sizeof (struct stmt_attr));
+ info->use_type = stmt_use_type_undef;
+ info->access_fn = NULL;
+ info->ptree = NULL;
+ info->dr = NULL;
+ info->probable_root = false;
+ info->vectype = NULL;
+ return info;
+}
+
+/* Function vect_populate_iter_node_from_loop.
+
+ Create new ITER_node corresponding to loop LOOP, and fill all fields in
+ ITER_node necessary for auto-vectorization. */
+
+static struct ITER_node *
+vect_populate_iter_node_from_loop (struct loop *loop)
+{
+ tree number_of_iterations, number_of_iterationsm1;
+ basic_block *bbs;
+ gcond *loop_cond, *inner_loop_cond = NULL;
+ int i;
+ gimple_stmt_iterator si;
+
+ if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
+ &number_of_iterations, &inner_loop_cond))
+ return NULL;
+
+ struct ITER_node * t_iter_node = new_iter_node (loop);
+ ITER_NODE_NITERS (t_iter_node) = number_of_iterations;
+
+ bbs = get_loop_body (loop);
+ for (i = 0; i < loop->num_nodes; i++)
+ {
+ basic_block bb = bbs[i];
+
+ for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *phi = gsi_stmt (si);
+ set_stmt_attr (phi, new_stmt_attr ());
+ }
+
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ gimple *stmt = gsi_stmt (si);
+ set_stmt_attr (stmt, new_stmt_attr ());
+ }
+ }
+
+ if (!ITER_NODE_NITERS_KNOWN_P (t_iter_node))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Symbolic number of iterations is ");
+ dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
+ dump_printf (MSG_NOTE, "\n");
+ }
+ }
+
+ STMT_ATTR_USE_TYPE (loop_cond) = stmt_use_type_loop_exit_ctrl;
+ if (inner_loop_cond)
+ STMT_ATTR_USE_TYPE (inner_loop_cond) = stmt_use_type_loop_exit_ctrl;
+
+ gcc_assert (!loop->aux);
+ loop->aux = t_iter_node;
+ return t_iter_node;
+}
+
+/* Function vect_analyze_dataref_access.
+
+   Analyze the access pattern of data reference DR within the LOOP. All
+   access patterns except variable stride are supported. */
+
+static bool
+vect_analyze_dataref_access (struct data_reference *dr, struct loop * loop)
+{
+ tree step = DR_STEP (dr);
+ gimple *stmt = DR_STMT (dr);
+
+ if (!step)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad data-ref access in loop\n");
+ return false;
+ }
+
+ if (integer_zerop (step))
+ {
+ /* Stride is 0, i.e. the data_ref is loop invariant. So, writes cannot be
+ vectorized. */
+ if (nested_in_vect_loop_p (loop, stmt))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "zero step in inner loop of nest\n");
+
+ if (!loop->force_vectorize)
+ return false;
+ }
+
+ return DR_IS_READ (dr);
+ }
+
+ if (loop && nested_in_vect_loop_p (loop, stmt))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "grouped access in outer loop.\n");
+ return false;
+ }
+
+ /* For now, do not vectorize for variable step size. */
+ if (TREE_CODE (step) != INTEGER_CST)
+ return false;
+
+ return true;
+}
+
+/* Function vect_analyze_dataref_accesses.
+
+ Analyze all the data refs within the LOOP for the access pattern. */
+
+static bool
+vect_analyze_dataref_accesses (struct ITER_node *inode)
+{
+ unsigned int i;
+ vec<data_reference_p> datarefs = ITER_NODE_DATA_REFS (inode);
+ struct data_reference *dr;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "=== vect_analyze_dataref_accesses ===\n");
+
+ if (datarefs.is_empty ())
+ return true;
+
+ FOR_EACH_VEC_ELT (datarefs, i, dr)
+ if (!vect_analyze_dataref_access (dr, ITER_NODE_LOOP (inode)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: complicated access pattern.\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+/* Function vect_analyze_datarefs.
+
+ Analyze all data references DR within the LOOP to check if vectorization is
+ possible. */
+
+static bool
+vect_analyze_datarefs (struct ITER_node *inode)
+{
+ struct loop *loop = ITER_NODE_LOOP (inode);
+ vec<data_reference_p> datarefs = ITER_NODE_DATA_REFS (inode);
+ tree scalar_type;
+ tree vec_type;
+ struct data_reference *dr;
+ unsigned int i;
+
+ FOR_EACH_VEC_ELT (datarefs, i, dr)
+ {
+ gimple *stmt;
+      bool simd_lane_access = false;
+      int vf;
+
+again:
+ if (!dr || !DR_REF (dr))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: unhandled data-ref\n");
+ return false;
+ }
+
+ stmt = DR_STMT (dr);
+
+ /* Discard clobbers from the dataref vector. We will remove
+ clobber stmts during vectorization. */
+ if (gimple_clobber_p (stmt))
+ {
+ free_data_ref (dr);
+ if (i == datarefs.length () - 1)
+ {
+ datarefs.pop ();
+ break;
+ }
+ datarefs.ordered_remove (i);
+ dr = datarefs[i];
+ goto again;
+ }
+
+ /* Check that analysis of the data-ref succeeded. */
+ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
+ || !DR_STEP (dr))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: data ref analysis "
+ "failed ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+
+ if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: base addr of dr is a "
+ "constant\n");
+
+ return false;
+ }
+
+ if (TREE_THIS_VOLATILE (DR_REF (dr)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: volatile type ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+
+ if (stmt_can_throw_internal (stmt))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: statement can throw an "
+ "exception ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+
+ if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
+ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: statement is bitfield "
+ "access ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+
+ if (is_gimple_call (stmt)
+ && (!gimple_call_internal_p (stmt)
+ || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
+ && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: dr in a call ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+
+      /* If the dataref is in an inner-loop of the loop that is considered
+ for vectorization, we also want to analyze the access relative to
+ the outer-loop (DR contains information only relative to the
+ inner-most enclosing loop). We do that by building a reference to the
+ first location accessed by the inner-loop, and analyze it relative to
+ the outer-loop. */
+ if (loop && nested_in_vect_loop_p (loop, stmt))
+ {
+ /* Do nothing for now, as the purpose is unclear. */
+#if 0
+ /* Build a reference to the first location accessed by the
+ inner-loop: *(BASE+INIT). (The first location is actually
+ BASE+INIT+OFFSET, but we add OFFSET separately later). */
+ tree inner_base = build_fold_indirect_ref
+ (fold_build_pointer_plus (base, init));
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "analyze in outer-loop: ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
+ dump_printf (MSG_NOTE, "\n");
+ }
+
+ outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
+ &poffset, &pmode, &punsignedp,
+ &preversep, &pvolatilep, false);
+ gcc_assert (outer_base != NULL_TREE);
+
+ if (pbitpos % BITS_PER_UNIT != 0)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "failed: bit offset alignment.\n");
+ return false;
+ }
+
+ if (preversep)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "failed: reverse storage order.\n");
+ return false;
+ }
+
+ outer_base = build_fold_addr_expr (outer_base);
+ if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
+ &base_iv, false))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "failed: evolution of base is not affine.\n");
+ return false;
+ }
+
+ if (offset)
+ {
+ if (poffset)
+ poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
+ poffset);
+ else
+ poffset = offset;
+ }
+
+ if (!poffset)
+ {
+ offset_iv.base = ssize_int (0);
+ offset_iv.step = ssize_int (0);
+ }
+ else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
+ &offset_iv, false))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "evolution of offset is not affine.\n");
+ return false;
+ }
+
+ outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
+ split_constant_offset (base_iv.base, &base_iv.base, &dinit);
+ outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
+ split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
+ outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
+
+ outer_step = size_binop (PLUS_EXPR,
+ fold_convert (ssizetype, base_iv.step),
+ fold_convert (ssizetype, offset_iv.step));
+#endif
+ }
+
+ STMT_ATTR_DR (stmt) = dr;
+
+ if (simd_lane_access)
+ {
+ free_data_ref (datarefs[i]);
+ datarefs[i] = dr;
+ }
+
+ /* Set vectype for STMT. */
+ scalar_type = TREE_TYPE (DR_REF (dr));
+ vec_type = get_vectype_for_scalar_type (scalar_type);
+ if (!vec_type)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: no vectype for stmt: ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
+ dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
+ scalar_type);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return false;
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "got vectype for stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_generic_expr (MSG_NOTE, TDF_SLIM,
+ vec_type);
+ dump_printf (MSG_NOTE, "\n");
+ }
+ }
+
+ /* Adjust the minimal vectorization factor according to the
+ vector type. */
+ vf = TYPE_VECTOR_SUBPARTS (vec_type);
+
+ if (TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
+ {
+ if (nested_in_vect_loop_p (loop, stmt))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: not suitable for strided "
+ "load ");
+ dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+
+/* Function is_raw_dependence.
+
+   Helper function to check if there exists a read-after-write dependence
+   between data references DRA and DRB. */
+
+static bool
+is_raw_dependence (struct data_reference *dra, struct data_reference *drb)
+{
+ gimple *earlier;
+ struct data_reference *earlier_ref;
+
+ /* Identify first reference, and check if read-after-write condition. */
+ earlier = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
+ if (earlier)
+ {
+ if (earlier == DR_STMT (dra))
+ earlier_ref = dra;
+ else
+ earlier_ref = drb;
+
+ if (DR_IS_WRITE (earlier_ref))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "READ_WRITE dependence in interleaving."
+ "\n");
+ return true;
+ }
+ }
+ return false;
+}
+
+/* Function vect_analyze_dataref_dependence.
+
+   Return TRUE if the dependence between the data references DRA and DRB in
+   DDR prevents vectorization; return FALSE if the loop remains vectorizable
+   with respect to this dependence. */
+
+static bool
+vect_analyze_dataref_dependence (struct data_dependence_relation *ddr,
+ struct ITER_node *inode)
+{
+ unsigned int i;
+ struct loop *loop = ITER_NODE_LOOP (inode);
+ struct data_reference *dra = DDR_A (ddr);
+ struct data_reference *drb = DDR_B (ddr);
+ lambda_vector dist_v;
+ unsigned int loop_depth;
+
+ /* Independent data accesses. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ return false;
+
+ if (dra == drb
+ || (DR_IS_READ (dra) && DR_IS_READ (drb)))
+ return false;
+
+ /* Even if we have an anti-dependence then, as the vectorized loop covers at
+ least two scalar iterations, there is always also a true dependence.
+ As the vectorizer does not re-order loads and stores we can ignore
+ the anti-dependence if TBAA can disambiguate both DRs similar to the
+ case with known negative distance anti-dependences (positive
+ distance anti-dependences would violate TBAA constraints). */
+ if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
+ || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
+ && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
+ get_alias_set (DR_REF (drb))))
+ return false;
+
+ /* Unknown data dependence. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+ {
+ /* If user asserted safelen consecutive iterations can be
+ executed concurrently, assume independence. */
+ if (loop->safelen >= 2)
+ {
+ return false;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "versioning for alias required: "
+ "can't determine dependence between ");
+ dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ DR_REF (dra));
+ dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
+ dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+ DR_REF (drb));
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ /* Add to list of ddrs that need to be tested at run-time. */
+ return !vect_mark_for_runtime_alias_test_1 (ddr, loop);
+ }
+
+ /* Known data dependence. */
+ if (DDR_NUM_DIST_VECTS (ddr) == 0)
+ {
+ /* If user asserted safelen consecutive iterations can be
+ executed concurrently, assume independence. */
+ if (loop->safelen >= 2)
+ {
+ return false;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "versioning for alias required: "
+ "bad dist vector for ");
+ dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
+ dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
+ dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
+ dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+ /* Add to list of ddrs that need to be tested at run-time. */
+ return !vect_mark_for_runtime_alias_test_1 (ddr, loop);
+ }
+
+ loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+ {
+ int dist = dist_v[loop_depth];
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "dependence distance = %d.\n", dist);
+
+ if (dist == 0)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "dependence distance == 0 between ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
+ dump_printf (MSG_NOTE, " and ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
+	      dump_printf (MSG_NOTE, "\n");
+ }
+
+ if (is_raw_dependence (dra, drb))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "READ_WRITE dependence in interleaving."
+ "\n");
+ return true;
+ }
+
+ continue;
+ }
+
+ if (dist > 0 && DDR_REVERSED_P (ddr))
+ {
+ /* If DDR_REVERSED_P the order of the data-refs in DDR was
+ reversed (to make distance vector positive), and the actual
+ distance is negative. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "dependence distance negative.\n");
+ continue;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized, possible dependence "
+ "between data-refs ");
+	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
+	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
+	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
+	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+/* Function vect_analyze_dataref_dependences.
+
+   Examine all the data references within the LOOP, and make sure no data
+   dependence between them prevents vectorization. */
+
+static bool
+vect_analyze_dataref_dependences (struct ITER_node *inode,
+ vec<loop_p> loop_nest)
+{
+ unsigned int i;
+ struct data_dependence_relation *ddr;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "=== vect_analyze_data_ref_dependences ===\n");
+
+ ITER_NODE_DATA_DEPS (inode).create (ITER_NODE_DATA_REFS (inode).length ()
+ * ITER_NODE_DATA_REFS (inode).length ());
+ if (!compute_all_dependences (ITER_NODE_DATA_REFS (inode),
+ &ITER_NODE_DATA_DEPS (inode),
+ loop_nest, true))
+ return false;
+
+ FOR_EACH_VEC_ELT (ITER_NODE_DATA_DEPS (inode), i, ddr)
+ if (vect_analyze_dataref_dependence (ddr, inode))
+ return false;
+
+ return true;
+}
+
+/* Function mark_addr_index_stmt.
+
+   Mark the statements involved in address computation within the loop as
+   scalars, since they need not be vectorized. Currently a stub. */
+
+static void
+mark_addr_index_stmt (tree use, struct loop *loop)
+{
+ return;
+}
+
+/* Function classify_loop_stmt.
+
+ The computations within the loop can be classified as :
+   - Induction : Following form where v1 is loop invariant.
+
+ var_phi = PHI (var_loop, var_inv)
+ :
+ var_loop = op (var_phi, v1)
+
+ - Reduction : Following form where reduction variable var_loop or reduction
+ component var_phi has single use. v1 can be any vectorizable
+ statement.
+
+ var_phi = PHI (var_loop, var_inv)
+ :
+ var_loop = op (var_phi, v1)
+ :
+ var_out_phi = PHI (var_loop)
+
+ - Intermediate : Intermediate computations defining ssa_vars which are not
+                    live beyond the loop. The operands can be loop invariant/
+ results of computations within the loop/PHI nodes/constant/
+ memory.
+
+   - Loop invariant : Constant/memory references/ssa_vars with computations
+ outside the loop. There is no redefinition of components
+ of computations within the loop.
+
+ - Scalar : The statements which contribute in address computations or other
+ accesses which need not be vectorized.
+
+ - Complex : Not vectorizable.
+
+ classify_loop_stmt ():
+ - The loop statement with all operands on RHS as loop invariant (constants or
+ ssa_vars defined outside loop) are marked as loop invariant. RO memory can
+ be marked as loop invariant. Currently, RW memory is not marked as loop
+ invariant.
+
+ - The loop statement which is not vectorizable is marked as complex, and
+ vectorization is terminated.
+
+ - All other loop statements which have non-PHI nodes as output are marked as
+ intermediate.
+
+ - If any PHI node, which is neither reduction nor induction variable,
+ vectorization is terminated.
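+
+   For example (illustrative), in a loop body such as
+
+       i_11 = PHI <0, i_12>    <- induction
+       _1 = a[i_11];           <- intermediate (load)
+       _2 = _1 * c_5;          <- intermediate (c_5 is loop invariant)
+       b[i_11] = _2;           <- intermediate (store, probable root)
+       i_12 = i_11 + 1;        <- induction
+
+   any statement computing only addresses would be marked scalar.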
+*/
+
+static bool
+classify_loop_stmt (gimple *stmt, struct loop * loop)
+{
+  const char *attr_name[] = {"Undefined", "Scalar", "loop_invariant",
+			     "induction", "reduction", "intermediate",
+			     "complex", "loop_exit_control"};
+ enum stmt_use_type stmt_type, def_type;
+ use_operand_p use_p;
+ ssa_op_iter iter;
+
+ if (gimple_has_volatile_ops (stmt))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: stmt has volatile operand\n");
+ return false;
+ }
+
+ if (STMT_ATTR_USE_TYPE (stmt) != stmt_use_type_undef
+ && STMT_ATTR_USE_TYPE (stmt) != stmt_use_type_scalar)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, " classify_loop_stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " Preprocessed : %s\n",
+ attr_name[STMT_ATTR_USE_TYPE (stmt)]);
+ }
+ return (STMT_ATTR_USE_TYPE (stmt) != stmt_use_type_complex);
+ }
+
+ if (gimple_code (stmt) == GIMPLE_PHI)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, " classify_loop_stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " PHI : %s\n",
+ attr_name[STMT_ATTR_USE_TYPE (stmt)]);
+ }
+ return true;
+ }
+
+ /* The statement lies outside the loop, so no need to analyze the statement
+ further. */
+ if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
+ {
+ STMT_ATTR_USE_TYPE (stmt) = stmt_use_type_loop_invariant;
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, " classify_loop_stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " Statement outside the loop under consideration.\n");
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " : %s\n",
+ attr_name[STMT_ATTR_USE_TYPE (stmt)]);
+ }
+ return true;
+ }
+
+ /* If DR in STMT, there is LOAD/STORE operation. Mark the instructions
+ computing address for indexing as non-vectorizable/scalar. */
+ if (STMT_ATTR_DR (stmt) && gimple_assign_single_p (stmt))
+ {
+ tree scalar_type;
+ tree op0 = gimple_assign_lhs (stmt);
+ tree op1 = gimple_assign_rhs1 (stmt);
+ enum tree_code code;
+
+ if (TREE_CODE (op0) == SSA_NAME)
+ {
+ code = TREE_CODE (op1);
+ if (code == ARRAY_REF
+ || code == BIT_FIELD_REF
+ || code == INDIRECT_REF
+ || code == COMPONENT_REF
+ || code == IMAGPART_EXPR
+ || code == REALPART_EXPR
+ || code == MEM_REF
+ || TREE_CODE_CLASS (code) == tcc_declaration)
+ {
+ /* LOAD - trace SSA_NAME for address if any. Mark it as scalar if
+ not marked already. For loads, no need to analyze uses. */
+ mark_addr_index_stmt (TREE_OPERAND (op1, 0), loop);
+ STMT_ATTR_USE_TYPE (stmt) = stmt_use_type_intermediate;
+ scalar_type = TREE_TYPE (DR_REF (STMT_ATTR_DR (stmt)));
+ STMT_ATTR_VECTYPE (stmt) =
+ get_vectype_for_scalar_type (scalar_type);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " classify_loop_stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " : %s\n",
+ attr_name[STMT_ATTR_USE_TYPE (stmt)]);
+ }
+ return true;
+ }
+ /* TODO: Conversion needs to be handled here. */
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Expected load.\n");
+ return false;
+ }
+ else
+ {
+ code = TREE_CODE (op0);
+ if (code == ARRAY_REF
+ || code == BIT_FIELD_REF
+ || code == INDIRECT_REF
+ || code == COMPONENT_REF
+ || code == IMAGPART_EXPR
+ || code == REALPART_EXPR
+ || code == MEM_REF
+ || TREE_CODE_CLASS (code) == tcc_declaration)
+ {
+ /* STORE - Trace SSA_NAME for address if any. Mark it as scalar
+ if not marked already. Do not return, as we need to analyze
+ uses. However, the statement itself is marked of type
+ intermediate, so that it is not marked as loop invariant. */
+ mark_addr_index_stmt (TREE_OPERAND (op0, 0), loop);
+ STMT_ATTR_USE_TYPE (stmt) = stmt_use_type_intermediate;
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Expected store.\n");
+
+ return false;
+ }
+ }
+ }
+
+ /* TODO: PAREN_EXPR and CONVERT_EXPR_CODE_P need to be handled here?? - may
+ not. But still, marking the location. */
+ bool retval = true;
+ tree vec_type = NULL;
+ stmt_type = STMT_ATTR_USE_TYPE (stmt);
+ FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
+ {
+ tree op = USE_FROM_PTR (use_p);
+ enum tree_code code = TREE_CODE (op);
+
+ if (code == SSA_NAME)
+ {
+ if (!SSA_NAME_IS_DEFAULT_DEF (op))
+ {
+ gimple *def_stmt = SSA_NAME_DEF_STMT (op);
+ retval &= classify_loop_stmt (def_stmt, loop);
+
+ def_type = STMT_ATTR_USE_TYPE (def_stmt);
+
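+	      /* Merge the def's classification into this statement's: a use
+		 of only induction variables makes this statement scalar
+		 (address computation), while mixing distinct non-scalar def
+		 types makes it intermediate.  */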
+	      if (def_type == stmt_use_type_induction
+		  && stmt_type == stmt_use_type_undef)
+		stmt_type = stmt_use_type_scalar;
+	      else if (stmt_type > stmt_use_type_scalar
+		       && stmt_type != def_type)
+		stmt_type = stmt_use_type_intermediate;
+ else
+ stmt_type = def_type;
+
+ vec_type = STMT_ATTR_VECTYPE (def_stmt);
+ if (STMT_ATTR_VECTYPE (stmt) && vec_type
+ && !useless_type_conversion_p (vec_type,
+ STMT_ATTR_VECTYPE (stmt)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: Types of operands do not match.\n");
+ return false;
+ }
+ STMT_ATTR_VECTYPE (stmt) = vec_type;
+ }
+ }
+ else if (CONSTANT_CLASS_P (op) || is_gimple_min_invariant (op))
+ {
+ if (stmt_type <= stmt_use_type_loop_invariant)
+ stmt_type = stmt_use_type_loop_invariant;
+ else
+ stmt_type = stmt_use_type_intermediate;
+ }
+ }
+
+ /* Once all the USEs of stmt are marked, mark the type of this statement. */
+ STMT_ATTR_USE_TYPE (stmt) = stmt_type;
+
+ if (stmt_type == stmt_use_type_undef)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: stmt use type UNDEF at end of processing.\n");
+ return false;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, " classify_loop_stmt: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " : %s\n",
+ attr_name[STMT_ATTR_USE_TYPE (stmt)]);
+ }
+
+ return retval;
+}
+
+/*
+ FUNCTION classify_loop_stmts.
+
+ - Analyze PHI nodes in loop header to identify induction variables. As
+ induction variables have initial definition and a definition within loop,
+   each induction variable ought to be in the PHI nodes of loop header. Hence,
+ we need not identify them from loop body.
+
+   - If the induction variable has a variable step, ILV node creation is not
+     possible (for now), as the arity depends upon the step size. It can still
+     be a reduction variable, hence the vectorization decision is deferred.
+
+ - The variables that are not induction variables, are checked if they are
+ reduction variables - the reduction operation is vectorizable only if the
+ reduction variable does not have any use apart from that in outgoing PHI
+ node.
+
+ - Invoke function classify_loop_stmt () to classify each statement in
+ PROBABLE_ROOTS list recursively so that all the USEs in the statement are
+ processed.
+*/
+
+static bool
+classify_loop_stmts (struct ITER_node *inode, vec<gimple *> *probable_roots)
+{
+  struct loop *loop = ITER_NODE_LOOP (inode);
+  vec<gimple *> worklist = vNULL;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "==== classify_loop_stmts ====\n");
+ }
+
+  /* Mark the PHI nodes of the loop header.  */
+ for (gphi_iterator si = gsi_start_phis (loop->header);
+ !gsi_end_p (si); gsi_next (&si))
+ {
+ gphi *phi = si.phi ();
+ tree access_fn = NULL;
+ tree def = PHI_RESULT (phi);
+ tree init, step;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+ }
+
+ if (virtual_operand_p (def))
+ continue;
+
+ access_fn = analyze_scalar_evolution (loop, def);
+ if (access_fn)
+ {
+ STMT_ATTR_ACCESS_FN (phi) = access_fn;
+ }
+ else
+ {
+ worklist.safe_push (phi);
+ continue;
+ }
+
+ if (vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step))
+ {
+ STMT_ATTR_USE_TYPE (phi) = stmt_use_type_induction;
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Detected induction.\n");
+ dump_printf (MSG_NOTE, "\n");
+ }
+ }
+ else
+ worklist.safe_push (phi);
+ }
+#define REDUCTION_COMPLETE 0
+#if REDUCTION_COMPLETE
+ while (worklist.length () > 0)
+ {
+ gimple *phi = worklist.pop ();
+ tree def = PHI_RESULT (phi);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Analyze phi for reduction: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+ }
+
+ gcc_assert (!virtual_operand_p (def)
+ && STMT_ATTR_USE_TYPE (phi) == stmt_use_type_undef);
+
+ reduc_stmt = vect_simple_reduction ();
+
+ if (reduc_stmt)
+ {
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Unknown def-use cycle pattern.\n");
+ dump_printf (MSG_NOTE, "\n");
+ }
+ }
+ }
+#else
+ if (worklist.length () > 0)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Reduction not supported yet. Unknown def-use cycle pattern.\n");
+ dump_printf (MSG_NOTE, "\n");
+ }
+ return false;
+ }
+#endif
+
+ worklist = (*probable_roots).copy ();
+ while (worklist.length () > 0)
+ {
+ gimple *stmt = worklist.pop ();
+
+ if (!classify_loop_stmt (stmt, loop))
+ return false;
+ }
+
+ return true;
+}
+
+/* FUNCTION mark_probable_root_nodes.
+
+ The loop has side effects only if
+ - Loop writes to memory location.
+ - The computations within loop are live after the loop.
+
+ This function identifies such nodes.
+
+ mark_probable_root_nodes () -
+ FORALL statements within loop DO
+ - If the result of statement is in outgoing PHI node (live across loop
+ body) mark the statement for probable root p-node.
+
+ - If the result of statement is memory, mark the statement as probable root
+ p-node.
+
+ - If the loop is inner loop, and any statement uses induction variable of
+ outer loop - because of which stride > vector_size, do not vectorize.
+*/
+
+static void
+mark_probable_root_nodes (struct ITER_node *inode,
+ vec<gimple *> *probable_roots)
+{
+ int i;
+ gimple *stmt;
+ ssa_op_iter op_iter;
+ imm_use_iterator imm_iter;
+ use_operand_p use_p;
+ def_operand_p def_p;
+ struct loop *loop = ITER_NODE_LOOP (inode);
+ int nbbs = loop->num_nodes;
+ basic_block *bbs = get_loop_body (ITER_NODE_LOOP (inode));
+
+ for (i = 0; i < nbbs; i++)
+ {
+ basic_block bb = bbs[i];
+ gimple_stmt_iterator si;
+
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ stmt = gsi_stmt (si);
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " mark_probable_root_nodes: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ }
+
+ if (gimple_has_volatile_ops (stmt))
+ continue;
+
+ if (gimple_vdef (stmt) && !gimple_clobber_p (stmt))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " : Memory def.\n");
+ probable_roots->safe_push (stmt);
+ STMT_ATTR_PROOT (stmt) = true;
+ }
+
+ FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
+ {
+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, DEF_FROM_PTR (def_p))
+ {
+ basic_block bb = gimple_bb (USE_STMT (use_p));
+ if (!flow_bb_inside_loop_p (loop, bb))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " : Live beyond loop.\n");
+
+ if (is_gimple_debug (USE_STMT (use_p)))
+ continue;
+
+ /* We expect all such uses to be in the loop exit phis
+ (because of loop closed form) */
+ gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
+ gcc_assert (bb == single_exit (loop)->dest);
+
+ probable_roots->safe_push (stmt);
+ STMT_ATTR_PROOT (stmt) = true;
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Function vect_is_simple_use.
+
+ Return TRUE if OPERAND is either constant, invariant, or internal SSA_NAME
+ with simple DEF_STMT. If it is SSA_NAME, return the statement defining the
+ name in DEF_STMT. */
+
+bool
+vect_is_simple_use (tree operand, gimple **def_stmt)
+{
+ *def_stmt = NULL;
+
+ if (CONSTANT_CLASS_P (operand))
+ return true;
+ if (is_gimple_min_invariant (operand))
+ return true;
+ if (TREE_CODE (operand) != SSA_NAME)
+ return false;
+ if (SSA_NAME_IS_DEFAULT_DEF (operand))
+ return true;
+
+ *def_stmt = SSA_NAME_DEF_STMT (operand);
+
+ switch (gimple_code (*def_stmt))
+ {
+ case GIMPLE_PHI:
+ case GIMPLE_ASSIGN:
+ case GIMPLE_CALL:
+ break;
+ default:
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported defining stmt:\n");
+ return false;
+ }
+ return true;
+}
+
+/* Function normalize_base_addr.
+
+ This function normalizes the address used by the STMT within LOOP. The data
+ reference DR holds BASE_ADDRESS, INIT component of first element, and OFFSET
+ of current element. This function combines all the components of the address
+ and splits it into 2 components - BASE_ADDRESS and OFFSET which is < stride.
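+
+   For example (illustrative): for a data reference with BASE_ADDRESS &a,
+   INIT 6, OFFSET 0 and STEP 4, the combined address &a + 6 is split into
+   BASE_ADDRESS &a + 4 and OFFSET 2, so that OFFSET < STEP.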
+*/
+
+static tree
+normalize_base_addr (gimple *stmt, struct loop *loop, tree *offset)
+{
+ tree addr, var, constant, retval;
+
+ addr = size_binop (PLUS_EXPR,
+ fold_convert (ssizetype, DR_BASE_ADDRESS (STMT_ATTR_DR (stmt))),
+ size_binop (PLUS_EXPR,
+ fold_convert (ssizetype, DR_INIT (STMT_ATTR_DR (stmt))),
+ fold_convert (ssizetype, DR_OFFSET (STMT_ATTR_DR (stmt)))));
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "normalize_base_addr: ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM,
+ addr);
+ dump_printf (MSG_NOTE, "\n");
+ }
+
+ split_constant_offset (addr, &var, &constant);
+
+ if (tree_fits_shwi_p (constant)
+ && tree_fits_shwi_p (DR_STEP (STMT_ATTR_DR (stmt))))
+ {
+ int c, step;
+ c = tree_to_shwi (constant);
+ step = tree_to_shwi (DR_STEP (STMT_ATTR_DR (stmt)));
+
+ if (c >= step)
+ {
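+	  /* Fold whole multiples of STEP into the base and keep only the
+	     within-stride remainder as the offset.  */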
+ *offset = ssize_int (c % step);
+ retval = size_binop (PLUS_EXPR, var, ssize_int (c / step * step));
+ }
+ else
+ {
+ *offset = constant;
+ retval = var;
+ }
+
+ }
+ else
+ {
+ *offset = ssize_int (0);
+ retval = size_binop (PLUS_EXPR, var, constant);
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "\tbase: ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM,
+ retval);
+ dump_printf (MSG_NOTE, "\toffset: ");
+ dump_generic_expr (MSG_NOTE, TDF_SLIM,
+ *offset);
+ dump_printf (MSG_NOTE, "\n");
+ }
+ return retval;
+}
+
+/***** Helper functions for prim-tree creation *****/
+
+/* Function init_primop_node.
+
+ This function creates PRIMOP_TREE node and initializes all its fields to 0.
+*/
+
+struct primop_tree *
+init_primop_node (void)
+{
+ static int pid = 0;
+ struct primop_tree *ptree;
+ ptree = (struct primop_tree *) xcalloc (1, sizeof (struct primop_tree));
+
+ PT_PID (ptree) = pid++;
+ PT_NODE_OP (ptree) = 0;
+ PT_ARITY (ptree) = 0;
+ ptree->children = vNULL;
+ PT_PARENT (ptree) = NULL;
+ PT_ITER_COUNT (ptree) = NULL;
+ PT_VEC_SIZE (ptree) = 0;
+ PT_VEC_TYPE (ptree) = NULL;
+ PT_VEC_INST (ptree) = vNULL;
+ PT_TARGET_COST (ptree) = 0;
+ PT_NUM_INSTANCES (ptree) = 0;
+ PT_LOOP_DEPENDENCES (ptree) = vNULL;
+  PT_DEP (ptree) = vNULL;
+ PT_DEPTH (ptree) = 0;
+ PT_ATTR_NO (ptree) = 0;
+ PT_AUX (ptree) = NULL;
+ memset (&ptree->u, 0, sizeof (ptree->u));
+ return ptree;
+}
+
+/* Function populate_prim_node.
+
+ This function returns PRIMOP_TREE node initialized with given information.
+*/
+
+struct primop_tree *
+populate_prim_node (enum primop_code pcode, struct ITER_node *inode,
+ struct primop_tree *parent, gimple *stmt)
+{
+ struct primop_tree *ptree;
+ ptree = init_primop_node ();
+
+ PT_NODE_OP (ptree) = (int) pcode;
+ PT_PARENT (ptree) = parent;
+ PT_ITER_COUNT (ptree) = ITER_NODE_NITERS (inode);
+
+ if (stmt)
+ {
+ PT_VEC_TYPE (ptree) = STMT_ATTR_VECTYPE (stmt);
+ PT_ATTR_NO (ptree) = gimple_uid (stmt);
+ STMT_ATTR_TREE (stmt) = ptree;
+ }
+
+ return ptree;
+}
+
+/* Function exists_primTree_with_memref.
+
+   This function checks if a PRIMOP_TREE node corresponding to MEMREF already
+   exists in the ITER_node. If yes, return a pointer to that PRIMOP_TREE node;
+   else, return NULL. */
+
+struct primop_tree *
+exists_primTree_with_memref (tree base, tree step, bool is_read,
+ struct ITER_node *inode)
+{
+ vec<struct primop_tree *> worklist;
+ worklist = (ITER_NODE_LOOP_BODY (inode)).copy ();
+
+ while (worklist.length () > 0)
+ {
+ primop_tree *ptree = worklist.pop ();
+
+ if (!operand_equal_p (PT_MEMVAL_BASE (ptree), base, 0))
+ continue;
+
+ if (!operand_equal_p (PT_MEMVAL_MULT_IDX (ptree), step, 0))
+ continue;
+
+ if (is_read != PT_MEMVAL_IS_READ (ptree))
+ continue;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " exists_primTree_with_memref: TRUE\n");
+ }
+
+ return ptree;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " exists_primTree_with_memref: FALSE\n");
+ }
+
+ return NULL;
+}
+
+/* Create primtree of type mem_ref - it is only for load/store. */
+
+struct primop_tree *
+create_primTree_memref (tree base, tree step, bool is_read, int num,
+ struct ITER_node *inode, struct primop_tree *parent)
+{
+ struct primop_tree * ptree;
+
+ ptree = populate_prim_node (POP_MEMREF, inode, parent, NULL);
+
+ PT_MEMVAL_BASE (ptree) = unshare_expr (base);
+ PT_MEMVAL_MULT_IDX (ptree) = unshare_expr (step);
+ PT_MEMVAL_IS_READ (ptree) = is_read;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+		       " create_primTree_memref : stride - %d\n",
+		       (int) (tree_to_uhwi (step) / num));
+ }
+ return ptree;
+}
+
+/* Function create_primTree_combine.
+
+ Create primtree with PCODE as interleave or concat. STMT is statement for
+ which primtree is being created. */
+struct primop_tree *
+create_primTree_combine (enum primop_code pcode, gimple *stmt, int parts,
+ struct ITER_node *inode, struct primop_tree *parent)
+{
+ struct primop_tree * ptree;
+
+ ptree = populate_prim_node (pcode, inode, parent, stmt);
+ PT_OPERAND_SELECTOR (ptree) = -1;
+ PT_DIVISION (ptree) = parts;
+ PT_VAR_STRIDE (ptree) = NULL;
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " create_primTree_combine: parts - %d\n", parts);
+ }
+
+ return ptree;
+}
+
+/* Function create_primTree_partition.
+
+ Create primtree with PCODE as split or extract. STMT is statement for which
+ primtree is being created. PARTS is number of partitions to be created.
+ SELECTOR is the part being selected. */
+struct primop_tree *
+create_primTree_partition (enum primop_code pcode, gimple *stmt, int parts,
+ int selector, struct ITER_node *inode,
+ struct primop_tree *parent)
+{
+ struct primop_tree * ptree;
+
+ ptree = populate_prim_node (pcode, inode, parent, stmt);
+ PT_OPERAND_SELECTOR (ptree) = selector;
+ PT_DIVISION (ptree) = parts;
+ PT_VAR_STRIDE (ptree) = NULL;
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " create_primTree_partition : parts - %d selector - %d\n",
+ parts, selector);
+ }
+
+ return ptree;
+}
+
+/* Function add_child_at_index.
+
+ Attach PCHILD node as idx^th child of PNODE. */
+void
+add_child_at_index (struct primop_tree *ptree,
+ struct primop_tree *pchild, int idx)
+{
+ (PT_ARITY (ptree))++;
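+  /* Pad the child vector with NULL entries up to IDX.  */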
+  while ((unsigned) idx >= ptree->children.length ())
+ {
+ ptree->children.safe_push (NULL);
+ }
+ PT_CHILD (ptree, idx) = pchild;
+}
+
+/* Function get_child_at_index.
+
+ Get idx^th child of PNODE. */
+struct primop_tree *
+get_child_at_index (struct primop_tree *ptree, int idx)
+{
+ gcc_assert (idx < PT_ARITY (ptree));
+ return PT_CHILD (ptree, idx);
+}
+
+
+/* Function vectorizable_store.
+
+   This function checks if STMT is a STORE instruction that is vectorizable
+   for the target architecture. If TRUE, it computes the MEMREF node with
+   respect to the loop LOOP for which the vectorization is targeted.
+
+ TODO: Currently, we are not handling variable strides as vectorization is not
+ possible in all the cases with variable stride.
+
+ The variable step can be of 3 types:
+ 1. Monotonically dependent on index
+ 2. Non-monotonically dependent on index.
+ 3. Loop invariant step.
+ a. Case for classic vectorization
+ b. Case for SLP
+
+ 1. Monotonically dependent on index:
+ The address expression under consideration is base + step * index + offset.
+ However, even if the induction variable is linear in nature, if the step is
+ monotonically dependent on index - which is multiplied by induction variable,
+ the expression no longer remains linear, but becomes quadratic - because of
+ which loop unrolling cannot take place.
+
+ 2. Non-monotonically dependent on index:
+ Again, variable step can cause RAW or WAW conflicts if it is not monotonic
+ function of index, because of which optimization will be incorrect.
+ eg:
+ step = a[i];
+ c[step * i + d] = b[i]
+
+   If it is on the RHS, though there won't be conflicts, we cannot determine
+   at compile time which memory locations are accessed, because of which the
+   optimization is not possible.
+
+ 3. Loop invariant step:
+ a) Classic auto-vectorization for stride = j.
+ eg:
+ j = op (v1, v2)
+ for (i = 0; i < 2048; i++)
+ a[i] = b[i*j]
+ So, we should extract multiples of j from array b : for which instruction
+ generation is very difficult as we don't know what permute order to use.
+
+ If we have some instruction like (un)pack (vreg, reg) which (un)packs vector
+ elements from vec with index = multiples of reg to form new vec, we can
+ think of this optimization.
+
+ b) The only case in which variable step can work unconditionally is if the
+ variable is iteration invariant - for which SLP might work if SLP factor is
+ same as vector size.
+ eg:
+ For vector size = 4,
+ for loop body as follows:
+
+ a[4 * i] = b[var * i]
+ a[4 * i + 1] = b[var * i + 1]
+ a[4 * i + 2] = b[var * i + 2]
+ a[4 * i + 3] = b[var * i + 3]
+
+ above can be vectorized with vector load at b[var * i]. Conditions: a and b
+ are distinct.
+
+ Looked at paper "Automatic Vectorization of Interleaved Data Revisited" for
+ implementation of this part, however, I personally don't find it useful for
+ MIPS. */
+
+static struct primop_tree *
+vectorizable_store (gimple *stmt, struct ITER_node *inode,
+ struct primop_tree *parent)
+{
+ tree src_op, dest_op, base, offset, step;
+ enum tree_code code;
+ tree vec_type, scalar_type, rhs_vectype;
+ gimple *def_stmt;
+ machine_mode vec_mode;
+ struct loop *loop = ITER_NODE_LOOP (inode);
+ struct primop_tree *pnode, *pchild1, *pchild2;
+ int num;
+
+ if (STMT_ATTR_USE_TYPE (stmt) != stmt_use_type_intermediate)
+ return NULL;
+
+ if (!gimple_assign_single_p (stmt))
+ return NULL;
+
+ dest_op = gimple_assign_lhs (stmt);
+ code = TREE_CODE (dest_op);
+
+ if (code != ARRAY_REF
+ && code != BIT_FIELD_REF
+ && code != INDIRECT_REF
+ && code != COMPONENT_REF
+ && code != IMAGPART_EXPR
+ && code != REALPART_EXPR
+ && code != MEM_REF)
+ return NULL;
+
+ if (!STMT_ATTR_DR (stmt))
+ return NULL;
+
+ scalar_type = TREE_TYPE (DR_REF (STMT_ATTR_DR (stmt)));
+ vec_type = get_vectype_for_scalar_type (scalar_type);
+
+ src_op = gimple_assign_rhs1 (stmt);
+
+ if (!vect_is_simple_use (src_op, &def_stmt))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "use not simple.\n");
+ return NULL;
+ }
+
+ rhs_vectype = STMT_ATTR_VECTYPE (def_stmt);
+ if (rhs_vectype && !useless_type_conversion_p (vec_type, rhs_vectype))
+ return NULL;
+
+ vec_mode = TYPE_MODE (vec_type);
+ if (optab_handler (mov_optab, vec_mode) == CODE_FOR_nothing)
+ return NULL;
+
+ /* The dataref for store is available. Analyze it, and create memref for the
+ same. */
+
+ num = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (STMT_ATTR_DR (stmt)))));
+ base = normalize_base_addr (stmt, loop, &offset);
+ step = DR_STEP (STMT_ATTR_DR (stmt));
+
+ if (!tree_fits_uhwi_p (step) || !tree_fits_uhwi_p (offset))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Variable stride or offset.\n");
+ return NULL;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " vectorize_store: ptree creation\n ");
+ }
+
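+  /* If another store to the same base with the same stride has already
+     created a memref tree, reuse its interleave child; otherwise create
+     the memref node together with its interleave child.  */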
+ pnode = exists_primTree_with_memref (base, step, false, inode);
+ if (pnode == NULL)
+ {
+ pnode = create_primTree_memref (base, step, false, num, inode, NULL);
+ ITER_NODE_LOOP_BODY (inode).safe_push (pnode);
+ pchild1 = create_primTree_combine (POP_ILV, stmt,
+ tree_to_uhwi (step) / num, inode, parent);
+ add_child_at_index (pnode, pchild1, 0);
+ }
+ else
+ {
+ pchild1 = get_child_at_index (pnode, 0);
+ }
+ if (def_stmt)
+ {
+ pchild2 = analyze_and_create_ptree (pchild1, def_stmt, inode);
+ if (pchild2 == NULL)
+ return NULL;
+ }
+ else
+ {
+ /* RHS is not SSA name - it is either invariant or constant. Create
+ appropriate primtree node accordingly. */
+ /* TODO - Create POP_CONST or POP_INV node - which will create vector
+ using VEC_INIT at later stages. */
+ return NULL;
+ }
+
+ if (tree_to_uhwi (offset) / num >= PT_DIVISION (pchild1))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			   "Not vectorized: offset >= stride\n");
+ }
+ return NULL;
+ }
+
+ add_child_at_index (pchild1, pchild2, tree_to_uhwi (offset) / num);
+
+ return pnode;
+}
+
+/* Function vectorizable_load.
+
+   This function checks if STMT is a LOAD instruction that is vectorizable
+   for the target architecture. If TRUE, it computes the MEMREF node with
+   respect to the loop LOOP for which the vectorization is targeted. */
+
+static struct primop_tree *
+vectorizable_load (gimple *stmt, struct ITER_node *inode,
+ struct primop_tree *parent)
+{
+  tree src_op, step, base, offset;
+ enum tree_code code;
+ tree vec_type, scalar_type;
+ machine_mode vec_mode;
+ struct loop * loop = ITER_NODE_LOOP (inode);
+ struct primop_tree *pnode, *pchild1;
+ int num;
+
+ if (STMT_ATTR_USE_TYPE (stmt) != stmt_use_type_intermediate)
+ return NULL;
+
+ if (!gimple_assign_single_p (stmt))
+ return NULL;
+
+ src_op = gimple_assign_rhs1 (stmt);
+ code = TREE_CODE (src_op);
+
+ if (code != ARRAY_REF
+ && code != BIT_FIELD_REF
+ && code != INDIRECT_REF
+ && code != COMPONENT_REF
+ && code != IMAGPART_EXPR
+ && code != REALPART_EXPR
+ && code != MEM_REF)
+ return NULL;
+
+ if (!STMT_ATTR_DR (stmt))
+ return NULL;
+
+ scalar_type = TREE_TYPE (DR_REF (STMT_ATTR_DR (stmt)));
+ vec_type = get_vectype_for_scalar_type (scalar_type);
+
+ vec_mode = TYPE_MODE (vec_type);
+ if (optab_handler (mov_optab, vec_mode) == CODE_FOR_nothing)
+ return NULL;
+
+ /* The dataref for load is available. Analyze it, and create memref for the
+ same. */
+
+ num = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (STMT_ATTR_DR (stmt)))));
+ base = normalize_base_addr (stmt, loop, &offset);
+ step = DR_STEP (STMT_ATTR_DR (stmt));
+
+ if (!tree_fits_uhwi_p (step) || !tree_fits_uhwi_p (offset))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Variable stride or offset.\n");
+ return NULL;
+ }
+
+ gcc_assert (parent);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " vectorize_load: ptree creation\n ");
+ }
+
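+  /* A strided load becomes an EXTRACT of lane OFFSET/NUM out of STEP/NUM
+     parts over the underlying memref.  */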
+ pnode = create_primTree_partition (POP_EXTR, stmt,
+ tree_to_uhwi (step) / num,
+ tree_to_uhwi (offset) / num, inode, parent);
+ pchild1 = create_primTree_memref (base, step, true, num, inode, pnode);
+ add_child_at_index (pnode, pchild1, 0);
+ return pnode;
+}
+
+/* FUNCTION analyze_and_create_ptree ().
+
+ - If the result is storing in memory, try accessing the address - extract
+ <base, idx, mult_idx, offset> tuple from gimple chains
+
+ *_4 = op (v1, v2, ...)
+
+ - If <offset> >= <mult_idx> : Cannot vectorize currently. (Optimization
+ TODO: create new induction variable ind_var and adjust the offset to
+ <offset> % <mult_idx>, and ind_var = idx + <offset>/<mult_idx>).
+ - Else
+ 1. Create root node with mem_ref <base,idx> (if not existing already.
+ If exists, goto step 3.)
+ 2. Attach child - ILV node with arity = <mult_idx> to root node.
+ 3. Create a child to ILV at index <offset>:
+ - analyze_and_create_ptree (op (v1, v2, ...)).
+
+ - If LHS is SSA_NAME, look at the definition of the SSA_NAME.
+ - If RHS of DEF is op (v1, v2, ...) and <op> is vectorizable, create
+ c-node with arity = arity (op), and attach the children:
+ - analyze_and_create_ptree (v1)
+ - analyze_and_create_ptree (v2)
+ - :
+ - :
+
+ - If RHS is accessing memory - try accessing address - create mem_ref node
+ as above.
+ 1. Create EXTR node with arity = <mult_idx> and extr_idx = <offset>
+ 2. Create a child node with mem_ref <base, idx>
+
+ - If result is outgoing PHI node, there should be single use node
+ - Access definition of the use node.
+ - If op on LHS of definition is collapsible, generate a
+ COLPS <op, PHI name> in epilogue.
+ - Create root node with memref<PHI name, loop ind_var>
+ - Create ITER node with single child - analyze_and_create (<second_opd>)
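+
+   For example (illustrative), a loop body
+
+       a[2*i]     = b[4*i];
+       a[2*i + 1] = b[4*i + 2];
+
+   would produce a ptree roughly of the shape
+
+       MEMREF <a>
+         ILV <2>
+           EXTR <4, 0> over MEMREF <b>
+           EXTR <4, 2> over MEMREF <b>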
+*/
+
+struct primop_tree *
+analyze_and_create_ptree (struct primop_tree *parent, gimple *stmt,
+ struct ITER_node *inode)
+{
+ struct primop_tree *ptree;
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " analyze_and_create_ptree: ");
+ dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+ }
+
+  /* If the statement is not within the loop, a VEC_INIT node is needed;
+     loop invariants are not handled yet, so give up.  */
+
+ if (!flow_bb_inside_loop_p (ITER_NODE_LOOP (inode), gimple_bb (stmt)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: Loop invariant not handled.\n");
+ }
+ return NULL;
+ }
+
+  if ((ptree = vectorizable_store (stmt, inode, parent)) == NULL
+ && (ptree = vectorizable_load (stmt, inode, parent)) == NULL
+ /* && (ptree = vectorizable_assign (stmt, inode, parent)) == NULL
+ && (ptree = vectorizable_reduction (stmt, inode, parent)) == NULL
+ && (ptree = vectorizable_arith (stmt, inode, parent)) == NULL */
+ )
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: Target does not support instruction.\n");
+ }
+ return ptree;
+}
+
+/* Function is_ptree_complete.
+
+   The function checks if the generated ptree covers all memory writes, i.e.,
+   every lane of each interleave node feeding a memory write is defined.  */
+
+static bool
+is_ptree_complete (struct ITER_node *inode)
+{
+ vec<struct primop_tree *> worklist;
+ vec<struct primop_tree *> chlist;
+
+ worklist = (ITER_NODE_LOOP_BODY (inode)).copy ();
+
+ while (worklist.length () > 0)
+ {
+ primop_tree *ptree = worklist.pop ();
+ if (PT_ARITY (ptree) == 0)
+ continue;
+
+ gcc_assert (ptree->children.length () == 1);
+
+ ptree = get_child_at_index (ptree, 0);
+ gcc_assert (PT_NODE_OP (ptree) == POP_ILV);
+ if (PT_ARITY (ptree) > ptree->children.length ())
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: Multiple writes to same index.\n");
+ }
+ return false;
+ }
+
+ chlist = ptree->children.copy ();
+
+ while (chlist.length () > 0)
+ {
+ if (chlist.pop () == NULL)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Not vectorized: Void in memory writes.\n");
+ }
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/* Function create_ptree.
+
+ This function is invoked if all the data reference related checks are
+ successful and autovectorization is possible. This function identifies
+ statements within the LOOP which have side-effects, as PROBABLE_ROOT_NODES.
+ It then classifies all the STMTs within the LOOP. Once successful, it
+   invokes analyze_and_create_ptree, which actually creates the permute order
+   tree.  */
+
+static bool
+create_ptree (struct ITER_node *inode)
+{
+ auto_vec<gimple *, 64> worklist;
+ bool is_ok;
+
+ mark_probable_root_nodes (inode, &worklist);
+
+  if (worklist.length () == 0)
+    {
+      if (dump_enabled_p ())
+	dump_printf (MSG_MISSED_OPTIMIZATION,
+		     "Not vectorizable: no probable root nodes.\n");
+      return false;
+    }
+
+ is_ok = classify_loop_stmts (inode, &worklist);
+  if (! is_ok)
+    {
+      if (dump_enabled_p ())
+	dump_printf (MSG_MISSED_OPTIMIZATION,
+		     "Not vectorizable: Classification failed.\n");
+      return false;
+    }
+
+ while (worklist.length () > 0)
+ {
+ is_ok = (analyze_and_create_ptree (NULL, worklist.pop (), inode) != NULL);
+      if (! is_ok)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_MISSED_OPTIMIZATION,
+			 "Not vectorized: p-tree creation failed.\n");
+	  return false;
+	}
+ }
+
+ ITER_NODE_NITERS (inode) = integer_one_node;
+
+ if (is_ptree_complete (inode))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf (MSG_NOTE,
+ "Vectorized: ptree complete.\n");
+ }
+ return true;
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf (MSG_MISSED_OPTIMIZATION,
+ "Not vectorized: ptree incomplete.\n");
+ }
+ return false;
+ }
+}
+
+/* Function vect_analyze_loop_with_prim_tree_2.
+
+ Perform various analysis on the loop LOOP, and record the information in
+ ITER_node structure. */
+
+static bool
+vect_analyze_loop_with_prim_tree_2 (struct ITER_node *inode)
+{
+ bool ok;
+ int max_vf = MAX_VECTORIZATION_FACTOR;
+ int min_vf = 2;
+ unsigned int n_stmts = 0;
+ auto_vec<loop_p, 64> loop_nest;
+
+ /* Find all data references in the loop (which correspond to vdefs/vuses)
+ and analyze their evolution in the loop. */
+
+ struct loop *loop = ITER_NODE_LOOP (inode);
+ basic_block *bbs = get_loop_body (loop);
+ if (!find_loop_nest (loop, &loop_nest))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: loop contains function calls"
+ " or data references that cannot be analyzed\n");
+ return false;
+ }
+
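+ /* Walk all statements, counting them and collecting data references;
+ calls in safelen loops with usable SIMD clones and no data references
+ of their own are tolerated. */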
+ for (unsigned i = 0; i < loop->num_nodes; i++)
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (is_gimple_debug (stmt))
+ continue;
+ ++n_stmts;
+ if (!find_data_references_in_stmt (loop, stmt,
+ &ITER_NODE_DATA_REFS (inode)))
+ {
+ if (is_gimple_call (stmt) && loop->safelen)
+ {
+ tree fndecl = gimple_call_fndecl (stmt), op;
+ if (fndecl != NULL_TREE)
+ {
+ cgraph_node *node = cgraph_node::get (fndecl);
+ if (node != NULL && node->simd_clones != NULL)
+ {
+ unsigned int j, n = gimple_call_num_args (stmt);
+ for (j = 0; j < n; j++)
+ {
+ op = gimple_call_arg (stmt, j);
+ if (DECL_P (op)
+ || (REFERENCE_CLASS_P (op)
+ && get_base_address (op)))
+ break;
+ }
+ op = gimple_call_lhs (stmt);
+ /* Ignore #pragma omp declare simd functions
+ if they don't have data references in the
+ call stmt itself. */
+ if (j == n
+ && !(op
+ && (DECL_P (op)
+ || (REFERENCE_CLASS_P (op)
+ && get_base_address (op)))))
+ continue;
+ }
+ }
+ }
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: loop contains function "
+ "calls or data references that cannot "
+ "be analyzed\n");
+ return false;
+ }
+ }
+
+ if (!vect_analyze_datarefs (inode))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad data references.\n");
+ return false;
+ }
+
+ ok = vect_analyze_dataref_accesses (inode);
+ if (!ok)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad data access.\n");
+ return false;
+ }
+
+ ok = vect_analyze_dataref_dependences (inode, loop_nest);
+ if (!ok
+ || max_vf < min_vf)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad data dependence.\n");
+ return false;
+ }
+
+ return create_ptree (inode);
+}
+
+/* Function pretty_print_gimple_vec.
+
+ Function to pretty print all the gimple statements in LIST. */
+
+void
+pretty_print_gimple_vec (pretty_printer *pp, vec<gimple *> list)
+{
+ vec<gimple *> worklist;
+
+ worklist = list.copy ();
+
+ while (worklist.length () > 0)
+ {
+ pp_newline_and_indent (pp, 2);
+ pp_gimple_stmt_1 (pp, worklist.pop (), 2, TDF_SLIM);
+ pp_printf (pp, ";");
+ }
+ worklist.release ();
+}
+
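+/* Printable names for primop codes: the regular tree code names followed
+ by the unified permute operations, so that PT_NODE_OP values index
+ directly into this table. */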
+#define DEFTREECODE(SYM, NAME, TYPE, LEN) NAME,
+#define END_OF_BASE_TREE_CODES "@dummy",
+
+static const char *const tree_code_name[] = {
+#include "all-tree.def"
+"ILV",
+"CONCAT",
+"EXTR",
+"SPLT",
+"COLLAPSE",
+"MEMREF",
+"CONST",
+"INVAR",
+"ITER"
+};
+
+#undef DEFTREECODE
+#undef END_OF_BASE_TREE_CODES
+
+/* Function pp_primop_tree.
+
+ Function to pretty print primop_tree PTREE. */
+
+void
+pp_primop_tree (pretty_printer *pp, struct primop_tree * ptree)
+{
+ pp_newline_and_indent (pp, 0);
+ pp_indent (pp);
+ pp_printf (pp,"node [shape=record];");
+ pp_newline_and_indent (pp, 0);
+ pp_indent (pp);
+ pp_printf (pp, "%d [label=\"{", PT_PID (ptree));
+ dump_generic_node (pp, PT_ITER_COUNT (ptree), 0, TDF_SLIM, false);
+ pp_printf (pp, "}|{%s", tree_code_name[PT_NODE_OP (ptree)]);
+
+ switch (PT_NODE_OP (ptree))
+ {
+ case POP_ILV:
+ case POP_CONCAT:
+ pp_printf (pp, "|%d", PT_DIVISION (ptree));
+ break;
+ case POP_EXTR:
+ case POP_SPLT:
+ pp_printf (pp, "|div:%d", PT_DIVISION (ptree));
+ pp_printf (pp, "|sel:%d", PT_OPERAND_SELECTOR (ptree));
+ break;
+ case POP_COLLAPSE:
+ break;
+ case POP_MEMREF:
+ pp_printf (pp, "|");
+ dump_generic_node (pp, PT_MEMVAL_BASE (ptree), 0, TDF_SLIM, false);
+ pp_printf (pp, "|stride:");
+ dump_generic_node (pp, PT_MEMVAL_MULT_IDX (ptree), 0, TDF_SLIM, false);
+ break;
+ case POP_CONST:
+ break;
+ case POP_INV:
+ break;
+ case POP_ITER:
+ break;
+ default:
+ break;
+ }
+
+ pp_printf (pp, "}|{%d}\"];", PT_ARITY (ptree));
+
+ if (PT_NODE_OP (ptree) == POP_ITER)
+ {
+ pretty_print_iter_node (pp, PT_INODE (ptree), 2);
+ return;
+ }
+
+ if (PT_ARITY (ptree) != 0)
+ {
+ pretty_print_ptree_vec (pp, ptree->children);
+
+ vec<struct primop_tree *> worklist;
+
+ worklist = ptree->children.copy ();
+
+ while (worklist.length () > 0)
+ {
+ struct primop_tree *child = worklist.pop ();
+ pp_newline_and_indent (pp, 0);
+ pp_indent (pp);
+ pp_printf (pp, "%d -> %d;", PT_PID (ptree), PT_PID (child));
+ }
+ worklist.release ();
+ }
+}
+
+/* Function pretty_print_ptree_vec.
+
+ Function to pretty print the list LIST of primop nodes. */
+
+void
+pretty_print_ptree_vec (pretty_printer *pp, vec<struct primop_tree *> list)
+{
+ vec<struct primop_tree *> worklist;
+
+ worklist = list.copy ();
+
+ while (worklist.length () > 0)
+ {
+ pp_newline_and_indent (pp, 0);
+ pp_primop_tree (pp, worklist.pop ());
+ }
+ worklist.release ();
+}
+
+/* Function pretty_print_iter_node.
+
+ Pretty print ITER_node INODE in .dot format, indented by DEPTH. */
+
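+/* For reference, the emitted structure is roughly:
+
+ subgraph cluster_iter_node_N {
+ label="LOOP #N. NUM_ITER:<niters>";
+ subgraph cluster_iter_node_N_pro { label="PROLOGUE"; ... }
+ subgraph cluster_iter_node_N_epi { label="EPILOGUE"; ... }
+ subgraph cluster_iter_node_N_body { label="LOOP\nBODY"; ... }
+ } */
+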
+void
+pretty_print_iter_node (pretty_printer *pp, struct ITER_node *inode, int depth)
+{
+ pp_indentation (pp) += depth;
+ pp_indent (pp);
+ pp_printf (pp, "subgraph cluster_iter_node_%d {\n",
+ ITER_NODE_LOOP (inode)->num);
+ pp_indentation (pp) += 2;
+ pp_indent (pp);
+ pp_printf (pp, "label=\"LOOP #%d. NUM_ITER:", ITER_NODE_LOOP (inode)->num);
+ dump_generic_node (pp, ITER_NODE_NITERS (inode), 0, TDF_SLIM, false);
+ pp_printf (pp, "\";\n");
+
+ pp_indent (pp);
+ pp_printf (pp, "subgraph cluster_iter_node_%d_pro {\n",
+ ITER_NODE_LOOP (inode)->num);
+ pp_indentation (pp) += 2;
+ pp_indent (pp);
+ pp_printf (pp, "label=\"PROLOGUE\";\n");
+ pretty_print_gimple_vec (pp, ITER_NODE_PROLOGUE (inode));
+ pp_indentation (pp) -= 2;
+ pp_indent (pp);
+ pp_printf (pp, "}\n");
+
+ pp_printf (pp, "subgraph cluster_iter_node_%d_epi {\n",
+ ITER_NODE_LOOP (inode)->num);
+ pp_indentation (pp) += 2;
+ pp_indent (pp);
+ pp_printf (pp, "label=\"EPILOGUE\";\n");
+ pretty_print_gimple_vec (pp, ITER_NODE_EPILOGUE (inode));
+ pp_indentation (pp) -= 2;
+ pp_indent (pp);
+ pp_printf (pp, "}\n");
+
+ pp_indent (pp);
+ pp_printf (pp, "subgraph cluster_iter_node_%d_body {\n",
+ ITER_NODE_LOOP (inode)->num);
+ pp_indentation (pp) += 2;
+ pp_indent (pp);
+ pp_printf (pp, "label=\"LOOP\\nBODY\";\n");
+ pretty_print_ptree_vec (pp, ITER_NODE_LOOP_BODY (inode));
+ pp_indentation (pp) -= 2;
+ pp_indent (pp);
+ pp_printf (pp, "}\n");
+ pp_indentation (pp) -= 2;
+ pp_indent (pp);
+ pp_printf (pp, "}\n");
+}
+
+/* Function dump_iter_node.
+
+ Dump the permute order trees represented by INODE in .dot format. */
+
+void
+dump_iter_node (struct ITER_node *inode, FILE *fp)
+{
+ pretty_printer pp;
+
+ pp.buffer->stream = fp;
+ pp_printf (&pp, "digraph uniopDG {\n");
+ pp_printf (&pp, " " "color=blue;" "\n");
+ pp_printf (&pp, " " "style=bold;" "\n");
+ pp_printf (&pp, " " "compound=true;" "\n");
+
+ pretty_print_iter_node (&pp, inode, 4);
+
+ pp_printf (&pp, "}\n");
+ pp_flush (&pp);
+}
+
+/* Function vectorize_loops_using_uniop.
+
+ Entry point to autovectorization using the unified representation.
+ For each loop:
+ - analyze the loop, and create a p-tree if the loop is vectorizable;
+ - generate vectorized code for the corresponding p-tree. */
+
+unsigned
+vectorize_loops_using_uniop (void)
+{
+ unsigned int vect_loops_num;
+ struct loop *loop;
+ bool any_ifcvt_loops = false;
+ unsigned int num_vectorized_loops = 0;
+ unsigned int ret = 0;
+ unsigned int i;
+ vect_loops_num = number_of_loops (cfun);
+
+ /* Bail out if there are no loops. */
+ if (vect_loops_num <= 1)
+ return 0;
+
+ if (cfun->has_simduid_loops)
+ return 0;
+
+ init_stmt_attr_vec ();
+
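+ /* First pass: analyze each candidate loop and build its p-tree. */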
+ FOR_EACH_LOOP (loop, 0)
+ if (loop->dont_vectorize)
+ any_ifcvt_loops = true;
+ else if (loop->simduid)
+ continue;
+ else if ((flag_tree_loop_vectorize
+ && optimize_loop_nest_for_speed_p (loop))
+ || loop->force_vectorize)
+ {
+ /* Vectorization may be possible. Check whether all statements are
+ vectorizable, and if so, create the p-tree. */
+ struct ITER_node * tmp_iter_node;
+
+ vect_location = find_loop_location (loop);
+ if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
+ && dump_enabled_p ())
+ dump_printf (MSG_NOTE, "\nAnalyzing loop at %s:%d\n",
+ LOCATION_FILE (vect_location),
+ LOCATION_LINE (vect_location));
+
+ tmp_iter_node = vect_populate_iter_node_from_loop (loop);
+
+ if (!tmp_iter_node)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad loop form.\n");
+ continue;
+ }
+
+ if (!vect_analyze_loop_with_prim_tree_2 (tmp_iter_node))
+ {
+ destroy_iter_node_info (tmp_iter_node);
+ loop->aux = NULL;
+ continue;
+ }
+
+ loop->aux = tmp_iter_node;
+
+ if (!dbg_cnt (vect_loop))
+ {
+ any_ifcvt_loops = true;
+ break;
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf (MSG_NOTE, "\nLoop is vectorizable.\n");
+ if (dump_file)
+ dump_iter_node (tmp_iter_node, dump_file);
+ if (alt_dump_file)
+ dump_iter_node (tmp_iter_node, alt_dump_file);
+ }
+
+ gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
+ /* If the loop is vectorized, set uid of stmts within scalar loop to
+ 0. This change is needed if transform phase uses this loop info. */
+ /*if (loop_vectorized_call)
+ set_uid_loop_bbs (loop_vinfo, loop_vectorized_call);*/
+
+ /* TODO: Insert call to transformation entry point. */
+
+ num_vectorized_loops++;
+ /* Now that the loop has been vectorized, allow it to be unrolled
+ etc. */
+ loop->force_vectorize = false;
+ if (loop_vectorized_call)
+ {
+ fold_loop_vectorized_call (loop_vectorized_call, boolean_true_node);
+ ret |= TODO_cleanup_cfg;
+ }
+ }
+
+ vect_location = UNKNOWN_LOCATION;
+
+ statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vectorized %u loops in function.\n",
+ num_vectorized_loops);
+
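+ /* For loops marked dont_vectorize by if-conversion, fold their
+ LOOP_VECTORIZED calls to false so only the scalar version is kept. */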
+ if (any_ifcvt_loops)
+ for (i = 1; i < vect_loops_num; i++)
+ {
+ loop = get_loop (cfun, i);
+ if (loop && loop->dont_vectorize)
+ {
+ gimple *g = vect_loop_vectorized_call (loop);
+ if (g)
+ {
+ fold_loop_vectorized_call (g, boolean_false_node);
+ ret |= TODO_cleanup_cfg;
+ }
+ }
+ }
+
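+ /* Release the ITER_node info attached to each loop during analysis. */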
+ for (i = 1; i < vect_loops_num; i++)
+ {
+ struct ITER_node *inode;
+
+ loop = get_loop (cfun, i);
+ if (!loop)
+ continue;
+ inode = (struct ITER_node *) loop->aux;
+ if (inode)
+ destroy_iter_node_info (inode);
+ loop->aux = NULL;
+ }
+
+ free_stmt_attr_vec ();
+
+ if (num_vectorized_loops > 0)
+ {
+ /* If we vectorized any loop only virtual SSA form needs to be updated.
+ ??? Also while we try hard to update loop-closed SSA form we fail
+ to properly do this in some corner-cases (see PR56286). */
+ rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
+ return TODO_cleanup_cfg;
+ }
+
+ return ret;
+}
+#endif