]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Avoid splitting store dataref groups during SLP discovery
authorRichard Biener <rguenther@suse.de>
Fri, 29 Sep 2023 11:13:16 +0000 (13:13 +0200)
committerRichard Biener <rguenther@suse.de>
Fri, 24 May 2024 05:48:41 +0000 (07:48 +0200)
The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating VEC_PERM
SLP nodes merging them so the store will always cover the whole group.

With this for example

int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
  for (int i = 0; i < 256; i++)
    {
      x[4*i+0] = y[2*i+0];
      x[4*i+1] = y[2*i+1];
      x[4*i+2] = z[i];
      x[4*i+3] = w[i];
    }
}

which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark).  We of course need a VF of four here.

.L2:
        movdqa  z(%rax), %xmm0
        movdqa  w(%rax), %xmm4
        movdqa  y(%rax,%rax), %xmm2
        movdqa  y+16(%rax,%rax), %xmm1
        movdqa  %xmm0, %xmm3
        punpckhdq       %xmm4, %xmm0
        punpckldq       %xmm4, %xmm3
        movdqa  %xmm2, %xmm4
        shufps  $238, %xmm3, %xmm2
        movaps  %xmm2, x+16(,%rax,4)
        movdqa  %xmm1, %xmm2
        shufps  $68, %xmm3, %xmm4
        shufps  $68, %xmm0, %xmm2
        movaps  %xmm4, x(,%rax,4)
        shufps  $238, %xmm0, %xmm1
        movaps  %xmm2, x+32(,%rax,4)
        movaps  %xmm1, x+48(,%rax,4)
        addq    $16, %rax
        cmpq    $1024, %rax
        jne     .L2

The extra permute nodes merging distinct branches of the SLP
tree might be unexpected for some code, esp. since
SLP_TREE_REPRESENTATIVE cannot be meaningfully set and we
cannot populate SLP_TREE_SCALAR_STMTS or SLP_TREE_SCALAR_OPS
consistently as we can have a mix of both.

The patch keeps the sub-trees form consecutive lanes but that's
in principle not necessary if we for example have an even/odd
split which now would result in N single-lane sub-trees.  That's
left for future improvements.

The interesting part is how VLA vector ISAs handle merging of
two vectors that's not trivial even/odd merging.  The strathegy
of how to build the permute tree might need adjustments for that
(in the end splitting each branch to single lanes and then doing
even/odd merging would be the brute-force fallback).  Not sure
how much we can or should rely on the SLP optimize pass to handle
this.

The gcc.dg/vect/slp-12a.c case is interesting as we currently split
the 8 store group into lanes 0-5 which we SLP with an unroll factor
of two (on x86-64 with SSE) and the remaining two lanes are using
interleaving vectorization with a final unroll factor of four.  Thus
we're using hybrid SLP within a single store group.  After the change
we discover the same 0-5 lane SLP part as well as two single-lane
parts feeding the full store group.  But that results in a load
permutation that isn't supported (I have WIP patchs to rectify that).
So we end up cancelling SLP and vectorizing the whole loop with
interleaving which is IMO good and results in better code.

This is similar for gcc.target/i386/pr52252-atom.c where interleaving
generates much better code than hybrid SLP.  I'm unsure how to update
the testcase though.

gcc.dg/vect/slp-21.c runs into similar situations.  Note that when
when analyzing SLP operations we discard an instance we currently
force the full loop to have no SLP because hybrid detection is
broken.  It's probably not worth fixing this at this moment.

For gcc.dg/vect/pr97428.c we are not splitting the 16 store group
into two but merge the two 8 lane loads into one before doing the
store and thus have only a single SLP instance.  A similar situation
happens in gcc.dg/vect/slp-11c.c but the branches feeding the
single SLP store only have a single lane.  Likewise for
gcc.dg/vect/vect-complex-5.c and gcc.dg/vect/vect-gather-2.c.

gcc.dg/vect/slp-cond-1.c has an additional SLP vectorization
with a SLP store group of size two but two single-lane branches.

* tree-vect-slp.cc (vect_build_slp_instance): Do not split
store dataref groups on loop SLP discovery failure but create
a single SLP instance for the stores but branch to SLP sub-trees
and merge with a series of VEC_PERM nodes.

* gcc.dg/vect/pr97428.c: Expect a single store SLP group.
* gcc.dg/vect/slp-11c.c: Likewise, if !vect_load_lanes.
* gcc.dg/vect/vect-complex-5.c: Likewise.
* gcc.dg/vect/slp-12a.c: Do not expect SLP.
* gcc.dg/vect/slp-21.c: Remove not important scanning for SLP.
* gcc.dg/vect/slp-cond-1.c: Expect one more SLP if !vect_load_lanes.
* gcc.dg/vect/vect-gather-2.c: Expect SLP to be used.
* gcc.target/i386/pr52252-atom.c: XFAIL test for palignr.

gcc/testsuite/gcc.dg/vect/pr97428.c
gcc/testsuite/gcc.dg/vect/slp-11c.c
gcc/testsuite/gcc.dg/vect/slp-12a.c
gcc/testsuite/gcc.dg/vect/slp-21.c
gcc/testsuite/gcc.dg/vect/slp-cond-1.c
gcc/testsuite/gcc.dg/vect/vect-complex-5.c
gcc/testsuite/gcc.dg/vect/vect-gather-2.c
gcc/testsuite/gcc.target/i386/pr52252-atom.c
gcc/tree-vect-slp.cc

index 60dd984cfd365b17d83da4c81f5aa38b1d0e3ca1..3cc9976c00cd5a8a9860664c400f4ceb8c1fd113 100644 (file)
@@ -44,5 +44,5 @@ void foo_i2(dcmlx4_t dst[], const dcmlx_t src[], int n)
 /* { dg-final { scan-tree-dump "Detected interleaving store of size 16" "vect" } } */
 /* We're not able to peel & apply re-aligning to make accesses well-aligned for !vect_hw_misalign,
    but we could by peeling the stores for alignment and applying re-aligning loads.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { ! vect_hw_misalign } } } } */
 /* { dg-final { scan-tree-dump-not "gap of 6 elements" "vect" } } */
index 0f680cd4e60c41624992e4fb68d2c3664ff1722e..2e70fca39ba1bc0a9b1a15d67612a1dd1c88374a 100644 (file)
@@ -13,7 +13,8 @@ main1 ()
   unsigned int in[N*8] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
   float out[N*8];
 
-  /* Different operations - not SLPable.  */
+  /* Different operations - we SLP the store and split the group to two
+     single-lane branches.  */
   for (i = 0; i < N*4; i++)
     {
       out[i*2] = ((float) in[i*2] * 2 + 6) ;
@@ -44,4 +45,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0  "vect"  } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } } } } */
index 973de6ada21467a4b13f39c2dfd98fd5ce4a9603..2f98dc9da0b18044721ea0232ec198980503b53f 100644 (file)
@@ -40,6 +40,10 @@ main1 ()
       out[i*8 + 3] = b3 - 1;
       out[i*8 + 4] = b4 - 8;
       out[i*8 + 5] = b5 - 7;
+      /* Due to the use in the ia[i] store we keep the feeding expression
+         in the form ((in[i*8 + 6] + 11) * 3 - 3) while other expressions
+        got associated as for example (in[i*5 + 5] * 4 + 33).  That
+        causes SLP discovery to fail.  */
       out[i*8 + 6] = b6 - 3;
       out[i*8 + 7] = b7 - 7;
 
@@ -76,5 +80,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
index 5875168841428a5722630c6ae053d3c60d5772b3..0528a144e579e23cf94c1445f95ec0cc4b743753 100644 (file)
@@ -12,6 +12,7 @@ main1 ()
   unsigned short out[N*8], out2[N*8], b0, b1, b2, b3, b4, a0, a1, a2, a3, b5;
   unsigned short in[N*8];
 
+#pragma GCC novector
   for (i = 0; i < N*8; i++)
     {
       in[i] = i;
@@ -202,18 +203,5 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect"  { target { vect_strided4 || vect_extract_even_odd } } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target  { ! { vect_strided4 || vect_extract_even_odd } } } } } */
-/* Some targets can vectorize the second of the three main loops using
-   hybrid SLP.  For 128-bit vectors, the required 4->3 permutations are:
-
-   { 0, 1, 2, 4, 5, 6, 8, 9 }
-   { 2, 4, 5, 6, 8, 9, 10, 12 }
-   { 5, 6, 8, 9, 10, 12, 13, 14 }
-
-   Not all vect_perm targets support that, and it's a bit too specific to have
-   its own effective-target selector, so we just test targets directly.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { powerpc64*-*-* s390*-*-* loongarch*-*-* } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_strided4 && { ! { powerpc64*-*-* s390*-*-* loongarch*-*-* } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  { target { ! { vect_strided4 } } } } } */
-  
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  { target { vect_strided4 || vect_extract_even_odd } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  { target  { ! { vect_strided4 || vect_extract_even_odd } } } } } */
index 450c7141c96b07b9f798c62950d3de30eeab9a28..c76ea5d17efa938d61a2b967e1392cd9860b1bbe 100644 (file)
@@ -125,4 +125,5 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { ! vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target { vect_load_lanes } } } } */
index addcf60438cd85a311532be3aa24915e25302ddd..ac562dc475c7bf73cf49a662fa406d51b6bddf69 100644 (file)
@@ -41,4 +41,4 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
index 4c23b80833326a7d5242299ec32e959aa8095c04..10e64e64d47511825def499faf6523d0b8dde61a 100644 (file)
@@ -36,6 +36,5 @@ f3 (int *restrict y, int *restrict x, int *restrict indices)
     }
 }
 
-/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */
 /* { dg-final { scan-tree-dump "different gather base" vect { target { ! vect_gather_load_ifn } } } } */
 /* { dg-final { scan-tree-dump "different gather scale" vect { target { ! vect_gather_load_ifn } } } } */
index 11f94411575784c7021f20cbd47fdd67c4bb6fe0..02736d56d3106fb9c73ae52628b239af499d8ac7 100644 (file)
@@ -25,4 +25,5 @@ matrix_mul (byte *in, byte *out, int size)
     }
 }
 
-/* { dg-final { scan-assembler "palignr" } } */
+/* We are no longer using hybrid SLP.  */
+/* { dg-final { scan-assembler "palignr" { xfail *-*-* } } } */
index 3f8209b43a756645d7e7a9825e6b11e045c4d802..c7ed520b62942cc321cb3792d9efa0f6d0a78ed7 100644 (file)
@@ -3468,12 +3468,7 @@ vect_build_slp_instance (vec_info *vinfo,
          return true;
        }
     }
-  else
-    {
-      /* Failed to SLP.  */
-      /* Free the allocated memory.  */
-      scalar_stmts.release ();
-    }
+  /* Failed to SLP.  */
 
   stmt_vec_info stmt_info = stmt_info_;
   /* Try to break the group up into pieces.  */
@@ -3491,6 +3486,9 @@ vect_build_slp_instance (vec_info *vinfo,
       if (is_a <bb_vec_info> (vinfo)
          && (i > 1 && i < group_size))
        {
+         /* Free the allocated memory.  */
+         scalar_stmts.release ();
+
          tree scalar_type
            = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
          tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3535,38 +3533,236 @@ vect_build_slp_instance (vec_info *vinfo,
            }
        }
 
-      /* For loop vectorization split into arbitrary pieces of size > 1.  */
-      if (is_a <loop_vec_info> (vinfo)
-         && (i > 1 && i < group_size)
-         && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
+      /* For loop vectorization split the RHS into arbitrary pieces of
+        size >= 1.  */
+      else if (is_a <loop_vec_info> (vinfo)
+              && (i > 0 && i < group_size)
+              && !vect_slp_prefer_store_lanes_p (vinfo,
+                                                 stmt_info, group_size, i))
        {
-         unsigned group1_size = i;
-
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Splitting SLP group at stmt %u\n", i);
 
-         stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
-                                                          group1_size);
-         /* Loop vectorization cannot handle gaps in stores, make sure
-            the split group appears as strided.  */
-         STMT_VINFO_STRIDED_P (rest) = 1;
-         DR_GROUP_GAP (rest) = 0;
-         STMT_VINFO_STRIDED_P (stmt_info) = 1;
-         DR_GROUP_GAP (stmt_info) = 0;
+         /* Analyze the stored values and pinch them together with
+            a permute node so we can preserve the whole store group.  */
+         auto_vec<slp_tree> rhs_nodes;
+
+         /* Calculate the unrolling factor based on the smallest type.  */
+         poly_uint64 unrolling_factor = 1;
+
+         unsigned int start = 0, end = i;
+         while (start < group_size)
+           {
+             gcc_assert (end - start >= 1);
+             vec<stmt_vec_info> substmts;
+             substmts.create (end - start);
+             for (unsigned j = start; j < end; ++j)
+               substmts.quick_push (scalar_stmts[j]);
+             max_nunits = 1;
+             node = vect_build_slp_tree (vinfo, substmts, end - start,
+                                         &max_nunits,
+                                         matches, limit, &tree_size, bst_map);
+             if (node)
+               {
+                 /* ???  Possibly not safe, but not sure how to check
+                    and fail SLP build?  */
+                 unrolling_factor
+                   = force_common_multiple (unrolling_factor,
+                                            calculate_unrolling_factor
+                                              (max_nunits, end - start));
+                 rhs_nodes.safe_push (node);
+                 start = end;
+                 end = group_size;
+               }
+             else
+               {
+                 substmts.release ();
+                 if (end - start == 1)
+                   {
+                     /* Single-lane discovery failed.  Free ressources.  */
+                     for (auto node : rhs_nodes)
+                       vect_free_slp_tree (node);
+                     scalar_stmts.release ();
+                     if (dump_enabled_p ())
+                       dump_printf_loc (MSG_NOTE, vect_location,
+                                        "SLP discovery failed\n");
+                     return false;
+                   }
+
+                 /* ???  It really happens that we soft-fail SLP
+                    build at a mismatch but the matching part hard-fails
+                    later.  As we know we arrived here with a group
+                    larger than one try a group of size one!  */
+                 if (!matches[0])
+                   end = start + 1;
+                 else
+                   for (unsigned j = start; j < end; j++)
+                     if (!matches[j - start])
+                       {
+                         end = j;
+                         break;
+                       }
+               }
+           }
+
+         /* Now we assume we can build the root SLP node from all
+            stores.  */
+         node = vect_create_new_slp_node (scalar_stmts,
+                                          SLP_TREE_CHILDREN
+                                            (rhs_nodes[0]).length ());
+         SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+         for (unsigned l = 0;
+              l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
+           {
+             /* And a permute merging all RHS SLP trees.  */
+             slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
+                                                       VEC_PERM_EXPR);
+             SLP_TREE_CHILDREN (node).quick_push (perm);
+             SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
+             SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+             SLP_TREE_LANES (perm) = group_size;
+             /* ???  We should set this NULL but that's not expected.  */
+             SLP_TREE_REPRESENTATIVE (perm)
+               = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
+             for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+               {
+                 SLP_TREE_CHILDREN (perm)
+                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
+                 for (unsigned k = 0;
+                      k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
+                   {
+                     /* ???  We should populate SLP_TREE_SCALAR_STMTS
+                        or SLP_TREE_SCALAR_OPS but then we might have
+                        a mix of both in our children.  */
+                     SLP_TREE_LANE_PERMUTATION (perm)
+                       .quick_push (std::make_pair (j, k));
+                   }
+               }
 
-         bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
-                                               kind, max_tree_size, limit);
-         if (i + 1 < group_size)
-           res |= vect_analyze_slp_instance (vinfo, bst_map,
-                                             rest, kind, max_tree_size, limit);
+             /* Now we have a single permute node but we cannot code-generate
+                the case with more than two inputs.
+                Perform pairwise reduction, reducing the two inputs
+                with the least number of lanes to one and then repeat until
+                we end up with two inputs.  That scheme makes sure we end
+                up with permutes satisfying the restriction of requiring at
+                most two vector inputs to produce a single vector output.  */
+             while (SLP_TREE_CHILDREN (perm).length () > 2)
+               {
+                 /* Pick the two nodes with the least number of lanes,
+                    prefer the earliest candidate and maintain ai < bi.  */
+                 int ai = -1;
+                 int bi = -1;
+                 for (unsigned ci = 0;
+                      ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
+                   {
+                     if (ai == -1)
+                       ai = ci;
+                     else if (bi == -1)
+                       bi = ci;
+                     else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+                               < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
+                              || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+                                  < SLP_TREE_LANES
+                                      (SLP_TREE_CHILDREN (perm)[bi])))
+                       {
+                         if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
+                             <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
+                           bi = ci;
+                         else
+                           {
+                             ai = bi;
+                             bi = ci;
+                           }
+                       }
+                   }
 
-         return res;
+                 /* Produce a merge of nodes ai and bi.  */
+                 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
+                 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
+                 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
+                 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
+                 SLP_TREE_LANES (permab) = n;
+                 SLP_TREE_LANE_PERMUTATION (permab).create (n);
+                 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
+                 /* ???  We should set this NULL but that's not expected.  */
+                 SLP_TREE_REPRESENTATIVE (permab)
+                   = SLP_TREE_REPRESENTATIVE (perm);
+                 SLP_TREE_CHILDREN (permab).quick_push (a);
+                 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
+                   SLP_TREE_LANE_PERMUTATION (permab)
+                     .quick_push (std::make_pair (0, k));
+                 SLP_TREE_CHILDREN (permab).quick_push (b);
+                 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
+                   SLP_TREE_LANE_PERMUTATION (permab)
+                     .quick_push (std::make_pair (1, k));
+
+                 /* Put the merged node into 'perm', in place of a.  */
+                 SLP_TREE_CHILDREN (perm)[ai] = permab;
+                 /* Adjust the references to b in the permutation
+                    of perm and to the later children which we'll
+                    remove.  */
+                 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
+                   {
+                     std::pair<unsigned, unsigned> &p
+                         = SLP_TREE_LANE_PERMUTATION (perm)[k];
+                     if (p.first == (unsigned) bi)
+                       {
+                         p.first = ai;
+                         p.second += SLP_TREE_LANES (a);
+                       }
+                     else if (p.first > (unsigned) bi)
+                       p.first--;
+                   }
+                 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
+               }
+           }
+
+         /* Create a new SLP instance.  */
+         slp_instance new_instance = XNEW (class _slp_instance);
+         SLP_INSTANCE_TREE (new_instance) = node;
+         SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+         SLP_INSTANCE_LOADS (new_instance) = vNULL;
+         SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+         SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
+         SLP_INSTANCE_KIND (new_instance) = kind;
+         new_instance->reduc_phis = NULL;
+         new_instance->cost_vec = vNULL;
+         new_instance->subgraph_entries = vNULL;
+
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "SLP size %u vs. limit %u.\n",
+                            tree_size, max_tree_size);
+
+         vinfo->slp_instances.safe_push (new_instance);
+
+         /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+            the number of scalar stmts in the root in a few places.
+            Verify that assumption holds.  */
+         gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+                       .length () == group_size);
+
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "Final SLP tree for instance %p:\n",
+                              (void *) new_instance);
+             vect_print_slp_graph (MSG_NOTE, vect_location,
+                                   SLP_INSTANCE_TREE (new_instance));
+           }
+         return true;
        }
+      else
+       /* Free the allocated memory.  */
+       scalar_stmts.release ();
 
       /* Even though the first vector did not all match, we might be able to SLP
         (some) of the remainder.  FORNOW ignore this possibility.  */
     }
+  else
+    /* Free the allocated memory.  */
+    scalar_stmts.release ();
 
   /* Failed to SLP.  */
   if (dump_enabled_p ())