return NULL_RTX;
}
-/* Expand a vector initialisation sequence, such that TARGET is
- initialised to contain VALS. */
+/* A subroutine of aarch64_expand_vector_init, with the same interface.
+ The caller has already tried a divide-and-conquer approach, so do
+ not consider that case here. */
void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
{
machine_mode mode = GET_MODE (target);
scalar_mode inner_mode = GET_MODE_INNER (mode);
return;
}
- /* Check for interleaving case.
- For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
- Generate following code:
- dup v0.h, x
- dup v1.h, y
- zip1 v0.h, v0.h, v1.h
- for "large enough" initializer. */
-
- if (n_elts >= 8)
- {
- int i;
- for (i = 2; i < n_elts; i++)
- if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
- break;
-
- if (i == n_elts)
- {
- machine_mode mode = GET_MODE (target);
- rtx dest[2];
-
- for (int i = 0; i < 2; i++)
- {
- rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
- dest[i] = force_reg (mode, x);
- }
-
- rtvec v = gen_rtvec (2, dest[0], dest[1]);
- emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
- return;
- }
- }
-
enum insn_code icode = optab_handler (vec_set_optab, mode);
gcc_assert (icode != CODE_FOR_nothing);
}
XVECEXP (copy, 0, i) = subst;
}
- aarch64_expand_vector_init (target, copy);
+ aarch64_expand_vector_init_fallback (target, copy);
}
/* Insert the variable lanes directly. */
}
}
+/* Return the even or odd half of VALS, depending on EVEN_P.
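+
+   For example, given a V8HI VALS of { a, b, c, d, e, f, g, h },
+   return a V4HI PARALLEL of { a, c, e, g } when EVEN_P and of
+   { b, d, f, h } otherwise.  */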
+
+static rtx
+aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
+{
+ int n = XVECLEN (vals, 0);
+ machine_mode new_mode
+ = aarch64_simd_container_mode (GET_MODE_INNER (mode),
+ GET_MODE_BITSIZE (mode).to_constant () / 2);
+ rtvec vec = rtvec_alloc (n / 2);
+ for (int i = 0; i < n / 2; i++)
+ RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+ : XVECEXP (vals, 0, 2 * i + 1);
+ return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/* Expand a vector initialization sequence, such that TARGET is
+ initialized to contain VALS. */
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+ /* Try decomposing the initializer into even and odd halves and
+ then ZIP them together. Use the resulting sequence if it is
+ strictly cheaper than loading VALS directly.
+
+ Prefer the fallback sequence in the event of a tie, since it
+ will tend to use fewer registers. */
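+
+  /* For example, { x, y, x, y, x, y, x, y } decomposes into the even
+     half { x, x, x, x } and the odd half { y, y, y, y }, each of which
+     is a single DUP, followed by one ZIP1 to interleave the halves.  */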
+
+ machine_mode mode = GET_MODE (target);
+ int n_elts = XVECLEN (vals, 0);
+
+ if (n_elts < 4
+ || maybe_ne (GET_MODE_BITSIZE (mode), 128))
+ {
+ aarch64_expand_vector_init_fallback (target, vals);
+ return;
+ }
+
+ start_sequence ();
+ rtx halves[2];
+ unsigned costs[2];
+ for (int i = 0; i < 2; i++)
+ {
+ start_sequence ();
+ rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
+ rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+ aarch64_expand_vector_init (tmp_reg, new_vals);
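+      /* Refer to the 64-bit half through a paradoxical subreg of the
+         full vector mode, since the ZIP1 below expects operands in
+         that mode.  */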
+ halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+ rtx_insn *rec_seq = get_insns ();
+ end_sequence ();
+ costs[i] = seq_cost (rec_seq, !optimize_size);
+ emit_insn (rec_seq);
+ }
+
+ rtvec v = gen_rtvec (2, halves[0], halves[1]);
+ rtx_insn *zip1_insn
+ = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
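+  /* The two halves have no dependency on each other, so when
+     optimizing for speed only the more expensive half matters;
+     when optimizing for size both contribute to the total.  */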
+ unsigned seq_total_cost
+ = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+ seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
+
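+  /* Cost the fallback expansion of the whole initializer in its own
+     sequence so the two approaches can be compared and the cheaper
+     one emitted.  */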
+ start_sequence ();
+ aarch64_expand_vector_init_fallback (target, vals);
+ rtx_insn *fallback_seq = get_insns ();
+ unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+ end_sequence ();
+
+ emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
/* Emit RTL corresponding to:
insr TARGET, ELEM. */
+++ /dev/null
-/* { dg-do compile } */
-/* { dg-options "-O3" } */
-/* { dg-final { check-function-bodies "**" "" "" } } */
-
-#include <arm_neon.h>
-
-/*
-** foo:
-** ...
-** dup v[0-9]+\.8h, w[0-9]+
-** dup v[0-9]+\.8h, w[0-9]+
-** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
-** ...
-** ret
-*/
-
-int16x8_t foo(int16_t x, int y)
-{
- int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
- return v;
-}
-
-/*
-** foo2:
-** ...
-** dup v[0-9]+\.8h, w[0-9]+
-** movi v[0-9]+\.8h, 0x1
-** zip1 v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
-** ...
-** ret
-*/
-
-int16x8_t foo2(int16_t x)
-{
- int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
- return v;
-}
/*
** cons2_8_float:
-** dup v([0-9]+)\.4s, .*
-** ...
+** dup v[0-9]+\.2s, v[0-9]+\.s\[0\]
+** dup v[0-9]+\.2s, v[0-9]+\.s\[0\]
+** zip1 v([0-9]+)\.4s, v[0-9]+\.4s, v[0-9]+\.4s
** stp q\1, q\1, \[x0\]
** stp q\1, q\1, \[x0, #?32\]
** ret
/* { dg-final { scan-assembler-not {\tldr\t} } } */
/* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s\n} } } */
/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
/* { dg-final { scan-assembler-not {\tldr\t} } } */
/* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s\n} } } */
/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
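+
+/* Verify that an interleaved initializer such as { x, y, x, y, ... }
+   is built from two DUP/MOVI halves combined with ZIP1.  */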
+
+#include <arm_neon.h>
+
+int16x8_t foo(int16_t x, int y)
+{
+ int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
+ return v;
+}
+
+int16x8_t foo2(int16_t x)
+{
+ int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
+ return v;
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
+/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
+/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
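+
+/* Verify that the even (all-x) half becomes a single DUP, the odd
+   (all-constant) half a single 64-bit load, and that the halves are
+   combined with ZIP1.  */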
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x)
+{
+ return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+ x, 5, x, 6, x, 7, x, 8 };
+}
+
+/* { dg-final { scan-assembler {\tdup\tv[0-9]+\.8b, w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\td[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
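+
+/* Verify that the all-y half becomes a DUP, while the mostly-constant
+   half is loaded and patched with a single INS, before the halves are
+   combined with ZIP1.  */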
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+ return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+ 4, y, 5, y, 6, y, 7, y };
+}
+
+/* { dg-final { scan-assembler {\tdup\tv[0-9]+\.8b, w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\td[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[0|7\], w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
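+
+/* Verify that an initializer with only two variable lanes is handled
+   as a full-vector constant load patched with two INS instructions.  */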
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+ return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14 };
+}
+
+/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[0|15\], w0} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[1|14\], w1} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+/* Verify that the fallback code sequence is chosen over the
+   recursively generated sequence combined with zip1.  */
+
+#include "vec-init-22.h"
+
+/* { dg-final { scan-assembler {\tfmov\ts[0-9]+, w0|w7} } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[[1-7]\], w[0-9]+} 7 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+/* Verify that we recursively generate code for the even and odd halves
+   instead of using the fallback sequence.  Although the resulting code
+   is longer, it has fewer dependencies and is therefore cheaper.  */
+
+#include "vec-init-22.h"
+
+/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[[1-3]\], w[0-9]+} 6 } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */
--- /dev/null
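+/* Used by the -Os and -O3 tests that include this header: every lane
+   is a distinct variable argument, so both the even/odd ZIP1
+   decomposition and the fallback sequence are viable and only their
+   costs differ.  */
+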
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+ int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+ return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}