/* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-options "-O2 -msse2 -mno-avx --param ix86-vect-compare-costs=0" } */
void foo (int * __restrict a, int *b, int s)
{
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mno-avx --param ix86-vect-compare-costs=1" } */
+/* Per iteration, store 8 consecutive ints of a from 6 distinct ints of b.  */
+void foo (int * __restrict a, int *b, int s)
+{
+ for (int i = 0; i < 1024; ++i) /* a advances by 8 ints, b by s ints.  */
+ {
+ a[8*i+0] = b[s*i+0];
+ a[8*i+1] = b[s*i+1];
+ a[8*i+2] = b[s*i+2];
+ a[8*i+3] = b[s*i+3];
+ a[8*i+4] = b[s*i+4];
+ a[8*i+5] = b[s*i+5];
+ a[8*i+6] = b[s*i+4]; /* Repeats the source of slot 4.  */
+ a[8*i+7] = b[s*i+5]; /* Repeats the source of slot 5.  */
+ }
+}
+
+/* Three two-element loads, four two-element stores. No wider loads
+ or permutes. */
+/* { dg-final { scan-assembler-times "movq" 7 } } */
+/* { dg-final { scan-assembler-times "movhps" 0 } } */
+/* { dg-final { scan-assembler-times "movups" 0 } } */
/* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-options "-O2 -msse2 -mno-avx --param ix86-vect-compare-costs=0" } */
void foo (int * __restrict a, int *b, int s)
{
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mno-avx --param ix86-vect-compare-costs=1" } */
+/* Per iteration, store b[s*i+0] and b[s*i+1] twice into 4 ints of a.  */
+void foo (int * __restrict a, int *b, int s)
+{
+ for (int i = 0; i < 1024; ++i) /* a advances by 4 ints, b by s ints.  */
+ {
+ a[4*i+0] = b[s*i+0];
+ a[4*i+1] = b[s*i+1];
+ a[4*i+2] = b[s*i+0]; /* Repeats the source of slot 0.  */
+ a[4*i+3] = b[s*i+1]; /* Repeats the source of slot 1.  */
+ }
+}
+
+/* One two-element load, two two-element stores. */
+/* { dg-final { scan-assembler-times "movq" 3 } } */
+/* { dg-final { scan-assembler-times "movups" 0 } } */
/* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -mno-sse4 -fno-tree-slp-vectorize" } */
+/* { dg-options "-O2 -msse2 -mno-sse4 -fno-tree-slp-vectorize --param ix86-vect-compare-costs=0" } */
void foo (int * __restrict a, int *b, int s)
{
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2 -mno-sse4 -fno-tree-slp-vectorize --param ix86-vect-compare-costs=1" } */
+/* Per iteration, store 6 ints at a[s*i] from the 4 ints at b[4*i].  */
+void foo (int * __restrict a, int *b, int s)
+{
+ if (s >= 6) /* Stride is at least the 6 ints written per iteration.  */
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[s*i+0] = b[4*i+0];
+ a[s*i+1] = b[4*i+1];
+ a[s*i+2] = b[4*i+2];
+ a[s*i+3] = b[4*i+3];
+ a[s*i+4] = b[4*i+0]; /* Repeats the source of slot 0.  */
+ a[s*i+5] = b[4*i+1]; /* Repeats the source of slot 1.  */
+ }
+}
+
+/* The vectorizer generates three uint64 stores and two uint64 loads. */
+/* { dg-final { scan-assembler-times "movq" 5 } } */
+/* { dg-final { scan-assembler-times "movhps" 0 } } */
/* { dg-do compile } */
-/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
+/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize --param ix86-vect-compare-costs=0" } */
void foo (int * __restrict a, int * __restrict b, int *c, int s)
{
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize --param ix86-vect-compare-costs=1" } */
+/* Per iteration, split the 4 ints at c[4*i] between a[s*i] and b[s*i].  */
+void foo (int * __restrict a, int * __restrict b, int *c, int s)
+{
+ if (s >= 2) /* Stride is at least the 2 ints written per array.  */
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[s*i+0] = c[4*i+0]; /* First half of the group goes to a.  */
+ a[s*i+1] = c[4*i+1];
+ b[s*i+0] = c[4*i+2]; /* Second half of the group goes to b.  */
+ b[s*i+1] = c[4*i+3];
+ }
+}
+
+/* Vectorization factor two, two two-element stores to a using movq
+ and two two-element stores to b via movq. One reg-reg copy with movq,
+ which the pattern below skips because it requires a '(' on the line. */
+/* { dg-final { scan-assembler-times "movq\[^\r\n\]+\\\(" 4 } } */
+/* { dg-final { scan-assembler-times "pextrq" 0 } } */