}
)
+;; Little-endian vec_concat of two VS32_I_SUB64_F values into a <VDBL>
+;; register; operand 1 is the first vec_concat operand, operand 2 the second.
+;; The uzp1 alternative takes both inputs from independent FP/SIMD registers.
+;; Every other alternative ties operand 1 to the destination and only inserts
+;; operand 2: into lane 1 (mov/ld1/ins) or, for the general-register
+;; alternative, with bfi at bit offset <bitsize>.
+(define_insn "*aarch64_combine_internal<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VS32_I_SUB64_F 1 "register_operand")
+ (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand")))]
+ "TARGET_FLOAT
+ && !BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
+ [ w , w , w ; neon_permute , simd ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype>
+ [ w , 0 , w ; neon_move , simd ] mov\t%0.<single_type>[1], %2.<single_type>[0]
+ [ w , 0 , Utv ; neon_load1_one_lane , simd ] ld1\t{%0.<single_type>}[1], %2
+ [ w , 0 , r ; neon_from_gp , simd ] ins\t%0.<single_type>[1], %<single_wx>2
+ [ ?r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+ }
+)
+
+;; Little-endian vec_concat for the sub-32-bit integer modes in VSSUB32_I.
+;; Only a general-register alternative is provided: operand 1 is tied to the
+;; destination and bfi inserts operand 2 at bit offset <bitsize>.
+(define_insn "*aarch64_combine_internal<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VSSUB32_I 1 "register_operand")
+ (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand")))]
+ "TARGET_FLOAT
+ && !BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
+ [ r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+ }
+)
+
(define_insn "*aarch64_combine_internal_be<mode>"
[(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand")
(vec_concat:<VDBL>
}
)
+;; Big-endian vec_concat of two VS32_I_SUB64_F values into a <VDBL> register.
+;; The vec_concat operands are swapped relative to their numbering: operand 2
+;; is the first vec_concat operand and operand 1 the second.  The uzp1
+;; alternative takes both inputs from independent FP/SIMD registers; every
+;; other alternative ties operand 1 to the destination and inserts operand 2
+;; into lane 1 (mov/ld1/ins) or, in general registers, with bfi at bit
+;; offset <bitsize>.
+(define_insn "*aarch64_combine_internal_be<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_nonimmediate_operand")
+ (match_operand:VS32_I_SUB64_F 1 "register_operand")))]
+ "TARGET_FLOAT
+ && BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
+ [ w , w , w ; neon_permute , simd ] uzp1\t%0.<Vdduptype>, %1.<Vdduptype>, %2.<Vdduptype>
+ [ w , 0 , w ; neon_move , simd ] mov\t%0.<single_type>[1], %2.<single_type>[0]
+ [ w , 0 , Utv ; neon_load1_one_lane , simd ] ld1\t{%0.<single_type>}[1], %2
+ [ w , 0 , r ; neon_from_gp , simd ] ins\t%0.<single_type>[1], %<single_wx>2
+ [ ?r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+ }
+)
+
+;; Big-endian vec_concat for the sub-32-bit integer modes in VSSUB32_I.
+;; Operand 2 is the first vec_concat operand and operand 1 the second.  Only
+;; a general-register alternative is provided: operand 1 is tied to the
+;; destination and bfi inserts operand 2 at bit offset <bitsize>.
+(define_insn "*aarch64_combine_internal_be<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VSSUB32_I 2 "aarch64_simd_nonimmediate_operand")
+ (match_operand:VSSUB32_I 1 "register_operand")))]
+ "TARGET_FLOAT
+ && BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ]
+ [ r , 0 , r ; bfm , * ] bfi\t%<single_dwx>0, %<single_dwx>2, <bitsize>, <bitsize>
+ }
+)
+
+
;; In this insn, operand 1 should be low, and operand 2 the high part of the
;; dest vector.
}
)
+;; Little-endian vec_concat of a VSSUB32_I value (operand 1) with zero
+;; (operand 2), done entirely in general registers: a uxt<size> extension
+;; when the input is in a register, or an ldr<size> load when it is in memory.
+;; Operand 2 only has to satisfy the zero predicate; it has no constraint
+;; column and does not appear in the output templates.
+(define_insn "*aarch64_combinez<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VSSUB32_I 1 "nonimmediate_operand")
+ (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")))]
+ "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 ; attrs: type ]
+ [ r , r ; mov_reg ] uxt<size>\t%w0, %w1
+ [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1
+ }
+)
+
+;; Little-endian vec_concat of a VS32_I_SUB64_F value (operand 1) with zero
+;; (operand 2).  The result is produced by a move, extension or load of
+;; operand 1 into either an FP/SIMD register (w) or a general register (r).
+;; Operand 2 only has to satisfy the zero predicate; it has no constraint
+;; column and does not appear in the output templates.
+;; NOTE(review): the r <- r alternative emits uxtw for every mode in the
+;; iterator.  VS32_I_SUB64_F includes the 16-bit scalars HF and BF (via
+;; VSSUB64_F), and uxtw leaves bits 16-31 of the source untouched -- confirm
+;; those bits are known to be zero, or use a 16-bit extension for these modes.
+;; NOTE(review): the alternatives carry no "arch" attribute; if <single_type>
+;; expands to "h" for HF/BF, confirm the fmov forms are valid without +fp16.
+(define_insn "*aarch64_combinez<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")
+ (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero")))]
+ "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 ; attrs: type ]
+ [ w , w ; neon_move ] fmov\t%<single_type>0, %<single_type>1
+ [ w , r ; neon_from_gp ] fmov\t%<single_type>0, %<single_wx>1
+ [ w , m ; neon_load1_1reg ] ldr\t%<single_type>0, %1
+ [ r , r ; mov_reg ] uxtw\t%x0, %w1
+ [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1
+ }
+)
+
(define_insn "*aarch64_combinez_be<mode>"
[(set (match_operand:<VDBL> 0 "register_operand")
(vec_concat:<VDBL>
}
)
+;; Big-endian vec_concat of zero (operand 2, listed first in the vec_concat)
+;; with a VSSUB32_I value (operand 1), done entirely in general registers:
+;; a uxt<size> extension when the input is in a register, or an ldr<size>
+;; load when it is in memory.  Operand 2 only has to satisfy the zero
+;; predicate; it has no constraint column and does not appear in the output.
+(define_insn "*aarch64_combinez_be<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VSSUB32_I 2 "aarch64_simd_or_scalar_imm_zero")
+ (match_operand:VSSUB32_I 1 "nonimmediate_operand")))]
+ "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 ; attrs: type ]
+ [ r , r ; mov_reg ] uxt<size>\t%w0, %w1
+ [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1
+ }
+)
+
+;; Big-endian vec_concat of zero (operand 2, listed first in the vec_concat)
+;; with a VS32_I_SUB64_F value (operand 1).  The result is produced by a
+;; move, extension or load of operand 1 into either an FP/SIMD register (w)
+;; or a general register (r).  Operand 2 only has to satisfy the zero
+;; predicate; it has no constraint column and does not appear in the output.
+;; NOTE(review): the r <- r alternative emits uxtw for every mode in the
+;; iterator.  VS32_I_SUB64_F includes the 16-bit scalars HF and BF (via
+;; VSSUB64_F), and uxtw leaves bits 16-31 of the source untouched -- confirm
+;; those bits are known to be zero, or use a 16-bit extension for these modes.
+;; NOTE(review): the alternatives carry no "arch" attribute; if <single_type>
+;; expands to "h" for HF/BF, confirm the fmov forms are valid without +fp16.
+(define_insn "*aarch64_combinez_be<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand")
+ (vec_concat:<VDBL>
+ (match_operand:VS32_I_SUB64_F 2 "aarch64_simd_or_scalar_imm_zero")
+ (match_operand:VS32_I_SUB64_F 1 "nonimmediate_operand")))]
+ "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+ {@ [ cons: =0 , 1 ; attrs: type ]
+ [ w , w ; neon_move ] fmov\t%<single_type>0, %<single_type>1
+ [ w , r ; neon_from_gp ] fmov\t%<single_type>0, %<single_wx>1
+ [ w , m ; neon_load1_1reg ] ldr\t%<single_type>0, %1
+ [ r , r ; mov_reg ] uxtw\t%x0, %w1
+ [ r , m ; load_4 ] ldr<size>\t%<single_wx>0, %1
+ }
+)
+
;; Form a vector whose first half (in array order) comes from operand 1
;; and whose second half (in array order) comes from operand 2.
;; This operand order follows the RTL vec_concat operation.
(define_expand "@aarch64_vec_concat<mode>"
[(set (match_operand:<VDBL> 0 "register_operand")
(vec_concat:<VDBL>
- (match_operand:VDCSIF 1 "general_operand")
- (match_operand:VDCSIF 2 "general_operand")))]
+ (match_operand:VQDUP 1 "general_operand")
+ (match_operand:VQDUP 2 "general_operand")))]
"TARGET_FLOAT"
{
int lo = BYTES_BIG_ENDIAN ? 2 : 1;
int n_var = 0;
/* The first element of vals. */
rtx v0 = XVECEXP (vals, 0, 0);
+ machine_mode v0mode = GET_MODE (v0);
bool all_same = true;
- /* This is a special vec_init<M><N> where N is not an element mode but a
+ /* This is a special vec_init<M><N> where N is either an element mode or a
vector mode with half the elements of M. We expect to find two entries
of mode N in VALS and we must put their concatentation into TARGET. */
- if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
+ if (n_elts == 2 && (VECTOR_MODE_P (v0mode)
+ || SCALAR_INT_MODE_P (v0mode)
+ || SCALAR_FLOAT_MODE_P (v0mode)))
{
- machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
+ rtx v1 = XVECEXP (vals, 0, 1);
+ machine_mode narrow_mode = GET_MODE (v0);
gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
&& known_eq (GET_MODE_SIZE (mode),
2 * GET_MODE_SIZE (narrow_mode)));
- emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
- XVECEXP (vals, 0, 0),
- XVECEXP (vals, 0, 1)));
- return;
+ if (rtx_equal_p (v0, v1))
+ aarch64_emit_move (target,
+ gen_vec_duplicate (mode,
+ force_reg (narrow_mode, v0)));
+ else
+ emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
+ v0, v1));
+ return;
}
/* Count the number of variable elements to initialise. */
;; All sub-64-bit vector modes.
(define_mode_iterator VSUB64 [V2QI V4QI V2HI V2HF V2BF])
+;; All sub-64-bit scalar modes.
+(define_mode_iterator SSUB64 [QI HI HF BF SI SF])
+
+;; All sub-64-bit modes, vector (VSUB64) and scalar (SSUB64).
+(define_mode_iterator VSSUB64 [VSUB64 SSUB64])
+
+;; Integer modes narrower than 32 bits: the vector V2QI and the scalars
+;; QI and HI.
+(define_mode_iterator VSSUB32_I [V2QI QI HI])
+
+;; Floating-point modes narrower than 64 bits: the two-element HF/BF vectors
+;; and the HF and BF scalars.  SF is not included.
+(define_mode_iterator VSSUB64_F [V2HF V2BF HF BF])
+
+;; The 32-bit integer vector modes (V4QI, V2HI) plus every mode in VSSUB64_F.
+;; SI is not included.
+(define_mode_iterator VS32_I_SUB64_F [V4QI V2HI VSSUB64_F])
+
;; All Advanced SIMD modes suitable for moving, loading, and storing.
(define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
(define_mode_attr bitsize [(V8QI "64") (V16QI "128")
(V4HI "64") (V8HI "128")
(V2SI "64") (V4SI "128")
- (V1DI "64") (V2DI "128")])
+ (V1DI "64") (V2DI "128")
+ (QI "8") (V2QI "16")
+ (V4QI "32") (HI "16")
+ (HF "16") (BF "16")
+ (SI "32") (SF "32")
+ (V2HI "32") (V2HF "32")
+ (V2BF "32")])
;; Map a floating point or integer mode to the appropriate register name prefix
(define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
(define_mode_attr V1half [(V2DI "v1di") (V2DF "v1df")])
;; Double modes of vector modes.
-(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
+(define_mode_attr VDBL [(V8QI "V16QI") (V4QI "V8QI")
+ (V2QI "V4QI") (V4HI "V8HI")
(V4HF "V8HF") (V4BF "V8BF")
+ (V2BF "V4BF")
(V2SI "V4SI") (V2SF "V4SF")
+ (V2HI "V4HI") (V2HF "V4HF")
+ (BF "V2BF")
(SI "V2SI") (SF "V2SF")
+ (QI "V2QI")
+ (HI "V2HI") (HF "V2HF")
(DI "V2DI") (DF "V2DF")])
;; Load/store pair mode.
(V2SI "x") (V2SF "x")
(DI "x") (DF "x")])
+;; General-register width prefix used by the bfi alternatives of the combine
+;; patterns, where the destination holds the <VDBL> (double-width) result:
+;; "x" for the 32-bit modes listed here, "w" for the narrower ones.
+(define_mode_attr single_dwx [(SI "x") (SF "x")
+ (V2QI "w") (V4QI "x")
+ (V2HI "x") (V2HF "x")
+ (HF "w") (QI "w")
+ (V2BF "x") (BF "w")
+ (HI "w")])
+
+
;; Whether a mode fits in S or D registers (i.e. "s" for 32-bit modes
;; and "d" for 64-bit modes).
(define_mode_attr single_type [(SI "s") (SF "s")
x[i] += y[index[i]];
}
-/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 2 } } */
+/* { dg-final { scan-assembler-times {\tldr\td[0-9]+, \[x[0-9]+, x[0-9]+, lsl #?3\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1\t{v[0-9]+\.d}\[1\], \[x[0-9]+\]} 1 } } */
/* { dg-final { scan-assembler-not {\tshl\tv[0-9]+\.2d,} } } */
/* { dg-final { scan-assembler-not {\tumov\t} } } */
/* { dg-final { scan-assembler {\tadd\tv[0-9]+\.2d,} } } */
TEST_ALL (VEC_PERM)
/* We should use one DUP for each of the 8-, 16- and 32-bit types,
- (for now, insert both elements with ins for _Float16). We should use two
+ (and we now use fmov + ins for _Float16). We should use two
DUPs for each of the three 64-bit types. */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[0\], v[0-9]+\.h\[0\]} 3 } } */
-/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tfmov\th[0-9]+, h} 1 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[1\], v[0-9]+\.h\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* { dg-final { scan-assembler-not {\tzip2\t} } } */
return v;
}
-/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, v[0-9]+\.s\[0\]} 1 } } */
-/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4s, w[0-9]+} 1 } } */
-/* { dg-final { scan-assembler-times {\tmov\tw[0-9]+, 65537} 1 } } */
-/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 0, 16} 1 } } */
-/* { dg-final { scan-assembler-times {\tbfi\tw[0-9]+, w[0-9]+, 16, 16} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tw1, 1} 1 } } */
+/* { dg-final { scan-assembler-times {\tdup\tv0+\.4s, w0} 2 } } */
+/* { dg-final { scan-assembler-times {\tbfi\tw0, w1, 16, 16} 2 } } */
/*
** test_int8_5:
-** mov w1, 0
-** bfi w1, w0, 0, 8
-** dup v0\.8h, w1
+** uxtb w0, w0
+** dup v0\.8h, w0
** ret
*/
** test_float16_2:
** fcvt h1, s1
** fcvt h0, s0
-** ins v0\.h\[1\], v1\.h\[0\]
+** uzp1 v0\.4h, v0\.4h, v1\.4h
** dup v0\.4s, v0\.s\[0\]
** ret
*/
** uzp1 v2\.2s, v0\.2s, v2\.2s
** uzp1 v3\.2s, v1\.2s, v3\.2s
** zip1 v3\.4s, v2\.4s, v3\.4s
-** fcvtn v0\.4h, v3\.4s
-** uzp1 v0\.2d, v0\.2d, v0\.2d
+** fcvtn v3\.4h, v3\.4s
+** dup v0\.2d, v3\.d\[0\]
** ret
*/
/*
** test_float16_4:
** fcvt h0, s0
-** movi v31\.2d, #0
-** ins v31\.h\[0\], v0\.h\[0\]
-** dup v0\.4s, v31\.s\[0\]
+** fmov h0, h0
+** dup v0\.4s, v0\.s\[0\]
** ret
*/
/*
** test_float16_5:
+** movi v31\.4h, #0
** fcvt h0, s0
-** movi v31\.2d, #0
-** ins v31\.h\[1\], v0\.h\[0\]
-** dup v0\.4s, v31\.s\[0\]
+** uzp1 v0\.4h, v31\.4h, v0\.4h
+** dup v0\.4s, v0\.s\[0\]
** ret
*/
/*
** test_float16_6:
-** fcvt h1, s1
** fcvt h0, s0
-** movi v31\.2d, #0
-** mov w0, 1006648320
-** umov w1, v1\.h\[0\]
-** ins v31\.h\[0\], v0\.h\[0\]
-** bfi w0, w1, 0, 16
-** dup v31\.2s, v31\.s\[0\]
-** dup v0\.2s, w0
-** zip1 v0\.8h, v31\.8h, v0\.8h
+** fcvt h1, s1
+** fmov h31, 1.0e\+0
+** fmov h0, h0
+** uzp1 v1\.4h, v1\.4h, v31\.4h
+** dup v0\.2s, v0\.s\[0\]
+** dup v1\.2s, v1\.s\[0\]
+** zip1 v0\.8h, v0\.8h, v1\.8h
** ret
*/
/*
** test_float16_7:
-** fcvt h1, s1
** fcvt h0, s0
-** movi v31\.2d, #0
-** mov w0, 1006648320
-** umov w1, v1\.h\[0\]
-** ins v31\.h\[1\], v0\.h\[0\]
-** bfi w0, w1, 16, 16
+** movi v31\.4h, #0
+** fcvt h1, s1
+** uzp1 v31\.4h, v31\.4h, v0\.4h
+** fmov h0, 1.0e\+0
+** uzp1 v0\.4h, v0\.4h, v1\.4h
** dup v31\.2s, v31\.s\[0\]
-** dup v0\.2s, w0
+** dup v0\.2s, v0\.s\[0\]
** zip1 v0\.8h, v31\.8h, v0\.8h
** ret
*/
** fcvt h1, s1
** fcvt h0, s0
** movi v31\.2s, 0x3c, lsl 24
-** ins v0\.h\[1\], v1\.h\[0\]
+** uzp1 v0\.4h, v0\.4h, v1\.4h
** dup v0\.2s, v0\.s\[0\]
** zip1 v0\.8h, v31\.8h, v0\.8h
** ret
/*
** test_int16_4:
-** mov w1, 0
-** bfi w1, w0, 0, 16
-** dup v0\.4s, w1
+** uxth w0, w0
+** dup v0\.4s, w0
** ret
*/
/*
** test_int16_6:
-** mov w2, 0
-** bfi w2, w0, 0, 16
-** mov w0, 65537
-** bfi w0, w1, 0, 16
-** dup v31\.2s, w2
-** dup v0\.2s, w0
+** uxth w0, w0
+** dup v31\.2s, w0
+** mov w0, 1
+** bfi w1, w0, 16, 16
+** dup v0\.2s, w1
** zip1 v0\.8h, v31\.8h, v0\.8h
** ret
*/
/*
** test_float32_3:
-** movi v31\.2s, 0
-** dup v0\.2s, v0\.s\[0\]
-** zip1 v0\.4s, v0\.4s, v31\.4s
+** fmov s0, s0
+** dup v0\.2d, v0\.d\[0\]
** ret
*/
/*
** test_float32_4:
-** movi v31\.2s, 0
-** dup v0\.2s, v0\.s\[0\]
-** zip1 v0\.4s, v31\.4s, v0\.4s
+** movi v31\.2s, #0
+** uzp1 v0\.2s, v31\.2s, v0\.2s
+** dup v0\.2d, v0\.d\[0\]
** ret
*/
/*
** test_int32_3:
-** dup v31\.2s, w0
-** movi v0\.2s, 0
-** zip1 v0\.4s, v31\.4s, v0\.4s
+** fmov s0, w0
+** dup v0\.2d, v0\.d\[0\]
** ret
*/