;; Only used for splitting insert_d and copy_{u,s}.d.
(define_mode_iterator LASX_WD [V4DI V4DF V8SI V8SF])
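+;; Only used for vec_cast and vec_insert_{lo,hi}.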
+(define_mode_iterator LASX_PART [V4DI V4DF V8SF])
;; Only used for copy256_{u,s}.w.
(define_mode_iterator LASX_W [V8SI V8SF])
[(set_attr "move_type" "fmove")
(set_attr "mode" "<MODE>")])
+;; Reinterpret a 128-bit LSX register as the low half of the corresponding
+;; 256-bit LASX register (vr0 -> low half of xr0); no instruction is
+;; emitted.
+(define_insn "vec_cast<mode>"
+ [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+ (subreg:LASX_PART
+ (match_operand:<VHMODE256_ALL> 1 "register_operand" "0") 0))]
+ "ISA_HAS_LASX"
+ ""
+ [(set_attr "type" "simd_splat")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_lo_<mode>"
+ [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+ (vec_concat:LASX_PART
+ (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")
+ (vec_select:<VHMODE256_ALL>
+ (match_operand:LASX_PART 1 "register_operand" "0")
+ (match_operand:LASX_PART 3 "vect_par_cnst_high_half"))))]
+ "ISA_HAS_LASX"
+ "xvpermi.q\t%u0,%u2,0x30"
+ [(set_attr "type" "simd_splat")
+ (set_attr "mode" "<MODE>")])
+
+(define_insn "vec_insert_hi_<mode>"
+ [(set (match_operand:LASX_PART 0 "register_operand" "=f")
+ (vec_concat:LASX_PART
+ (vec_select:<VHMODE256_ALL>
+ (match_operand:LASX_PART 1 "register_operand" "0")
+ (match_operand:LASX_PART 3 "vect_par_cnst_low_half"))
+ (match_operand:<VHMODE256_ALL> 2 "register_operand" "f")))]
+ "ISA_HAS_LASX"
+ "xvpermi.q\t%u0,%u2,0x02"
+ [(set_attr "type" "simd_splat")
+ (set_attr "mode" "<MODE>")])
+
(define_expand "vec_perm<mode>"
[(match_operand:LASX 0 "register_operand")
(match_operand:LASX 1 "register_operand")
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
+#include <lsxintrin.h>
+
#ifndef _GCC_LOONGSON_ASXINTRIN_H
#define _GCC_LOONGSON_ASXINTRIN_H 1
#define __lasx_xvrepli_w(/*si10*/ _1) \
((__m256i)__builtin_lasx_xvrepli_w ((_1)))
+#if defined (__loongarch_asx_sx_conv)
+/* Built-in interfaces for conversions between 128-bit and 256-bit vectors.
+   For some of the conversion functions below, the "Assembly instruction
+   format" comment does not exactly match the assembly instruction that is
+   actually generated.  The Rust front end selects built-in functions by
+   parsing these assembly-format comments; the data types listed in each
+   "Data types in instruction templates" comment follow the order of the
+   function interface: output first, then inputs.  */
+/* Assembly instruction format: xd, vj. */
+/* Data types in instruction templates: V8SF, V4SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256 __lasx_cast_128_s (__m128 _1)
+{
+ return (__m256)__builtin_lasx_cast_128_s ((v4f32)_1);
+}
+
+/* Assembly instruction format: xd, vj. */
+/* Data types in instruction templates: V4DF, V2DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256d __lasx_cast_128_d (__m128d _1)
+{
+ return (__m256d)__builtin_lasx_cast_128_d ((v2f64)_1);
+}
+
+/* Assembly instruction format: xd, vj. */
+/* Data types in instruction templates: V4DI, V2DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256i __lasx_cast_128 (__m128i _1)
+{
+ return (__m256i)__builtin_lasx_cast_128 ((v2i64)_1);
+}
+
+/* Assembly instruction format: xd, vj, vk. */
+/* Data types in instruction templates: V8SF, V4SF, V4SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256 __lasx_concat_128_s (__m128 _1, __m128 _2)
+{
+ return (__m256)__builtin_lasx_concat_128_s ((v4f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format: xd, vj, vk. */
+/* Data types in instruction templates: V4DF, V2DF, V2DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256d __lasx_concat_128_d (__m128d _1, __m128d _2)
+{
+ return (__m256d)__builtin_lasx_concat_128_d ((v2f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format: xd, vj, vk. */
+/* Data types in instruction templates: V4DI, V2DI, V2DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256i __lasx_concat_128 (__m128i _1, __m128i _2)
+{
+ return (__m256i)__builtin_lasx_concat_128 ((v2i64)_1, (v2i64)_2);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V4SF, V8SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128 __lasx_extract_128_lo_s (__m256 _1)
+{
+ return (__m128)__builtin_lasx_extract_128_lo_s ((v8f32)_1);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V4SF, V8SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128 __lasx_extract_128_hi_s (__m256 _1)
+{
+ return (__m128)__builtin_lasx_extract_128_hi_s ((v8f32)_1);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V2DF, V4DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128d __lasx_extract_128_lo_d (__m256d _1)
+{
+ return (__m128d)__builtin_lasx_extract_128_lo_d ((v4f64)_1);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V2DF, V4DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128d __lasx_extract_128_hi_d (__m256d _1)
+{
+ return (__m128d)__builtin_lasx_extract_128_hi_d ((v4f64)_1);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V2DI, V4DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128i __lasx_extract_128_lo (__m256i _1)
+{
+ return (__m128i)__builtin_lasx_extract_128_lo ((v4i64)_1);
+}
+
+/* Assembly instruction format: vd, xj. */
+/* Data types in instruction templates: V2DI, V4DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m128i __lasx_extract_128_hi (__m256i _1)
+{
+ return (__m128i)__builtin_lasx_extract_128_hi ((v4i64)_1);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V8SF, V8SF, V4SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256 __lasx_insert_128_lo_s (__m256 _1, __m128 _2)
+{
+ return (__m256)__builtin_lasx_insert_128_lo_s ((v8f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V8SF, V8SF, V4SF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256 __lasx_insert_128_hi_s (__m256 _1, __m128 _2)
+{
+ return (__m256)__builtin_lasx_insert_128_hi_s ((v8f32)_1, (v4f32)_2);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V4DF, V4DF, V2DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256d __lasx_insert_128_lo_d (__m256d _1, __m128d _2)
+{
+ return (__m256d)__builtin_lasx_insert_128_lo_d ((v4f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V4DF, V4DF, V2DF. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256d __lasx_insert_128_hi_d (__m256d _1, __m128d _2)
+{
+ return (__m256d)__builtin_lasx_insert_128_hi_d ((v4f64)_1, (v2f64)_2);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V4DI, V4DI, V2DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256i __lasx_insert_128_lo (__m256i _1, __m128i _2)
+{
+ return (__m256i)__builtin_lasx_insert_128_lo ((v4i64)_1, (v2i64)_2);
+}
+
+/* Assembly instruction format: xd, xj, vk. */
+/* Data types in instruction templates: V4DI, V4DI, V2DI. */
+extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__m256i __lasx_insert_128_hi (__m256i _1, __m128i _2)
+{
+ return (__m256i)__builtin_lasx_insert_128_hi ((v4i64)_1, (v2i64)_2);
+}
+
+#endif /* defined(__loongarch_asx_sx_conv). */
#endif /* defined(__loongarch_asx). */
#endif /* _GCC_LOONGSON_ASXINTRIN_H. */
#define CODE_FOR_lasx_xvmaddwod_q_du CODE_FOR_lasx_maddwod_q_du_punned
#define CODE_FOR_lasx_xvmaddwod_q_du_d CODE_FOR_lasx_maddwod_q_du_d_punned
+
+/* Conversions between 128-bit and 256-bit vectors.  */
+#define CODE_FOR_lasx_extract_128_lo_s CODE_FOR_vec_extract_lo_v8sf
+#define CODE_FOR_lasx_extract_128_hi_s CODE_FOR_vec_extract_hi_v8sf
+#define CODE_FOR_lasx_extract_128_lo_d CODE_FOR_vec_extract_lo_v4df
+#define CODE_FOR_lasx_extract_128_hi_d CODE_FOR_vec_extract_hi_v4df
+#define CODE_FOR_lasx_extract_128_lo CODE_FOR_vec_extract_lo_v4di
+#define CODE_FOR_lasx_extract_128_hi CODE_FOR_vec_extract_hi_v4di
+#define CODE_FOR_lasx_insert_128_lo_s CODE_FOR_vec_insert_lo_v8sf
+#define CODE_FOR_lasx_insert_128_hi_s CODE_FOR_vec_insert_hi_v8sf
+#define CODE_FOR_lasx_insert_128_lo_d CODE_FOR_vec_insert_lo_v4df
+#define CODE_FOR_lasx_insert_128_hi_d CODE_FOR_vec_insert_hi_v4df
+#define CODE_FOR_lasx_insert_128_lo CODE_FOR_vec_insert_lo_v4di
+#define CODE_FOR_lasx_insert_128_hi CODE_FOR_vec_insert_hi_v4di
+#define CODE_FOR_lasx_concat_128_s CODE_FOR_vec_concatv8sf
+#define CODE_FOR_lasx_concat_128_d CODE_FOR_vec_concatv4df
+#define CODE_FOR_lasx_concat_128 CODE_FOR_vec_concatv4di
+#define CODE_FOR_lasx_cast_128_s CODE_FOR_vec_castv8sf
+#define CODE_FOR_lasx_cast_128_d CODE_FOR_vec_castv4df
+#define CODE_FOR_lasx_cast_128 CODE_FOR_vec_castv4di
+
static const struct loongarch_builtin_description loongarch_builtins[] = {
#define LARCH_MOVFCSR2GR 0
DIRECT_BUILTIN (movfcsr2gr, LARCH_USI_FTYPE_UQI, hard_float),
LASX_BUILTIN (xvssrarni_bu_h, LARCH_UV32QI_FTYPE_UV32QI_V32QI_USI),
LASX_BUILTIN (xvssrarni_hu_w, LARCH_UV16HI_FTYPE_UV16HI_V16HI_USI),
LASX_BUILTIN (xvssrarni_wu_d, LARCH_UV8SI_FTYPE_UV8SI_V8SI_USI),
- LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI)
+ LASX_BUILTIN (xvssrarni_du_q, LARCH_UV4DI_FTYPE_UV4DI_V4DI_USI),
+ LASX_BUILTIN (extract_128_lo_s, LARCH_V4SF_FTYPE_V8SF),
+ LASX_BUILTIN (extract_128_hi_s, LARCH_V4SF_FTYPE_V8SF),
+ LASX_BUILTIN (extract_128_lo_d, LARCH_V2DF_FTYPE_V4DF),
+ LASX_BUILTIN (extract_128_hi_d, LARCH_V2DF_FTYPE_V4DF),
+ LASX_BUILTIN (extract_128_lo, LARCH_V2DI_FTYPE_V4DI),
+ LASX_BUILTIN (extract_128_hi, LARCH_V2DI_FTYPE_V4DI),
+ LASX_BUILTIN (insert_128_lo_s, LARCH_V8SF_FTYPE_V8SF_V4SF),
+ LASX_BUILTIN (insert_128_hi_s, LARCH_V8SF_FTYPE_V8SF_V4SF),
+ LASX_BUILTIN (insert_128_lo_d, LARCH_V4DF_FTYPE_V4DF_V2DF),
+ LASX_BUILTIN (insert_128_hi_d, LARCH_V4DF_FTYPE_V4DF_V2DF),
+ LASX_BUILTIN (insert_128_lo, LARCH_V4DI_FTYPE_V4DI_V2DI),
+ LASX_BUILTIN (insert_128_hi, LARCH_V4DI_FTYPE_V4DI_V2DI),
+ LASX_BUILTIN (concat_128_s, LARCH_V8SF_FTYPE_V4SF_V4SF),
+ LASX_BUILTIN (concat_128_d, LARCH_V4DF_FTYPE_V2DF_V2DF),
+ LASX_BUILTIN (concat_128, LARCH_V4DI_FTYPE_V2DI_V2DI),
+ LASX_BUILTIN (cast_128_s, LARCH_V8SF_FTYPE_V4SF),
+ LASX_BUILTIN (cast_128_d, LARCH_V4DF_FTYPE_V2DF),
+ LASX_BUILTIN (cast_128, LARCH_V4DI_FTYPE_V2DI)
};
/* Index I is the function declaration for loongarch_builtins[I], or null if
{
struct expand_operand ops[MAX_RECOG_OPERANDS];
int opno, argno;
+ /* For the vector extract/insert cases below, sel_high_p chooses which
+    128-bit half of the 256-bit register the generated parallel refers to:
+    for extracts it is the half that is read, for inserts it is the half of
+    the destination that is kept unchanged.  */
+ bool sel_high_p = true;
/* Map any target to operand 0. */
opno = 0;
create_input_operand (&ops[1], CONST1_RTX (ops[0].mode), ops[0].mode);
return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p);
+ case CODE_FOR_vec_extract_lo_v8sf:
+ case CODE_FOR_vec_extract_lo_v4df:
+ case CODE_FOR_vec_extract_lo_v4di:
+ sel_high_p = false;
+ /* Fall through. */
+ case CODE_FOR_vec_extract_hi_v8sf:
+ case CODE_FOR_vec_extract_hi_v4df:
+ case CODE_FOR_vec_extract_hi_v4di:
+ {
+ /* Build the parallel that selects the high or low half.  */
+ loongarch_prepare_builtin_arg (&ops[1], exp, 0);
+ int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value));
+ int half_nelts = nelts / 2;
+ int base = sel_high_p ? half_nelts : 0;
+
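+ /* E.g. for extract_128_hi_s (a V8SF source), nelts is 8, half_nelts is 4
+    and base is 4, so the parallel built below is (parallel [4 5 6 7]).  */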
+ rtx pat_rtx
+ = loongarch_gen_stepped_int_parallel (half_nelts, base, 1);
+ create_input_operand (&ops[2], pat_rtx, ops[1].mode);
+
+ return loongarch_expand_builtin_insn (icode, 3, ops, has_target_p);
+ }
+
+ case CODE_FOR_vec_insert_hi_v8sf:
+ case CODE_FOR_vec_insert_hi_v4df:
+ case CODE_FOR_vec_insert_hi_v4di:
+ sel_high_p = false;
+ /* Fall through. */
+ case CODE_FOR_vec_insert_lo_v8sf:
+ case CODE_FOR_vec_insert_lo_v4df:
+ case CODE_FOR_vec_insert_lo_v4di:
+ {
+ /* Build the parallel that selects the high or low half.  */
+ loongarch_prepare_builtin_arg (&ops[1], exp, 0);
+ loongarch_prepare_builtin_arg (&ops[2], exp, 1);
+ int nelts = GET_MODE_NUNITS (GET_MODE (ops[1].value));
+ int half_nelts = nelts / 2;
+ int base = sel_high_p ? half_nelts : 0;
+
+ rtx pat_rtx
+ = loongarch_gen_stepped_int_parallel (half_nelts, base, 1);
+ create_input_operand (&ops[3], pat_rtx, ops[1].mode);
+
+ return loongarch_expand_builtin_insn (icode, 4, ops, has_target_p);
+ }
+
default:
break;
}
loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_simd", pfile);
loongarch_def_or_undef (ISA_HAS_LSX, "__loongarch_sx", pfile);
loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx", pfile);
+ loongarch_def_or_undef (ISA_HAS_LASX, "__loongarch_asx_sx_conv", pfile);
builtin_undef ("__loongarch_simd_width");
if (ISA_HAS_LSX)
DEF_LARCH_FTYPE (1, (UDI, USI))
DEF_LARCH_FTYPE (1, (USI, UQI))
DEF_LARCH_FTYPE (1, (VOID, USI))
+DEF_LARCH_FTYPE (1, (V4SF, V8SF))
+DEF_LARCH_FTYPE (1, (V2DF, V4DF))
+DEF_LARCH_FTYPE (1, (V2DI, V4DI))
+DEF_LARCH_FTYPE (1, (V8SF, V4SF))
+DEF_LARCH_FTYPE (1, (V4DF, V2DF))
+DEF_LARCH_FTYPE (1, (V4DI, V2DI))
DEF_LARCH_FTYPE (2, (VOID, UQI, USI))
DEF_LARCH_FTYPE (2, (VOID, UHI, USI))
DEF_LARCH_FTYPE (2, (SI, DI, SI))
DEF_LARCH_FTYPE (2, (USI, USI, USI))
DEF_LARCH_FTYPE (2, (UDI, UDI, USI))
+DEF_LARCH_FTYPE (2, (V8SF, V4SF, V4SF))
+DEF_LARCH_FTYPE (2, (V4DF, V2DF, V2DF))
+DEF_LARCH_FTYPE (2, (V4DI, V2DI, V2DI))
+DEF_LARCH_FTYPE (2, (V8SF, V8SF, V4SF))
+DEF_LARCH_FTYPE (2, (V4DF, V4DF, V2DF))
+DEF_LARCH_FTYPE (2, (V4DI, V4DI, V2DI))
DEF_LARCH_FTYPE (3, (VOID, USI, USI, SI))
DEF_LARCH_FTYPE (3, (VOID, USI, UDI, SI))
These built-in functions are available for LoongArch.
-Data Type Description:
+@menu
+* Data Types::
+* Directly-mapped Builtin Functions::
+* Directly-mapped Division Builtin Functions::
+* Other Builtin Functions::
+@end menu
+
+@node Data Types
+@subsubsection Data Types
+
@itemize
@item @code{imm0_31}, a compile-time constant in range 0 to 31;
@item @code{imm0_16383}, a compile-time constant in range 0 to 16383;
@item @code{imm_n2048_2047}, a compile-time constant in range -2048 to 2047;
@end itemize
+@node Directly-mapped Builtin Functions
+@subsubsection Directly-mapped Builtin Functions
+
The intrinsics provided are listed below:
@smallexample
unsigned int __builtin_loongarch_movfcsr2gr (imm0_31)
void __break (imm0_32767)
@end smallexample
+@node Directly-mapped Division Builtin Functions
+@subsubsection Directly-mapped Division Builtin Functions
+
These intrinsic functions are available by including @code{larchintrin.h} and
using @option{-mfrecipe}.
@smallexample
double __frsqrte_d (double);
@end smallexample
+@node Other Builtin Functions
+@subsubsection Other Builtin Functions
+
Additional built-in functions are available for LoongArch family
processors to efficiently use 128-bit floating-point (__float128)
values.
The interface is made available by including @code{<lsxintrin.h>} and using
@option{-mlsx}.
+@menu
+* SX Data Types::
+* Directly-mapped SX Builtin Functions::
+* Directly-mapped SX Division Builtin Functions::
+@end menu
+
+@node SX Data Types
+@subsubsection SX Data Types
+
The following vectors typedefs are included in @code{lsxintrin.h}:
@itemize
@item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047.
@end itemize
+@node Directly-mapped SX Builtin Functions
+@subsubsection Directly-mapped SX Builtin Functions
+
For convenience, GCC defines functions @code{__lsx_vrepli_@{b/h/w/d@}} and
@code{__lsx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows:
__m128i __lsx_vxor_v (__m128i, __m128i);
@end smallexample
+@node Directly-mapped SX Division Builtin Functions
+@subsubsection Directly-mapped SX Division Builtin Functions
+
These intrinsic functions are available by including @code{lsxintrin.h} and
using @option{-mfrecipe} and @option{-mlsx}.
@smallexample
instructions. The interface is made available by including @code{<lasxintrin.h>}
and using @option{-mlasx}.
+@menu
+* ASX Data Types::
+* Directly-mapped ASX Builtin Functions::
+* Directly-mapped ASX Division Builtin Functions::
+* Directly-mapped SX and ASX Conversion Builtin Functions::
+@end menu
+
+@node ASX Data Types
+@subsubsection ASX Data Types
+
The following vectors typedefs are included in @code{lasxintrin.h}:
@itemize
@item @code{imm_n2048_2047}, an integer literal in range -2048 to 2047.
@end itemize
+@node Directly-mapped ASX Builtin Functions
+@subsubsection Directly-mapped ASX Builtin Functions
+
For convenience, GCC defines functions @code{__lasx_xvrepli_@{b/h/w/d@}} and
@code{__lasx_b[n]z_@{v/b/h/w/d@}}, which are implemented as follows:
__m256i __lasx_xvxor_v (__m256i, __m256i);
@end smallexample
+@node Directly-mapped ASX Division Builtin Functions
+@subsubsection Directly-mapped ASX Division Builtin Functions
+
These intrinsic functions are available by including @code{lasxintrin.h} and
using @option{-mfrecipe} and @option{-mlasx}.
@smallexample
__m256 __lasx_xvfrsqrte_s (__m256);
@end smallexample
+@node Directly-mapped SX and ASX Conversion Builtin Functions
+@subsubsection Directly-mapped SX and ASX Conversion Builtin Functions
+
+These functions are available by including @code{lasxintrin.h} (which now
+also includes @code{lsxintrin.h}) and using @option{-mlasx}.  Eighteen new
+interface functions provide conversions between 128-bit and 256-bit vectors:
+@smallexample
+__m256 __lasx_cast_128_s (__m128);
+__m256d __lasx_cast_128_d (__m128d);
+__m256i __lasx_cast_128 (__m128i);
+__m256 __lasx_concat_128_s (__m128, __m128);
+__m256d __lasx_concat_128_d (__m128d, __m128d);
+__m256i __lasx_concat_128 (__m128i, __m128i);
+__m128 __lasx_extract_128_lo_s (__m256);
+__m128 __lasx_extract_128_hi_s (__m256);
+__m128d __lasx_extract_128_lo_d (__m256d);
+__m128d __lasx_extract_128_hi_d (__m256d);
+__m128i __lasx_extract_128_lo (__m256i);
+__m128i __lasx_extract_128_hi (__m256i);
+__m256 __lasx_insert_128_lo_s (__m256, __m128);
+__m256 __lasx_insert_128_hi_s (__m256, __m128);
+__m256d __lasx_insert_128_lo_d (__m256d, __m128d);
+__m256d __lasx_insert_128_hi_d (__m256d, __m128d);
+__m256i __lasx_insert_128_lo (__m256i, __m128i);
+__m256i __lasx_insert_128_hi (__m256i, __m128i);
+@end smallexample
+
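+For example, a 256-bit vector can be split into its two 128-bit halves,
+processed with LSX intrinsics, and reassembled.  The following is only an
+illustrative sketch; it assumes @option{-mlasx} (which also enables the LSX
+intrinsics) and uses the LSX intrinsic @code{__lsx_vadd_d}.
+
+@smallexample
+#include <lasxintrin.h>
+
+/* Add the low and high 128-bit halves of a 256-bit integer vector and
+   broadcast the 128-bit sum into both halves of the result.  */
+__m256i
+sum_halves (__m256i v)
+@{
+  __m128i lo = __lasx_extract_128_lo (v);
+  __m128i hi = __lasx_extract_128_hi (v);
+  __m128i sum = __lsx_vadd_d (lo, hi);
+  return __lasx_concat_128 (sum, sum);
+@}
+@end smallexample
+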
+When GCC does not provide these conversion interfaces (i.e.
+@code{__loongarch_asx_sx_conv} is not defined), the following code can be
+used as an equivalent substitute.
+
+@smallexample
+
+ #ifndef __loongarch_asx_sx_conv
+
+ #include <lasxintrin.h>
+ #include <lsxintrin.h>
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128_s (__m128 src)
+ @{
+ __m256 dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128_d (__m128d src)
+ @{
+ __m256d dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_cast_128 (__m128i src)
+ @{
+ __m256i dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128_s (__m128 src1, __m128 src2)
+ @{
+ __m256 dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128_d (__m128d src1, __m128d src2)
+ @{
+ __m256d dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_concat_128 (__m128i src1, __m128i src2)
+ @{
+ __m256i dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo_s (__m256 src)
+ @{
+ __m128 dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo_d (__m256d src)
+ @{
+ __m128d dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_lo (__m256i src)
+ @{
+ __m128i dest;
+ asm ("" : "=f"(dest) : "0"(src));
+ return dest;
+ @}
+
+ __m128 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi_s (__m256 src)
+ @{
+ __m128 dest;
+ asm ("xvpermi.d %u0,%u1,0xe\n"
+ : "=f"(dest)
+ : "f"(src));
+ return dest;
+ @}
+
+ __m128d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi_d (__m256d src)
+ @{
+ __m128d dest;
+ asm ("xvpermi.d %u0,%u1,0xe\n"
+ : "=f"(dest)
+ : "f"(src));
+ return dest;
+ @}
+
+ __m128i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_extract_128_hi (__m256i src)
+ @{
+ __m128i dest;
+ asm ("xvpermi.d %u0,%u1,0xe\n"
+ : "=f"(dest)
+ : "f"(src));
+ return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo_s (__m256 src1, __m128 src2)
+ @{
+ __m256 dest;
+ asm ("xvpermi.q %u0,%u2,0x30\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo_d (__m256d src1, __m128d src2)
+ @{
+ __m256d dest;
+ asm ("xvpermi.q %u0,%u2,0x30\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_lo (__m256i src1, __m128i src2)
+ @{
+ __m256i dest;
+ asm ("xvpermi.q %u0,%u2,0x30\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256 inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_hi_s (__m256 src1, __m128 src2)
+ @{
+ __m256 dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256d inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_hi_d (__m256d src1, __m128d src2)
+ @{
+ __m256d dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+
+ __m256i inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __lasx_insert_128_hi (__m256i src1, __m128i src2)
+ @{
+ __m256i dest;
+ asm ("xvpermi.q %u0,%u2,0x02\n"
+ : "=f"(dest)
+ : "0"(src1), "f"(src2));
+ return dest;
+ @}
+ #endif
+
+@end smallexample
+
@node MIPS DSP Built-in Functions
@subsection MIPS DSP Built-in Functions
--- /dev/null
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+int
+main ()
+{
+ __m128i __m128i_op0, __m128i_op1, __m128i_op2, __m128i_out, __m128i_result;
+ __m128 __m128_op0, __m128_op1, __m128_op2, __m128_out, __m128_result;
+ __m128d __m128d_op0, __m128d_op1, __m128d_op2, __m128d_out, __m128d_result;
+
+ __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+ __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result;
+ __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result;
+
+ //__m128_op0={1,2,3,4},__m128_op1={5,6,7,8};
+ *((int *)&__m128_op0[3]) = 0x40800000;
+ *((int *)&__m128_op0[2]) = 0x40400000;
+ *((int *)&__m128_op0[1]) = 0x40000000;
+ *((int *)&__m128_op0[0]) = 0x3f800000;
+ *((int *)&__m128_op1[3]) = 0x41000000;
+ *((int *)&__m128_op1[2]) = 0x40e00000;
+ *((int *)&__m128_op1[1]) = 0x40c00000;
+ *((int *)&__m128_op1[0]) = 0x40a00000;
+ *((int *)&__m256_result[7]) = 0x41000000;
+ *((int *)&__m256_result[6]) = 0x40e00000;
+ *((int *)&__m256_result[5]) = 0x40c00000;
+ *((int *)&__m256_result[4]) = 0x40a00000;
+ *((int *)&__m256_result[3]) = 0x40800000;
+ *((int *)&__m256_result[2]) = 0x40400000;
+ *((int *)&__m256_result[1]) = 0x40000000;
+ *((int *)&__m256_result[0]) = 0x3f800000;
+ __m256_out = __lasx_concat_128_s (__m128_op0, __m128_op1);
+ ASSERTEQ_32 (__LINE__, __m256_result, __m256_out);
+ __m256_out = __lasx_cast_128_s (__m128_op0);
+ ASSERTEQ_32 (__LINE__, __m256_out, __m128_op0);
+
+ //__m128i_op0={1,2},__m128i_op1={3,4};
+ *((unsigned long *)&__m128i_op0[1]) = 0x2;
+ *((unsigned long *)&__m128i_op0[0]) = 0x1;
+ *((unsigned long *)&__m128i_op1[1]) = 0x4;
+ *((unsigned long *)&__m128i_op1[0]) = 0x3;
+ *((unsigned long *)&__m256i_result[3]) = 0x4;
+ *((unsigned long *)&__m256i_result[2]) = 0x3;
+ *((unsigned long *)&__m256i_result[1]) = 0x2;
+ *((unsigned long *)&__m256i_result[0]) = 0x1;
+ __m256i_out = __lasx_concat_128 (__m128i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out);
+ __m256i_out = __lasx_cast_128 (__m128i_op0);
+ ASSERTEQ_64 (__LINE__, __m256i_out, __m128i_op0);
+
+ //__m128d_op0={1,2},__m128d_op1={3,4};
+ *((unsigned long *)&__m128d_op0[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m128d_op0[0]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128d_op1[1]) = 0x4010000000000000;
+ *((unsigned long *)&__m128d_op1[0]) = 0x4008000000000000;
+ *((unsigned long *)&__m256d_result[3]) = 0x4010000000000000;
+ *((unsigned long *)&__m256d_result[2]) = 0x4008000000000000;
+ *((unsigned long *)&__m256d_result[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m256d_result[0]) = 0x3ff0000000000000;
+ __m256d_out = __lasx_concat_128_d (__m128d_op0, __m128d_op1);
+ ASSERTEQ_64 (__LINE__, __m256d_result, __m256d_out);
+ __m256d_out = __lasx_cast_128_d (__m128d_op0);
+ ASSERTEQ_64 (__LINE__, __m256d_out, __m128d_op0);
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** vinsgr2vr.d (\$vr[0-9]+),\$r8,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256
+foo1 (__m128 x, __m128 y)
+{
+ return __builtin_lasx_concat_128_s (x, y);
+}
+
+/*
+**foo2:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** vinsgr2vr.d (\$vr[0-9]+),\$r8,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256d
+foo2 (__m128d x, __m128d y)
+{
+ return __builtin_lasx_concat_128_d (x, y);
+}
+
+/*
+**foo3:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** vinsgr2vr.d (\$vr[0-9]+),\$r8,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256i
+foo3 (__m128i x, __m128i y)
+{
+ return __builtin_lasx_concat_128 (x, y);
+}
+
+/*
+**foo4:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256
+foo4 (__m128 x)
+{
+ return __builtin_lasx_cast_128_s (x);
+}
+
+/*
+**foo5:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256d
+foo5 (__m128d x)
+{
+ return __builtin_lasx_cast_128_d (x);
+}
+
+/*
+**foo6:
+** vinsgr2vr.d (\$vr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,1
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256i
+foo6 (__m128i x)
+{
+ return __builtin_lasx_cast_128 (x);
+}
--- /dev/null
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+extern void abort (void);
+int
+main ()
+{
+ __m128i __m128i_result0, __m128i_result1, __m128i_out, __m128i_result;
+ __m128 __m128_result0, __m128_result1, __m128_out, __m128_result;
+ __m128d __m128d_result0, __m128d_result1, __m128d_out, __m128d_result;
+
+ __m256i __m256i_op0, __m256i_op1, __m256i_op2, __m256i_out, __m256i_result;
+ __m256 __m256_op0, __m256_op1, __m256_op2, __m256_out, __m256_result;
+ __m256d __m256d_op0, __m256d_op1, __m256d_op2, __m256d_out, __m256d_result;
+
+ //__m256_op0 = {1,2,3,4,5,6,7,8};
+ *((int *)&__m256_op0[7]) = 0x41000000;
+ *((int *)&__m256_op0[6]) = 0x40e00000;
+ *((int *)&__m256_op0[5]) = 0x40c00000;
+ *((int *)&__m256_op0[4]) = 0x40a00000;
+ *((int *)&__m256_op0[3]) = 0x40800000;
+ *((int *)&__m256_op0[2]) = 0x40400000;
+ *((int *)&__m256_op0[1]) = 0x40000000;
+ *((int *)&__m256_op0[0]) = 0x3f800000;
+ *((int *)&__m128_result1[3]) = 0x41000000;
+ *((int *)&__m128_result1[2]) = 0x40e00000;
+ *((int *)&__m128_result1[1]) = 0x40c00000;
+ *((int *)&__m128_result1[0]) = 0x40a00000;
+ *((int *)&__m128_result0[3]) = 0x40800000;
+ *((int *)&__m128_result0[2]) = 0x40400000;
+ *((int *)&__m128_result0[1]) = 0x40000000;
+ *((int *)&__m128_result0[0]) = 0x3f800000;
+ __m128_out = __lasx_extract_128_lo_s (__m256_op0);
+ ASSERTEQ_32 (__LINE__, __m128_result0, __m128_out);
+ __m128_out = __lasx_extract_128_hi_s (__m256_op0);
+ ASSERTEQ_32 (__LINE__, __m128_result1, __m128_out);
+
+ //__m256i_op0 = {1,2,3,4};
+ *((unsigned long *)&__m256i_op0[3]) = 0x4;
+ *((unsigned long *)&__m256i_op0[2]) = 0x3;
+ *((unsigned long *)&__m256i_op0[1]) = 0x2;
+ *((unsigned long *)&__m256i_op0[0]) = 0x1;
+ *((unsigned long *)&__m128i_result0[1]) = 0x2;
+ *((unsigned long *)&__m128i_result0[0]) = 0x1;
+ *((unsigned long *)&__m128i_result1[1]) = 0x4;
+ *((unsigned long *)&__m128i_result1[0]) = 0x3;
+ __m128i_out = __lasx_extract_128_lo (__m256i_op0);
+ ASSERTEQ_64 (__LINE__, __m128i_result0, __m128i_out);
+ __m128i_out = __lasx_extract_128_hi (__m256i_op0);
+ ASSERTEQ_64 (__LINE__, __m128i_result1, __m128i_out);
+
+ //__m256d_op0 = {1,2,3,4};
+ *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000;
+ *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000;
+ *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128d_result0[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m128d_result0[0]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128d_result1[1]) = 0x4010000000000000;
+ *((unsigned long *)&__m128d_result1[0]) = 0x4008000000000000;
+ __m128d_out = __lasx_extract_128_lo_d (__m256d_op0);
+ ASSERTEQ_64 (__LINE__, __m128d_result0, __m128d_out);
+ __m128d_out = __lasx_extract_128_hi_d (__m256d_op0);
+ ASSERTEQ_64 (__LINE__, __m128d_result1, __m128d_out);
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1_lo:
+** vld (\$vr[0-9]+),\$r4,0
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128
+foo1_lo (__m256 x)
+{
+ return __lasx_extract_128_lo_s (x);
+}
+
+/*
+**foo1_hi:
+** xvld (\$xr[0-9]+),\$r4,0
+** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128
+foo1_hi (__m256 x)
+{
+ return __lasx_extract_128_hi_s (x);
+}
+
+/*
+**foo2_lo:
+** vld (\$vr[0-9]+),\$r4,0
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128d
+foo2_lo (__m256d x)
+{
+ return __lasx_extract_128_lo_d (x);
+}
+
+/*
+**foo2_hi:
+** xvld (\$xr[0-9]+),\$r4,0
+** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128d
+foo2_hi (__m256d x)
+{
+ return __lasx_extract_128_hi_d (x);
+}
+
+/*
+**foo3_lo:
+** vld (\$vr[0-9]+),\$r4,0
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128i
+foo3_lo (__m256i x)
+{
+ return __lasx_extract_128_lo (x);
+}
+
+/*
+**foo3_hi:
+** xvld (\$xr[0-9]+),\$r4,0
+** xvpermi.d (\$xr[0-9]+),(\$xr[0-9]+),0xe
+** vpickve2gr.du \$r4,(\$vr[0-9]+),0
+** vpickve2gr.du \$r5,(\$vr[0-9]+),1
+** jr \$r1
+*/
+__m128i
+foo3_hi (__m256i x)
+{
+ return __lasx_extract_128_hi (x);
+}
--- /dev/null
+/* { dg-options "-mabi=lp64d -O2 -mlasx -w -fno-strict-aliasing" } */
+
+#include "../simd_correctness_check.h"
+#include <lasxintrin.h>
+
+extern void abort (void);
+int
+main ()
+{
+ __m128i __m128i_op0, __m128i_op1, __m128i_out;
+ __m128 __m128_op0, __m128_op1, __m128_out;
+ __m128d __m128d_op0, __m128d_op1, __m128d_out;
+
+ __m256i __m256i_op0, __m256i_result0, __m256i_result1, __m256i_out;
+ __m256 __m256_op0, __m256_result0, __m256_result1, __m256_out;
+ __m256d __m256d_op0, __m256d_result0, __m256d_result1, __m256d_out;
+
+ //__m256_op0 = {1,2,3,4,5,6,7,8}, __m128_op0 ={9,9,9,9};
+ *((int *)&__m256_op0[7]) = 0x41000000;
+ *((int *)&__m256_op0[6]) = 0x40e00000;
+ *((int *)&__m256_op0[5]) = 0x40c00000;
+ *((int *)&__m256_op0[4]) = 0x40a00000;
+ *((int *)&__m256_op0[3]) = 0x40800000;
+ *((int *)&__m256_op0[2]) = 0x40400000;
+ *((int *)&__m256_op0[1]) = 0x40000000;
+ *((int *)&__m256_op0[0]) = 0x3f800000;
+ *((int *)&__m128_op0[3]) = 0x41100000;
+ *((int *)&__m128_op0[2]) = 0x41100000;
+ *((int *)&__m128_op0[1]) = 0x41100000;
+ *((int *)&__m128_op0[0]) = 0x41100000;
+ *((int *)&__m256_result0[7]) = 0x41000000;
+ *((int *)&__m256_result0[6]) = 0x40e00000;
+ *((int *)&__m256_result0[5]) = 0x40c00000;
+ *((int *)&__m256_result0[4]) = 0x40a00000;
+ *((int *)&__m256_result0[3]) = 0x41100000;
+ *((int *)&__m256_result0[2]) = 0x41100000;
+ *((int *)&__m256_result0[1]) = 0x41100000;
+ *((int *)&__m256_result0[0]) = 0x41100000;
+ *((int *)&__m256_result1[7]) = 0x41100000;
+ *((int *)&__m256_result1[6]) = 0x41100000;
+ *((int *)&__m256_result1[5]) = 0x41100000;
+ *((int *)&__m256_result1[4]) = 0x41100000;
+ *((int *)&__m256_result1[3]) = 0x40800000;
+ *((int *)&__m256_result1[2]) = 0x40400000;
+ *((int *)&__m256_result1[1]) = 0x40000000;
+ *((int *)&__m256_result1[0]) = 0x3f800000;
+ __m256_out = __lasx_insert_128_lo_s (__m256_op0, __m128_op0);
+ ASSERTEQ_32 (__LINE__, __m256_result0, __m256_out);
+ __m256_out = __lasx_insert_128_hi_s (__m256_op0, __m128_op0);
+ ASSERTEQ_32 (__LINE__, __m256_result1, __m256_out);
+
+ //__m256i_op0 ={1,2,3,4},__m128i_op0={5,6},__m128i_op1={7,8};
+ *((unsigned long *)&__m256i_op0[3]) = 0x4;
+ *((unsigned long *)&__m256i_op0[2]) = 0x3;
+ *((unsigned long *)&__m256i_op0[1]) = 0x2;
+ *((unsigned long *)&__m256i_op0[0]) = 0x1;
+ *((unsigned long *)&__m128i_op0[1]) = 0x6;
+ *((unsigned long *)&__m128i_op0[0]) = 0x5;
+ *((unsigned long *)&__m128i_op1[1]) = 0x8;
+ *((unsigned long *)&__m128i_op1[0]) = 0x7;
+ *((unsigned long *)&__m256i_result0[3]) = 0x4;
+ *((unsigned long *)&__m256i_result0[2]) = 0x3;
+ *((unsigned long *)&__m256i_result0[1]) = 0x6;
+ *((unsigned long *)&__m256i_result0[0]) = 0x5;
+ *((unsigned long *)&__m256i_result1[3]) = 0x8;
+ *((unsigned long *)&__m256i_result1[2]) = 0x7;
+ *((unsigned long *)&__m256i_result1[1]) = 0x2;
+ *((unsigned long *)&__m256i_result1[0]) = 0x1;
+ __m256i_out = __lasx_insert_128_lo (__m256i_op0, __m128i_op0);
+ ASSERTEQ_64 (__LINE__, __m256i_result0, __m256i_out);
+ __m256i_out = __lasx_insert_128_hi (__m256i_op0, __m128i_op1);
+ ASSERTEQ_64 (__LINE__, __m256i_result1, __m256i_out);
+
+ //__m256d_op0 ={1,2,3,4},__m128d_op0={5,6},__m128d_op1={7,8};
+ *((unsigned long *)&__m256d_op0[3]) = 0x4010000000000000;
+ *((unsigned long *)&__m256d_op0[2]) = 0x4008000000000000;
+ *((unsigned long *)&__m256d_op0[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m256d_op0[0]) = 0x3ff0000000000000;
+ *((unsigned long *)&__m128d_op0[1]) = 0x4018000000000000;
+ *((unsigned long *)&__m128d_op0[0]) = 0x4014000000000000;
+ *((unsigned long *)&__m128d_op1[1]) = 0x4020000000000000;
+ *((unsigned long *)&__m128d_op1[0]) = 0x401c000000000000;
+ *((unsigned long *)&__m256d_result0[3]) = 0x4010000000000000;
+ *((unsigned long *)&__m256d_result0[2]) = 0x4008000000000000;
+ *((unsigned long *)&__m256d_result0[1]) = 0x4018000000000000;
+ *((unsigned long *)&__m256d_result0[0]) = 0x4014000000000000;
+ *((unsigned long *)&__m256d_result1[3]) = 0x4020000000000000;
+ *((unsigned long *)&__m256d_result1[2]) = 0x401c000000000000;
+ *((unsigned long *)&__m256d_result1[1]) = 0x4000000000000000;
+ *((unsigned long *)&__m256d_result1[0]) = 0x3ff0000000000000;
+ __m256d_out = __lasx_insert_128_lo_d (__m256d_op0, __m128d_op0);
+ ASSERTEQ_64 (__LINE__, __m256d_result0, __m256d_out);
+ __m256d_out = __lasx_insert_128_hi_d (__m256d_op0, __m128d_op1);
+ ASSERTEQ_64 (__LINE__, __m256d_result1, __m256d_out);
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile { target { loongarch64*-*-* } } } */
+/* { dg-options "-mabi=lp64d -O2 -mlasx" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <lasxintrin.h>
+
+/*
+**foo1:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256
+foo1 (__m256 x, __m128 y)
+{
+ return __builtin_lasx_insert_128_lo_s (x, y);
+}
+
+/*
+**foo2:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256
+foo2 (__m256 x, __m128 y)
+{
+ return __builtin_lasx_insert_128_hi_s (x, y);
+}
+
+/*
+**foo3:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256d
+foo3 (__m256d x, __m128d y)
+{
+ return __builtin_lasx_insert_128_lo_d (x, y);
+}
+
+/*
+**foo4:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256d
+foo4 (__m256d x, __m128d y)
+{
+ return __builtin_lasx_insert_128_hi_d (x, y);
+}
+
+/*
+**foo5:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x30
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256i
+foo5 (__m256i x, __m128i y)
+{
+ return __builtin_lasx_insert_128_lo (x, y);
+}
+
+/*
+**foo6:
+** vinsgr2vr.d (\$vr[0-9]+),\$r6,0
+** xvld (\$xr[0-9]+),\$r5,0
+** vinsgr2vr.d (\$vr[0-9]+),\$r7,1
+** xvpermi.q (\$xr[0-9]+),(\$xr[0-9]+),0x02
+** xvst (\$xr[0-9]+),\$r4,0
+** jr \$r1
+*/
+__m256i
+foo6 (__m256i x, __m128i y)
+{
+ return __builtin_lasx_insert_128_hi (x, y);
+}