From: Noah Goldstein
Date: Mon, 20 Jun 2022 20:02:10 +0000 (-0700)
Subject: x86: Replace all sse instructions with vex equivalents in avx+ files
X-Git-Tag: glibc-2.36~134
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3079f652d7cc34456aefb412677c01e758922527;p=thirdparty%2Fglibc.git

x86: Replace all sse instructions with vex equivalents in avx+ files

Most of these don't really matter, as there was no dirty upper state,
but we should generally avoid stray sse when it's not needed.

The one case that really matters is in svml_d_tanh4_core_avx2.S:

    blendvps %xmm0, %xmm8, %xmm7

where there was a dirty upper state.

Tested on x86_64-linux
---
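As a minimal sketch of the problem being fixed (illustrative only, not
taken from the glibc sources; it assumes some earlier AVX code has left
the upper halves of the YMM registers "dirty"): legacy-sse instructions
such as movsd merge into the destination and leave bits 128..255 of the
corresponding YMM register untouched, so executing one while those bits
are dirty can incur an SSE/AVX transition penalty on some
microarchitectures, whereas the vex forms (vmovsd, vmovss, vmovd, ...)
zero those bits and stay in AVX state:

	vaddpd	%ymm1, %ymm1, %ymm1	# AVX: upper YMM state now live
					# ("dirty").
	movsd	(%rsp), %xmm0		# Legacy sse: preserves bits 128..255
					# of %ymm0; with a dirty upper state
					# this may pay a transition penalty.
	vmovsd	(%rsp), %xmm0		# Vex: same load, but zeroes bits
					# 128..255 of %ymm0 -- no SSE/AVX
					# transition.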
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
index e19bddd2e2c..73025e8b0f1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -210,11 +210,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
index f4c72c36182..b8cc6dd7764 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -232,11 +232,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
index 5d0b23b72c3..126110cf17b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
@@ -372,11 +372,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
index b9a1131664f..db0ef3b9dd9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
@@ -317,11 +317,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
index ba96089504e..612a45da309 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
@@ -202,11 +202,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
index 0f5b773b045..e7b41ab232f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
@@ -224,11 +224,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
index 131b716c957..1fcbb245b71 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
@@ -429,11 +429,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
index 5bdc6859f0d..8445fc8ba40 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
@@ -343,11 +343,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
index 1b601576cc0..a45cae79a1f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
@@ -277,12 +277,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
index ef9581075d9..c3b0f7940cb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
@@ -295,12 +295,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
index b5cbfd224c3..c9c41ef9f40 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
@@ -339,11 +339,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
index 3193c026dd2..de4edb3cc00 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
@@ -274,11 +274,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
index 96ecbe05c13..71a25f3db85 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
@@ -262,11 +262,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cbrt@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
index 25df252108d..a3d9104f5ee 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
@@ -282,11 +282,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
index 066bbc7de6b..4ff0e038a36 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
index c832b65e3e1..6efd2e95ba0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
@@ -258,11 +258,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
index 77228814d33..42bdfe6f188 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
index 7271bcc1d9b..f519bcce45c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
index 40b01c3cd08..3f0c6701998 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
@@ -191,11 +191,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
index ced774e89c2..afa00a38bb7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
index 7a85fd8b18d..eee785dbf5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
@@ -227,11 +227,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
index 590341c2434..4a3202750f0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
index efae1f8b663..0fa17f3a733 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
@@ -211,11 +211,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
index ae5738c1b7a..5c693d132eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
@@ -231,12 +231,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
index 0c404fd5eec..a392252c8b1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
@@ -194,12 +194,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
index 2461c6ad565..9bf45a6dc2b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
index 5d129ef4e51..101618cce92 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
index 13235793e85..39ec0024cfe 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
@@ -263,11 +263,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
index dd55b5dd18d..3033fcb5b34 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
index 25d2edaae5d..84bdb2090df 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
index bcb6736dec3..b3e9bb3ca4c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
index ae166005798..ad2a06ad37d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
@@ -280,11 +280,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
index 075665d57da..7ca915e30fc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
@@ -271,11 +271,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
index 01c86736e79..f26daf316bc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
@@ -267,11 +267,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
index 376479035e6..0c90328b0a8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
@@ -239,11 +239,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
index 7ddf145b25a..ea41d326ebf 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
@@ -110,7 +110,7 @@ ENTRY(_ZGVdN4v_tanh_avx2)
 	vpcmpgtd %xmm11, %xmm9, %xmm10
 	vpcmpgtd %xmm8, %xmm9, %xmm0
 	vpand	%xmm10, %xmm9, %xmm7
-	blendvps %xmm0, %xmm8, %xmm7
+	vblendvps %xmm0, %xmm8, %xmm7, %xmm7
 
 	/*
 	 * VSHRIMM( I, iIndex, = iIndex, (17 - 4) );
@@ -272,11 +272,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
index 82c01195003..c995401a248 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
index 26fef1f2680..fd84977e952 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
index bf28a5dd005..078fe5a8989 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
index 3f44e75248d..65026e647de 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
@@ -290,11 +290,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
index 3a70fc14489..489dac033ca 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
index 4e9984d8709..2accef703e1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
index 59bea9dc424..257c8da2f7b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
@@ -187,11 +187,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
index 6b569ecf419..a0c27922e4f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
@@ -313,11 +313,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
index 794030a4811..d6f6c3d5aa3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
@@ -361,11 +361,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
index 56aa5bb917d..15ffa4b6c9b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
@@ -257,12 +257,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
index 29ebbb6db22..08b18c3e3f1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
@@ -238,12 +238,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f42462c5818..94186a14cbd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -222,13 +222,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input. */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index 43eb4238317..49ffd7a9b2f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -231,13 +231,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input. */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
index d24d36163dc..14b58c171a1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
@@ -304,11 +304,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	cbrtf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
index 6b740bf866a..d1a5ddf5b45 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
@@ -228,11 +228,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
index 6f29218af19..a00650ccd68 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
@@ -242,11 +242,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
index 9daaa0c06da..5fb5b2f0f76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
@@ -218,11 +218,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
index 4cafc1bcd58..60b9fab000c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
index eb9f3f8d8b8..10f0b2cb379 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
@@ -186,11 +186,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
index 11244d5a5f2..275ab42529c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
@@ -238,11 +238,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
index 5b406c6e32e..8a5f1e39858 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
@@ -209,11 +209,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
index f7a80a4d64e..cc87e66425b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
@@ -188,11 +188,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
index 71d23e632ca..7fe830daa4b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
@@ -194,11 +194,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
index 73f862528a0..d5d7fa2791f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
@@ -212,11 +212,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
index 548936fe616..c92e3ab0655 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
@@ -202,12 +202,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
index fc97828008a..7a26c5accc0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
@@ -226,12 +226,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
index b192dfe464b..0eb9b23c4e9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
@@ -161,11 +161,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
index ea51c28f812..4bdc62e90e0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
@@ -174,11 +174,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
index 8fa5068595f..2c864f0c0e7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
index 54d6a9a685c..7326a2b5ad8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
@@ -190,11 +190,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
index 3b0a28fee05..02b255dde81 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
@@ -158,11 +158,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
index eaa51121784..2245d40f84b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
@@ -169,11 +169,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
index fad4847f284..89be733eb26 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
@@ -252,11 +252,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
index 8c4b46cee29..e358e2efee8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index f2a18f0b2c2..4e18cdc0ce8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -235,11 +235,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	tanf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
index cd33fac6435..d34e61ac414 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%ebx, %r13d
-	movss	32(%rsp, %r13, 4), %xmm0
+	vmovss	32(%rsp, %r13, 4), %xmm0
 	call	tanf@PLT
 	# LOE r13 r14 r15 ebx r12d xmm0
 
-	movss	%xmm0, 64(%rsp, %r13, 4)
+	vmovss	%xmm0, 64(%rsp, %r13, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 7edc74a1166..84f73fdaf90 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -221,13 +221,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math fucntion call to process special input. */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index 55df346a00c..ea3e9f4210d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -240,13 +240,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input. */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serialized stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index bd26ba80d55..eb128a2ae33 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -49,7 +49,7 @@
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY(STRRCHR)
-	movd	%esi, %xmm7
+	vmovd	%esi, %xmm7
 	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4. */
 	VPBROADCAST %xmm7, %ymm7
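
A note on the blendvps rewrite in svml_d_tanh4_core_avx2.S above, the one
case the commit message singles out: besides switching to a vex encoding,
the instruction form itself changes.  SSE4.1 blendvps takes its selection
mask implicitly in %xmm0 and overwrites its last operand, while vex
vblendvps takes the mask and the destination explicitly, which is why the
rewritten instruction gains a fourth operand.  A side-by-side sketch, with
register roles as in the hunk above:

	# SSE4.1: mask is (implicitly) %xmm0; %xmm7 is both a source and
	# the destination, and the upper bits of %ymm7 are left unchanged
	# (the dirty-upper-state hazard).
	blendvps  %xmm0, %xmm8, %xmm7

	# Vex: mask %xmm0, sources %xmm8/%xmm7, and destination %xmm7 are
	# all explicit; bits 128..255 of %ymm7 are zeroed.
	vblendvps %xmm0, %xmm8, %xmm7, %xmm7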