x86-64: Add memmove family functions with 256-bit EVEX
author    H.J. Lu <hjl.tools@gmail.com>  Fri, 5 Mar 2021 14:46:08 +0000 (06:46 -0800)
committer H.J. Lu <hjl.tools@gmail.com>  Thu, 27 Jan 2022 20:47:19 +0000 (12:47 -0800)
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
instructions using the YMM16-YMM31 registers when AVX512VL is usable.  This
avoids RTM aborts, since VZEROUPPER isn't needed at function exit.

(cherry picked from commit 63ad43566f7a25d140dc723598aeb441ad657eed)
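
Background, not part of this commit: glibc picks among these variants
through an IFUNC resolver that runs once at relocation time.  A minimal
standalone sketch of the same mechanism, using GCC's ifunc attribute with
purely hypothetical names and feature predicate:

    #include <stddef.h>

    extern void *memmove_evex (void *, const void *, size_t);
    extern void *memmove_avx (void *, const void *, size_t);
    extern int cpu_has_avx512vl (void);   /* hypothetical predicate */

    static __typeof__ (memmove_evex) *
    my_memmove_resolver (void)
    {
      /* Prefer the EVEX variant when AVX512VL is available.  */
      return cpu_has_avx512vl () ? memmove_evex : memmove_avx;
    }

    /* my_memmove binds to whichever function the resolver returns.  */
    void *my_memmove (void *, const void *, size_t)
      __attribute__ ((ifunc ("my_memmove_resolver")));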

sysdeps/x86_64/multiarch/Makefile
sysdeps/x86_64/multiarch/ifunc-impl-list.c
sysdeps/x86_64/multiarch/ifunc-memmove.h
sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S [new file with mode: 0644]
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 38349a690b3d7f32592108c73f1a240326627e0b..e75805a64530371269136c7afa20e8119945833e 100644
@@ -42,6 +42,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
                   memset-avx2-unaligned-erms \
                   memset-avx512-unaligned-erms \
                   memchr-evex \
+                  memmove-evex-unaligned-erms \
                   memrchr-evex \
                   rawmemchr-evex \
                   stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 680c2c070fe436a899bb15e97264b8b1adc2b10a..d7814a965f42f5ce1e0ebfa65bd88951701bce36 100644
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memmove_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_chk_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_chk_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memmove_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memmove,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memmove_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, memmove,
                              HAS_ARCH_FEATURE (AVX512F_Usable),
                              __memmove_avx512_no_vzeroupper)
@@ -553,6 +565,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memcpy_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_chk_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_chk_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __memcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __memcpy_chk_ssse3_back)
@@ -575,6 +593,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, memcpy,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __memcpy_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
                              __memcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -611,6 +635,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __mempcpy_chk_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_chk_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_chk_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
                              HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_chk_ssse3_back)
@@ -642,6 +672,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
              IFUNC_IMPL_ADD (array, i, mempcpy,
                              HAS_ARCH_FEATURE (AVX_Usable),
                              __mempcpy_avx_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_evex_unaligned_erms)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
                              __mempcpy_ssse3_back)
              IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
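
Note: the IFUNC_IMPL_ADD entries above populate the list returned by
__libc_ifunc_impl_list, which glibc's string tests enumerate so that every
usable implementation gets exercised; they do not by themselves change
runtime dispatch.  Conceptually, each entry records a (name, feature check,
function) triple; a simplified mock, not glibc's actual definitions:

    #include <stdio.h>

    struct impl_entry
    {
      const char *name;  /* e.g. "__memmove_evex_unaligned" */
      int usable;        /* result of the feature test (AVX512VL etc.) */
      void *fn;          /* the candidate implementation */
    };

    static void
    print_usable (const struct impl_entry *entries, int count)
    {
      for (int i = 0; i < count; i++)
        if (entries[i].usable)
          puts (entries[i].name);
    }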
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5b1eb1c92c2f199be9339b65fe48a156f046223c..83db955826b1d87ba3f26291c078bb14571c9082 100644
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
 
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
-      if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
-       return OPTIMIZE (avx_unaligned_erms);
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+       {
+         if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+           return OPTIMIZE (evex_unaligned_erms);
+
+         return OPTIMIZE (evex_unaligned);
+       }
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+       {
+         if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+           return OPTIMIZE (avx_unaligned_erms);
 
-      return OPTIMIZE (avx_unaligned);
+         return OPTIMIZE (avx_unaligned);
+       }
     }
 
   if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3)
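
The new selector prefers EVEX whenever AVX512VL is usable, even under
Prefer_No_VZEROUPPER, because the EVEX paths never execute VZEROUPPER; the
VEX-encoded AVX variants are returned only when VZEROUPPER is acceptable.
The same decision tree restated as a plain C sketch, with hypothetical
boolean helpers standing in for the CPU_FEATURES_* macros:

    extern int has_avx_fast_unaligned_load (void);
    extern int has_avx512vl (void);
    extern int has_erms (void);
    extern int prefer_no_vzeroupper (void);

    const char *
    pick_memmove (void)
    {
      if (has_avx_fast_unaligned_load ())
        {
          if (has_avx512vl ())
            return has_erms () ? "evex_unaligned_erms" : "evex_unaligned";

          if (!prefer_no_vzeroupper ())
            return has_erms () ? "avx_unaligned_erms" : "avx_unaligned";
        }
      /* SSSE3 and SSE2 fallbacks follow here, as in the full selector.  */
      return "sse2_unaligned";
    }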
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 0000000..b879007
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,26 @@
+#if IS_IN (libc)
+# define VEC_SIZE      32
+# define XMM0          xmm16
+# define XMM1          xmm17
+# define YMM0          ymm16
+# define YMM1          ymm17
+# define VEC0          ymm16
+# define VEC1          ymm17
+# define VEC2          ymm18
+# define VEC3          ymm19
+# define VEC4          ymm20
+# define VEC5          ymm21
+# define VEC6          ymm22
+# define VEC7          ymm23
+# define VEC8          ymm24
+# define VEC(i)                VEC##i
+# define VMOVNT                vmovntdq
+# define VMOVU         vmovdqu64
+# define VMOVA         vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p)            p##.evex
+# define MEMMOVE_SYMBOL(p,s)   p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
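
The VEC(i) macro above relies on standard token pasting: VEC(0) expands to
VEC0, which the rescan then expands to ymm16.  A self-contained C
demonstration of the same two-step expansion (string values just for
printing):

    #include <stdio.h>

    #define VEC0 "ymm16"
    #define VEC1 "ymm17"
    #define VEC(i) VEC##i   /* VEC(0) -> VEC0 -> "ymm16" */

    int
    main (void)
    {
      printf ("VEC(0) = %s, VEC(1) = %s\n", VEC (0), VEC (1));
      return 0;
    }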
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 5aaadc233f1ea8f4fe226e0b2096856879448ee9..50fffeb5ce88283e07424e17b1b10d89a637cb4e 100644
 # define MEMMOVE_CHK_SYMBOL(p,s)       MEMMOVE_SYMBOL(p, s)
 #endif
 
+#ifndef XMM0
+# define XMM0                          xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0                          ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -312,20 +320,20 @@ L(less_vec):
 #if VEC_SIZE > 32
 L(between_32_63):
        /* From 32 to 63.  No branch when size == 32.  */
-       vmovdqu (%rsi), %ymm0
-       vmovdqu -32(%rsi,%rdx), %ymm1
-       vmovdqu %ymm0, (%rdi)
-       vmovdqu %ymm1, -32(%rdi,%rdx)
+       VMOVU   (%rsi), %YMM0
+       VMOVU   -32(%rsi,%rdx), %YMM1
+       VMOVU   %YMM0, (%rdi)
+       VMOVU   %YMM1, -32(%rdi,%rdx)
        VZEROUPPER
        ret
 #endif
 #if VEC_SIZE > 16
        /* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-       vmovdqu (%rsi), %xmm0
-       vmovdqu -16(%rsi,%rdx), %xmm1
-       vmovdqu %xmm0, (%rdi)
-       vmovdqu %xmm1, -16(%rdi,%rdx)
+       VMOVU   (%rsi), %XMM0
+       VMOVU   -16(%rsi,%rdx), %XMM1
+       VMOVU   %XMM0, (%rdi)
+       VMOVU   %XMM1, -16(%rdi,%rdx)
        ret
 #endif
 L(between_8_15):
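
With the loads and stores macroized this way, the EVEX build of these
short-copy paths issues vmovdqu64 on ymm16/ymm17 (or xmm16/xmm17) and
returns without VZEROUPPER, while the AVX2 build keeps vmovdqu on
ymm0/ymm1.  A standalone sketch of the property being exploited, as GCC
inline asm (illustrative only; assumes compilation with -mavx512vl):

    #include <stddef.h>

    /* Copy exactly 32 bytes with an EVEX-encoded move on ymm16.  Since
       ymm16-ymm31 are untouched by legacy SSE code, using only these
       registers never dirties the AVX upper state, so the function can
       return without VZEROUPPER and thus cannot trigger the RTM abort
       this commit works around.  */
    static inline void
    copy32_evex (void *dst, const void *src)
    {
      __asm__ volatile ("vmovdqu64 (%1), %%ymm16\n\t"
                        "vmovdqu64 %%ymm16, (%0)"
                        : /* no outputs */
                        : "r" (dst), "r" (src)
                        : "memory", "xmm16");
    }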