x86: Improve large memset perf with non-temporal stores [RHEL-29312]

author Noah Goldstein <goldstein.w.n@gmail.com>

Fri, 24 May 2024 17:38:50 +0000 (12:38 -0500)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Thu, 30 May 2024 17:36:09 +0000 (12:36 -0500)
author Noah Goldstein <goldstein.w.n@gmail.com>
Fri, 24 May 2024 17:38:50 +0000 (12:38 -0500)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Thu, 30 May 2024 17:36:09 +0000 (12:36 -0500)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

index 97839a22483b0613f5dcf7122c38d2dc3c64f5ca..637caadb406b2544fbc7d0544c6ce62e1cfd1ca8 100644 (file)
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,10 +21,13 @@
     2. If size is less than VEC, use integer register stores.
     3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
     4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. On machines ERMS feature, if size is greater or equal than
-      __x86_rep_stosb_threshold then REP STOSB will be used.
-   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
-      4 VEC stores and store 4 * VEC at a time until done.  */
+   5. If size is more than 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.
+   6. On machines with the ERMS feature, if size is in the range
+         [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+         then REP STOSB will be used.
+   7. If size >= __x86_shared_non_temporal_threshold, use
+         non-temporal stores.  */
  
  #include <sysdep.h>
  
@@ -147,6 +150,41 @@ L(entry_from_wmemset):
         VMOVU   %VMM(0), -VEC_SIZE(%rdi,%rdx)
         VMOVU   %VMM(0), (%rdi)
         VZEROUPPER_RETURN
+
+       /* If we have AVX512 mask instructions, put L(less_vec) close to
+          entry as it doesn't take much space and is likely a hot target.  */
+#ifdef USE_LESS_VEC_MASK_STORE
+    /* Align to ensure the L(less_vec) logic all fits in 1x cache line.  */
+       .p2align 6,, 47
+       .p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+       /* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+       /* Clear high bits from edi. Only keeping bits relevant to page
+          cross check. Note that we are using rax which is set in
+          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+       andl    $(PAGE_SIZE - 1), %edi
+       /* Check if VEC_SIZE store cross page. Mask stores suffer
+          serious performance degradation when it has to fault suppress.  */
+       cmpl    $(PAGE_SIZE - VEC_SIZE), %edi
+       /* This is generally considered a cold target.  */
+       ja      L(cross_page)
+# if VEC_SIZE > 32
+       movq    $-1, %rcx
+       bzhiq   %rdx, %rcx, %rcx
+       kmovq   %rcx, %k1
+# else
+       movl    $-1, %ecx
+       bzhil   %edx, %ecx, %ecx
+       kmovd   %ecx, %k1
+# endif
+       vmovdqu8 %VMM(0), (%rax){%k1}
+       VZEROUPPER_RETURN
+#endif
+
  #if defined USE_MULTIARCH && IS_IN (libc)
  END (MEMSET_SYMBOL (__memset, unaligned))
  
@@ -185,54 +223,6 @@ L(last_2x_vec):
  #endif
         VZEROUPPER_RETURN
  
-       /* If have AVX512 mask instructions put L(less_vec) close to
-          entry as it doesn't take much space and is likely a hot target.
-        */
-#ifdef USE_LESS_VEC_MASK_STORE
-       .p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
-       /* Less than 1 VEC.  */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-#  error Unsupported VEC_SIZE!
-# endif
-       /* Clear high bits from edi. Only keeping bits relevant to page
-          cross check. Note that we are using rax which is set in
-          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
-       andl    $(PAGE_SIZE - 1), %edi
-       /* Check if VEC_SIZE store cross page. Mask stores suffer
-          serious performance degradation when it has to fault suppress.
-        */
-       cmpl    $(PAGE_SIZE - VEC_SIZE), %edi
-       /* This is generally considered a cold target.  */
-       ja      L(cross_page)
-# if VEC_SIZE > 32
-       movq    $-1, %rcx
-       bzhiq   %rdx, %rcx, %rcx
-       kmovq   %rcx, %k1
-# else
-       movl    $-1, %ecx
-       bzhil   %edx, %ecx, %ecx
-       kmovd   %ecx, %k1
-# endif
-       vmovdqu8 %VMM(0), (%rax){%k1}
-       VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
-       /* Include L(stosb_local) here if including L(less_vec) between
-          L(stosb_more_2x_vec) and ENTRY. This is to cache align the
-          L(stosb_more_2x_vec) target.  */
-       .p2align 4,, 10
-L(stosb_local):
-       movzbl  %sil, %eax
-       mov     %RDX_LP, %RCX_LP
-       mov     %RDI_LP, %RDX_LP
-       rep     stosb
-       mov     %RDX_LP, %RAX_LP
-       VZEROUPPER_RETURN
-# endif
-#endif
-
  #if defined USE_MULTIARCH && IS_IN (libc)
         .p2align 4
  L(stosb_more_2x_vec):
@@ -318,21 +308,33 @@ L(return_vzeroupper):
         ret
  #endif
  
-       .p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+       .p2align 4
+#else
+       .p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
         /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
            range for 2-byte jump encoding.  */
  L(stosb_local):
+       cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+       jae     L(nt_memset)
         movzbl  %sil, %eax
         mov     %RDX_LP, %RCX_LP
         mov     %RDI_LP, %RDX_LP
         rep     stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+       /* Use xchg to save 1-byte (this helps align targets below).  */
+       xchg    %RDX_LP, %RAX_LP
+# else
         mov     %RDX_LP, %RAX_LP
-       VZEROUPPER_RETURN
  # endif
+       VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
         /* Define L(less_vec) only if not otherwise defined.  */
-       .p2align 4
+       .p2align 4,, 12
  L(less_vec):
         /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
            xmm). This is only does anything for AVX2.  */
@@ -423,4 +425,35 @@ L(between_2_3):
         movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
  #endif
         ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+       /* Force align so the loop doesn't cross a cache-line.  */
+       .p2align 4
+# endif
+       .p2align 4,, 7
+    /* Memset using non-temporal stores.  On entry rdi = dst, rdx = size.  */
+L(nt_memset):
+       VMOVU   %VMM(0), (VEC_SIZE * 0)(%rdi)   /* Unaligned head VEC.  */
+       leaq    (VEC_SIZE * -4)(%rdi, %rdx), %rdx       /* rdx = end - 4 VEC.  */
+    /* Align DST: round rdi up to the next VEC_SIZE boundary.  */
+       orq     $(VEC_SIZE * 1 - 1), %rdi
+       incq    %rdi
+       .p2align 4,, 7
+L(nt_loop):
+       VMOVNT  %VMM(0), (VEC_SIZE * 0)(%rdi)
+       VMOVNT  %VMM(0), (VEC_SIZE * 1)(%rdi)
+       VMOVNT  %VMM(0), (VEC_SIZE * 2)(%rdi)
+       VMOVNT  %VMM(0), (VEC_SIZE * 3)(%rdi)
+       subq    $(VEC_SIZE * -4), %rdi  /* rdi += 4 * VEC_SIZE.  */
+       cmpq    %rdx, %rdi
+       jb      L(nt_loop)      /* Loop while rdi is below end - 4 VEC.  */
+       sfence  /* Order the non-temporal stores before what follows.  */
+       VMOVU   %VMM(0), (VEC_SIZE * 0)(%rdx)   /* Unaligned last 4 VEC.  */
+       VMOVU   %VMM(0), (VEC_SIZE * 1)(%rdx)
+       VMOVU   %VMM(0), (VEC_SIZE * 2)(%rdx)
+       VMOVU   %VMM(0), (VEC_SIZE * 3)(%rdx)
+       VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Fri, 24 May 2024 17:38:50 +0000 (12:38 -0500)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Thu, 30 May 2024 17:36:09 +0000 (12:36 -0500)