aarch64,falkor: Ignore prefetcher hints for memmove tail

author Siddhesh Poyarekar <siddhesh@sourceware.org>

Thu, 10 May 2018 18:38:01 +0000 (00:08 +0530)

committer Siddhesh Poyarekar <siddhesh@sourceware.org>

Thu, 10 May 2018 18:38:02 +0000 (00:08 +0530)
author Siddhesh Poyarekar <siddhesh@sourceware.org>
Thu, 10 May 2018 18:38:01 +0000 (00:08 +0530)
committer Siddhesh Poyarekar <siddhesh@sourceware.org>
Thu, 10 May 2018 18:38:02 +0000 (00:08 +0530)
diff --git a/ChangeLog b/ChangeLog

index b2b66020e81e59d9a204ac04cd689256ebe68ef2..c3b2e03c3b8c2516b7450c5b44ae5510274822eb 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2018-05-11  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/multiarch/memmove_falkor.S
+       (__memmove_falkor): Use multiple registers to move data in
+       loop tail.
+
  2018-05-10  Joseph Myers  <joseph@codesourcery.com>
  
         * math/math-underflow.h: New file.
diff --git a/sysdeps/aarch64/multiarch/memmove_falkor.S b/sysdeps/aarch64/multiarch/memmove_falkor.S

index 3375adf2de3069c3e987ca415422784e7a4e637d..c0d9560301e6433036921c75b1ac4ba27a90706e 100644 (file)
--- a/sysdeps/aarch64/multiarch/memmove_falkor.S
+++ b/sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -150,7 +150,6 @@ L(copy96):
  
         .p2align 4
  L(copy_long):
-       sub     count, count, 64 + 16   /* Test and readjust count.  */
         mov     B_l, Q_l
         mov     B_h, Q_h
         ldp     A_l, A_h, [src]
@@ -161,6 +160,8 @@ L(copy_long):
         ldp     Q_l, Q_h, [src, 16]!
         stp     A_l, A_h, [dstin]
         ldp     A_l, A_h, [src, 16]!
+       subs    count, count, 32 + 64 + 16      /* Test and readjust count.  */
+       b.ls    L(last64)
  
  L(loop64):
         subs    count, count, 32
@@ -170,18 +171,22 @@ L(loop64):
         ldp     A_l, A_h, [src, 16]!
         b.hi    L(loop64)
  
-       /* Write the last full set of 32 bytes.  The remainder is at most 32
-          bytes, so it is safe to always copy 32 bytes from the end even if
-          there is just 1 byte left.  */
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+          from the end.  */
  L(last64):
-       ldp     C_l, C_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -64]
         stp     Q_l, Q_h, [dst, 16]
-       ldp     Q_l, Q_h, [srcend, -16]
-       stp     A_l, A_h, [dst, 32]
-       stp     C_l, C_h, [dstend, -32]
-       stp     Q_l, Q_h, [dstend, -16]
         mov     Q_l, B_l
         mov     Q_h, B_h
+       ldp     B_l, B_h, [srcend, -48]
+       stp     A_l, A_h, [dst, 32]
+       ldp     A_l, A_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     C_l, C_h, [dstend, -64]
+       stp     B_l, B_h, [dstend, -48]
+       stp     A_l, A_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
         ret
  
         .p2align 4
@@ -204,7 +209,8 @@ L(move_long):
         sub     count, count, tmp1
         ldp     A_l, A_h, [srcend, -16]!
         sub     dstend, dstend, tmp1
-       sub     count, count, 64
+       subs    count, count, 32 + 64
+       b.ls    2f
  
  1:
         subs    count, count, 32
@@ -214,18 +220,22 @@ L(move_long):
         ldp     A_l, A_h, [srcend, -16]!
         b.hi    1b
  
-       /* Write the last full set of 32 bytes.  The remainder is at most 32
-          bytes, so it is safe to always copy 32 bytes from the start even if
-          there is just 1 byte left.  */
+       /* Write the last full set of 64 bytes.  The remainder is at most 64
+          bytes and at least 33 bytes, so it is safe to always copy 64 bytes
+          from the start.  */
  2:
-       ldp     C_l, C_h, [src, 16]
+       ldp     C_l, C_h, [src, 48]
         stp     Q_l, Q_h, [dstend, -16]
-       ldp     Q_l, Q_h, [src]
-       stp     A_l, A_h, [dstend, -32]
-       stp     C_l, C_h, [dstin, 16]
-       stp     Q_l, Q_h, [dstin]
         mov     Q_l, B_l
         mov     Q_h, B_h
+       ldp     B_l, B_h, [src, 32]
+       stp     A_l, A_h, [dstend, -32]
+       ldp     A_l, A_h, [src, 16]
+       ldp     D_l, D_h, [src]
+       stp     C_l, C_h, [dstin, 48]
+       stp     B_l, B_h, [dstin, 32]
+       stp     A_l, A_h, [dstin, 16]
+       stp     D_l, D_h, [dstin]
  3:     ret
  
  END (__memmove_falkor)
author	Siddhesh Poyarekar <siddhesh@sourceware.org>
	Thu, 10 May 2018 18:38:01 +0000 (00:08 +0530)
committer	Siddhesh Poyarekar <siddhesh@sourceware.org>
	Thu, 10 May 2018 18:38:02 +0000 (00:08 +0530)
ChangeLog		patch | blob | blame | history
sysdeps/aarch64/multiarch/memmove_falkor.S		patch | blob | blame | history