aarch64: Remove non-temporal load/stores from oryon-1's memset
author    Andrew Pinski <quic_apinski@quicinc.com>
Fri, 15 Nov 2024 03:03:20 +0000 (19:03 -0800)
committer Adhemerval Zanella <adhemerval.zanella@linaro.org>
Thu, 21 Nov 2024 14:32:23 +0000 (11:32 -0300)
The hardware architects have a new recommendation not to use
non-temporal load/stores for memset, so this patch removes that path.
I also found no measurable difference in memset speed with or without
the non-temporal load/stores.
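
For context, a minimal standalone sketch (not part of the patch; the labels
and register choices are assumptions for illustration only) contrasting the
ordinary paired store that the retained L(set_long) path uses with the
non-temporal variant removed here. stnp writes the same bytes as stp; it only
adds a hint that the stored data need not be kept in the cache.

        .text
        /* Illustrative only: write 32 bytes of x1 to [x0] with ordinary
           paired stores, which allocate the lines into the cache.  */
        .global store32_stp
store32_stp:
        stp     x1, x1, [x0]
        stp     x1, x1, [x0, 16]
        ret

        /* Same 32 bytes with non-temporal paired stores, hinting that the
           data will not be reused soon; this is the kind of sequence the
           removed L(set_long_with_nontemp) loop relied on.  */
        .global store32_stnp
store32_stnp:
        stnp    x1, x1, [x0]
        stnp    x1, x1, [x0, 16]
        ret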

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
sysdeps/aarch64/multiarch/memset_oryon1.S

index 6fa28a9bd030a70594274cc8d69a0c0ef3b47093..b63c16ec514093a47e295d41e66cfa444d6cdd94 100644 (file)
@@ -93,8 +93,6 @@ L(set_long):
        cmp     count, 256
        ccmp    valw, 0, 0, cs
        b.eq    L(try_zva)
-       cmp     count, #32768
-       b.hi    L(set_long_with_nontemp)
        /* Small-size or non-zero memset does not use DC ZVA. */
        sub     count, dstend, dst
 
@@ -117,30 +115,6 @@ L(set_long):
        stp     val, val, [dstend, -16]
        ret
 
-L(set_long_with_nontemp):
-       /* Small-size or non-zero memset does not use DC ZVA. */
-       sub     count, dstend, dst
-
-       /* Adjust count and bias for loop. By subtracting extra 1 from count,
-          it is easy to use tbz instruction to check whether loop tailing
-          count is less than 33 bytes, so as to bypass 2 unnecessary stps. */
-       sub     count, count, 64+16+1
-
-1:     stnp    val, val, [dst, 16]
-       stnp    val, val, [dst, 32]
-       stnp    val, val, [dst, 48]
-       stnp    val, val, [dst, 64]
-       add     dst, dst, #64
-       subs    count, count, 64
-       b.hs    1b
-
-       tbz     count, 5, 1f    /* Remaining count is less than 33 bytes? */
-       stnp    val, val, [dst, 16]
-       stnp    val, val, [dst, 32]
-1:     stnp    val, val, [dstend, -32]
-       stnp    val, val, [dstend, -16]
-       ret
-
 L(try_zva):
        /* Write the first and last 64 byte aligned block using stp rather
           than using DC ZVA as it is faster. */