sysdeps/aarch64/strlen.S

   1 /* Copyright (C) 2012-2019 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions:
  22  *
  23  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  24  */
  25
   26 #ifndef STRLEN
   27 # define STRLEN __strlen   /* Default entry-point name, used only when STRLEN is not already defined.  */
   28 #endif
  29
  30 /* To test the page crossing code path more thoroughly, compile with
  31    -DTEST_PAGE_CROSS - this will force all calls through the slower
  32    entry path.  This option is not intended for production use.  */
  33
   34 /* Arguments and results.  srcin and len deliberately share x0.  */
   35 #define srcin           x0      /* Input: pointer to the string.  */
   36 #define len             x0      /* Output: length; writing it destroys srcin.  */
   37
   38 /* Locals and temporaries.  has_nul1/tmp1 alias x4 and has_nul2/tmp2 alias x5.  */
   39 #define src             x1      /* Current (16-byte aligned) load address.  */
   40 #define data1           x2      /* First 8-byte word of the current 16-byte block.  */
   41 #define data2           x3      /* Second 8-byte word of the current 16-byte block.  */
   42 #define has_nul1        x4      /* NUL syndrome for data1 (same register as tmp1).  */
   43 #define has_nul2        x5      /* NUL syndrome for data2 (same register as tmp2).  */
   44 #define tmp1            x4
   45 #define tmp2            x5
   46 #define tmp3            x6
   47 #define tmp4            x7
   48 #define zeroones        x8      /* Loaded with REP8_01 on entry and kept for the whole routine.  */
  49
  50         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  51            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  52            can be done in parallel across the entire word. A faster check
  53            (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
  54            false hits for characters 129..255.  */
  55
   56 #define REP8_01 0x0101010101010101      /* 0x01 in every byte.  */
   57 #define REP8_7f 0x7f7f7f7f7f7f7f7f      /* 0x7f in every byte.  */
   58 #define REP8_80 0x8080808080808080      /* 0x80 in every byte.  */
   59
   60 #ifdef TEST_PAGE_CROSS
   61 # define MIN_PAGE_SIZE 16       /* Test-only: any srcin that is not 16-byte aligned takes L(page_cross).  */
   62 #else
   63 # define MIN_PAGE_SIZE 4096     /* Smallest page size assumed (see "Assumptions" above).  */
   64 #endif
  65
  66         /* Since strings are short on average, we check the first 16 bytes
  67            of the string for a NUL character.  In order to do an unaligned ldp
  68            safely we have to do a page cross check first.  If there is a NUL
  69            byte we calculate the length from the 2 8-byte words using
  70            conditional select to reduce branch mispredictions (it is unlikely
  71            strlen will be repeatedly called on strings with the same length).
  72
   73            If the string is longer than 16 bytes, we align src so that no
   74            further page cross checks are needed, and process 32 bytes per
   75            iteration using the fast NUL check.  If we encounter non-ASCII
   76            characters, fall back to a second loop using the full NUL check.
  77
  78            If the page cross check fails, we read 16 bytes from an aligned
  79            address, remove any characters before the string, and continue
  80            in the main loop using aligned loads.  Since strings crossing a
  81            page in the first 16 bytes are rare (probability of
  82            16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
  83
  84            AArch64 systems have a minimum page size of 4k.  We don't bother
  85            checking for larger page sizes - the cost of setting up the correct
  86            page size is just not worth the extra gain from a small reduction in
  87            the cases taking the slow path.  Note that we only care about
  88            whether the first fetch, which may be misaligned, crosses a page
  89            boundary.  */
  90
   91 ENTRY_ALIGN (STRLEN, 6)   /* In: x0 = srcin.  Out: x0 = len.  Writes x1-x8 and the flags; no stack use.  */
   92         DELOUSE (0)       /* DELOUSE is defined outside this file (presumably via sysdep.h).  */
   93         DELOUSE (1)       /* NOTE(review): x1 is only the temporary src here and is written before it is read - confirm this is needed.  */
   94         and     tmp1, srcin, MIN_PAGE_SIZE - 1  /* tmp1 = offset of srcin within a MIN_PAGE_SIZE page.  */
   95         mov     zeroones, REP8_01
   96         cmp     tmp1, MIN_PAGE_SIZE - 16
   97         b.gt    L(page_cross)   /* A 16-byte load at srcin could cross a page boundary.  */
   98         ldp     data1, data2, [srcin]   /* Possibly unaligned load of the first 16 bytes.  */
   99 #ifdef __AARCH64EB__
  100         /* For big-endian, carry propagation (if the final byte in the
  101            string is 0x01) means we cannot use has_nul1/2 directly.
  102            Since we expect strings to be small and early-exit,
  103            byte-swap the data now so has_nul1/2 will be correct.  */
  104         rev     data1, data1
  105         rev     data2, data2
  106 #endif
  107         sub     tmp1, data1, zeroones
  108         orr     tmp2, data1, REP8_7f
  109         sub     tmp3, data2, zeroones
  110         orr     tmp4, data2, REP8_7f
  111         bics    has_nul1, tmp1, tmp2    /* has_nul1 = (data1 - REP8_01) & ~(data1 | REP8_7f); sets Z.  */
  112         bic     has_nul2, tmp3, tmp4    /* Same syndrome for data2; flags untouched.  */
  113         ccmp    has_nul2, 0, 0, eq      /* If has_nul1 == 0 compare has_nul2 with 0, else set NZCV = 0.  */
  114         beq     L(main_loop_entry)      /* Both syndromes zero: no NUL in the first 16 bytes.  */
  115
  116         /* Enter with C = has_nul1 == 0, i.e. C is set iff the NUL is in data2.  */
  117         csel    has_nul1, has_nul1, has_nul2, cc        /* Syndrome of the word holding the first NUL.  */
  118         mov     len, 8          /* Overwrites srcin (same register); it is no longer needed on this path.  */
  119         rev     has_nul1, has_nul1      /* Byte-reverse so that clz locates the first NUL byte in memory order.  */
  120         clz     tmp1, has_nul1          /* Bit index of the marker; lsr 3 below turns it into a byte index.  */
  121         csel    len, xzr, len, cc       /* Base offset: 0 if the NUL is in data1, else 8.  */
  122         add     len, len, tmp1, lsr 3
  123         ret
  124
  125         /* The inner loop processes 32 bytes per iteration and uses the fast
  126            NUL check.  If we encounter non-ASCII characters, use a second
  127            loop with the accurate NUL check.  */
  128         .p2align 4
  129 L(main_loop_entry):
  130         bic     src, srcin, 15          /* Align down to 16 bytes.  */
  131         sub     src, src, 16            /* Bias so the pre-indexed load below starts at (srcin & ~15) + 16.  */
  132 L(main_loop):
  133         ldp     data1, data2, [src, 32]!        /* src += 32, then load 16 bytes from the new src.  */
  134 L(page_cross_entry):
  135         sub     tmp1, data1, zeroones
  136         sub     tmp3, data2, zeroones
  137         orr     tmp2, tmp1, tmp3
  138         tst     tmp2, zeroones, lsl 7   /* Fast check: (X - 1) & 0x80 for every byte of both words.  */
  139         bne     1f
  140         ldp     data1, data2, [src, 16] /* Second 16 bytes of the iteration; src is not updated.  */
  141         sub     tmp1, data1, zeroones
  142         sub     tmp3, data2, zeroones
  143         orr     tmp2, tmp1, tmp3
  144         tst     tmp2, zeroones, lsl 7
  145         beq     L(main_loop)
  146         add     src, src, 16            /* Make src address the block that is now in data1/data2.  */
  147 1:
  148         /* The fast check failed, so do the slower, accurate NUL check.  */
  149         orr     tmp2, data1, REP8_7f
  150         orr     tmp4, data2, REP8_7f
  151         bics    has_nul1, tmp1, tmp2    /* Reuses tmp1/tmp3 (data - REP8_01) from the fast check.  */
  152         bic     has_nul2, tmp3, tmp4
  153         ccmp    has_nul2, 0, 0, eq
  154         beq     L(nonascii_loop)        /* False hit from the fast check: no real NUL in this block.  */
  155
  156         /* Enter with C = has_nul1 == 0.  */
  157 L(tail):
  158 #ifdef __AARCH64EB__
  159         /* For big-endian, carry propagation (if the final byte in the
  160            string is 0x01) means we cannot use has_nul1/2 directly.  The
  161            easiest way to get the correct byte is to byte-swap the data
  162            and calculate the syndrome a second time.  */
  163         csel    data1, data1, data2, cc
  164         rev     data1, data1
  165         sub     tmp1, data1, zeroones
  166         orr     tmp2, data1, REP8_7f
  167         bic     has_nul1, tmp1, tmp2
  168 #else
  169         csel    has_nul1, has_nul1, has_nul2, cc
  170 #endif
  171         sub     len, src, srcin         /* Offset of data1 from the string start; negative when src < srcin (page_cross path).  */
  172         rev     has_nul1, has_nul1
  173         add     tmp2, len, 8
  174         clz     tmp1, has_nul1
  175         csel    len, len, tmp2, cc      /* Add 8 when the NUL is in data2.  */
  176         add     len, len, tmp1, lsr 3
  177         ret
  178
  179 L(nonascii_loop):         /* Accurate NUL check, 16 bytes per load, two loads per iteration.  */
  180         ldp     data1, data2, [src, 16]!
  181         sub     tmp1, data1, zeroones
  182         orr     tmp2, data1, REP8_7f
  183         sub     tmp3, data2, zeroones
  184         orr     tmp4, data2, REP8_7f
  185         bics    has_nul1, tmp1, tmp2
  186         bic     has_nul2, tmp3, tmp4
  187         ccmp    has_nul2, 0, 0, eq
  188         bne     L(tail)
  189         ldp     data1, data2, [src, 16]!
  190         sub     tmp1, data1, zeroones
  191         orr     tmp2, data1, REP8_7f
  192         sub     tmp3, data2, zeroones
  193         orr     tmp4, data2, REP8_7f
  194         bics    has_nul1, tmp1, tmp2
  195         bic     has_nul2, tmp3, tmp4
  196         ccmp    has_nul2, 0, 0, eq
  197         beq     L(nonascii_loop)
  198         b       L(tail)
  199
  200         /* Load 16 bytes from [srcin & ~15] and OR 0x7f into the bytes that
  201            precede srcin (a whole word before srcin is set to all-ones), so
  202            any NUL bytes before the string are ignored.  Then continue in the aligned loop.  */
  203 L(page_cross):
  204         bic     src, srcin, 15
  205         ldp     data1, data2, [src]     /* 16-byte aligned, so this load stays within one page.  */
  206         lsl     tmp1, srcin, 3          /* srcin * 8; only the low 6 bits are used by the shift below.  */
  207         mov     tmp4, -1
  208 #ifdef __AARCH64EB__
  209         /* Big-endian.  Early bytes are at MSB.  */
  210         lsr     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
  211 #else
  212         /* Little-endian.  Early bytes are at LSB.  */
  213         lsl     tmp1, tmp4, tmp1        /* Shift (tmp1 & 63).  */
  214 #endif
  215         orr     tmp1, tmp1, REP8_80     /* ~tmp1 is now 0x7f in each byte before srcin and 0 elsewhere.  */
  216         orn     data1, data1, tmp1      /* data1 |= ~tmp1.  */
  217         orn     tmp2, data2, tmp1       /* Masked copy of data2, used if srcin lies in the second word.  */
  218         tst     srcin, 8                /* Z set: string starts in the first word of the block.  */
  219         csel    data1, data1, tmp4, eq  /* Otherwise the whole first word becomes all-ones.  */
  220         csel    data2, data2, tmp2, eq  /* Otherwise use the masked second word.  */
  221         b       L(page_cross_entry)
  222 END (STRLEN)
  223 weak_alias (STRLEN, strlen)
  224 libc_hidden_builtin_def (strlen)