/* memchr - find a character in a memory zone

   Copyright (C) 2015-2017 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.  */
#define srcin		x0	/* Incoming: pointer to the memory zone.  */
#define chrin		w1	/* Incoming: the character to search for.  */
#define cntin		x2	/* Incoming: number of bytes to search.  */

#define result		x0	/* Outgoing: pointer to match, or NULL.  */

#define src		x3	/* Current 32-byte-aligned read pointer.  */
#define tmp		x4	/* General scratch.  */
#define wtmp2		w5	/* Holds the 0x40100401 magic constant.  */
#define synd		x6	/* 64-bit syndrome: 2 bits per input byte.  */
#define soff		x9	/* Byte offset of srcin within its 32-byte block.  */
#define cntrem		x10	/* cntin & 31, for masking the final block.  */

#define vrepchr		v0	/* chrin replicated into all 16 byte lanes.  */
#define vdata1		v1	/* First 16 bytes of the current chunk.  */
#define vdata2		v2	/* Second 16 bytes of the current chunk.  */
#define vhas_chr1	v3	/* Per-byte compare result for vdata1.  */
#define vhas_chr2	v4	/* Per-byte compare result for vdata2.  */
#define vrepmask	v5	/* 0x40100401 replicated across all words.  */
#define vend		v6	/* Narrowed syndrome accumulator.  */

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte.  For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome).  Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows to
 * identify exactly which byte has matched.
 */
ENTRY (__memchr)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane
	 * matches the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	/* ands sets Z when srcin is already 32-byte aligned.  */
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned.  We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* cntin += soff - 32, i.e. subtract the bytes consumed by the
	   first (partial) block.  The flags from adds feed b.ls below.  */
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Keep one marker bit per byte lane (the magic-constant bit).  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	/* Extract the 64-bit syndrome (2 bits per input byte).  */
	mov	synd, vend.2d[0]
	/* Clear the soff*2 lower bits: bytes before srcin must not match.  */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last (cntin <= 0 after adds).  */
	b.ls	L(masklast)
	/* Have we found something already?  */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result.  */
	b.ls	L(end)
	/* Use a fast check for the termination condition: any byte set
	   anywhere means a possible match, no masking needed yet.  */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.2d[0]
	/* We're not out of data, loop if we haven't found the character.  */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.2d[0]
	/* Only do the clear for the last possible block (flags still hold
	   the result of the subs in the loop).  */
	b.hi	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits, so bytes
	   past the end of the search range cannot report a match.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing.  */
	rbit	synd, synd
	/* Compensate the last post-increment.  */
	sub	src, src, #32
	/* Check that we have found a character.  */
	cmp	synd, #0
	/* And count the leading zeros.  */
	clz	synd, synd
	/* Compute the potential result: 2 syndrome bits per byte, hence
	   the shift right by 1 to convert bit index to byte index.  */
	add	result, src, synd, lsr #1
	/* Select result or NULL (eq: syndrome was zero, no match).  */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret
END (__memchr)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)