]>
Commit | Line | Data |
---|---|---|
a1a638dd | 1 | /* memchr implemented using NEON. |
04277e02 | 2 | Copyright (C) 2011-2019 Free Software Foundation, Inc. |
a1a638dd AZ |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library. If not, see | |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
a1a638dd AZ |
18 | |
19 | #include <sysdep.h> | |
20 | ||
21 | /* For __ARM_NEON__ this file defines memchr. */ | |
22 | #ifndef __ARM_NEON__ | |
f8f72bc0 | 23 | # define memchr __memchr_neon |
a1a638dd AZ |
24 | # undef libc_hidden_builtin_def |
25 | # define libc_hidden_builtin_def(a) | |
26 | #endif | |
27 | ||
28 | .arch armv7-a | |
29 | .fpu neon | |
30 | ||
31 | ||
32 | /* Arguments */ | |
33 | #define srcin r0 | |
34 | #define chrin r1 | |
35 | #define cntin r2 | |
36 | ||
37 | /* Retval */ | |
38 | #define result r0 /* Live range does not overlap with srcin */ | |
39 | ||
40 | /* Working registers */ | |
41 | #define src r1 /* Live range does not overlap with chrin */ | |
42 | #define tmp r3 | |
43 | #define synd r0 /* No overlap with srcin or result */ | |
44 | #define soff r12 | |
45 | ||
46 | /* Working NEON registers */ | |
47 | #define vrepchr q0 | |
48 | #define vdata0 q1 | |
49 | #define vdata0_0 d2 /* Lower half of vdata0 */ | |
50 | #define vdata0_1 d3 /* Upper half of vdata0 */ | |
51 | #define vdata1 q2 | |
52 | #define vdata1_0 d4 /* Lower half of vhas_chr0 */ | |
53 | #define vdata1_1 d5 /* Upper half of vhas_chr0 */ | |
54 | #define vrepmask q3 | |
55 | #define vrepmask0 d6 | |
56 | #define vrepmask1 d7 | |
57 | #define vend q4 | |
58 | #define vend0 d8 | |
59 | #define vend1 d9 | |
60 | ||
61 | /* | |
62 | * Core algorithm: | |
63 | * | |
64 | * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per | |
65 | * byte. Each bit is set if the relevant byte matched the requested character | |
66 | * and cleared otherwise. Since the bits in the syndrome reflect exactly the | |
67 | * order in which things occur in the original string, counting trailing zeros | |
68 | * allows to identify exactly which byte has matched. | |
69 | */ | |
70 | ||
a1a638dd | 71 | .thumb_func |
a1a638dd AZ |
72 | .p2align 4,,15 |
73 | ||
74 | ENTRY(memchr) | |
75 | /* Use a simple loop if there are less than 8 bytes to search. */ | |
76 | cmp cntin, #7 | |
77 | bhi .Llargestr | |
78 | and chrin, chrin, #0xff | |
79 | ||
80 | .Lsmallstr: | |
81 | subs cntin, cntin, #1 | |
82 | blo .Lnotfound /* Return not found if reached end. */ | |
83 | ldrb tmp, [srcin], #1 | |
84 | cmp tmp, chrin | |
85 | bne .Lsmallstr /* Loop again if not found. */ | |
86 | /* Otherwise fixup address and return. */ | |
87 | sub result, srcin, #1 | |
88 | bx lr | |
89 | ||
90 | ||
91 | .Llargestr: | |
92 | vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ | |
93 | /* | |
94 | * Magic constant 0x8040201008040201 allows us to identify which lane | |
95 | * matches the requested byte. | |
96 | */ | |
97 | movw tmp, #0x0201 | |
98 | movt tmp, #0x0804 | |
99 | lsl soff, tmp, #4 | |
100 | vmov vrepmask0, tmp, soff | |
101 | vmov vrepmask1, tmp, soff | |
102 | /* Work with aligned 32-byte chunks */ | |
103 | bic src, srcin, #31 | |
104 | ands soff, srcin, #31 | |
105 | beq .Lloopintro /* Go straight to main loop if it's aligned. */ | |
106 | ||
107 | /* | |
108 | * Input string is not 32-byte aligned. We calculate the syndrome | |
109 | * value for the aligned 32 bytes block containing the first bytes | |
110 | * and mask the irrelevant part. | |
111 | */ | |
112 | vld1.8 {vdata0, vdata1}, [src:256]! | |
113 | sub tmp, soff, #32 | |
114 | adds cntin, cntin, tmp | |
115 | vceq.i8 vdata0, vdata0, vrepchr | |
116 | vceq.i8 vdata1, vdata1, vrepchr | |
117 | vand vdata0, vdata0, vrepmask | |
118 | vand vdata1, vdata1, vrepmask | |
119 | vpadd.i8 vdata0_0, vdata0_0, vdata0_1 | |
120 | vpadd.i8 vdata1_0, vdata1_0, vdata1_1 | |
121 | vpadd.i8 vdata0_0, vdata0_0, vdata1_0 | |
122 | vpadd.i8 vdata0_0, vdata0_0, vdata0_0 | |
123 | vmov synd, vdata0_0[0] | |
124 | ||
125 | /* Clear the soff lower bits */ | |
126 | lsr synd, synd, soff | |
127 | lsl synd, synd, soff | |
128 | /* The first block can also be the last */ | |
129 | bls .Lmasklast | |
130 | /* Have we found something already? */ | |
a1a638dd | 131 | cbnz synd, .Ltail |
f8f72bc0 | 132 | |
a1a638dd AZ |
133 | |
134 | .Lloopintro: | |
135 | vpush {vend} | |
136 | /* 264/265 correspond to d8/d9 for q4 */ | |
137 | cfi_adjust_cfa_offset (16) | |
138 | cfi_rel_offset (264, 0) | |
139 | cfi_rel_offset (265, 8) | |
140 | .p2align 3,,7 | |
141 | .Lloop: | |
142 | vld1.8 {vdata0, vdata1}, [src:256]! | |
143 | subs cntin, cntin, #32 | |
144 | vceq.i8 vdata0, vdata0, vrepchr | |
145 | vceq.i8 vdata1, vdata1, vrepchr | |
146 | /* If we're out of data we finish regardless of the result. */ | |
147 | bls .Lend | |
148 | /* Use a fast check for the termination condition. */ | |
149 | vorr vend, vdata0, vdata1 | |
150 | vorr vend0, vend0, vend1 | |
151 | vmov synd, tmp, vend0 | |
152 | orrs synd, synd, tmp | |
153 | /* We're not out of data, loop if we haven't found the character. */ | |
154 | beq .Lloop | |
155 | ||
156 | .Lend: | |
157 | vpop {vend} | |
158 | cfi_adjust_cfa_offset (-16) | |
159 | cfi_restore (264) | |
160 | cfi_restore (265) | |
161 | ||
162 | /* Termination condition found, let's calculate the syndrome value. */ | |
163 | vand vdata0, vdata0, vrepmask | |
164 | vand vdata1, vdata1, vrepmask | |
165 | vpadd.i8 vdata0_0, vdata0_0, vdata0_1 | |
166 | vpadd.i8 vdata1_0, vdata1_0, vdata1_1 | |
167 | vpadd.i8 vdata0_0, vdata0_0, vdata1_0 | |
168 | vpadd.i8 vdata0_0, vdata0_0, vdata0_0 | |
169 | vmov synd, vdata0_0[0] | |
a1a638dd AZ |
170 | cbz synd, .Lnotfound |
171 | bhi .Ltail /* Uses the condition code from | |
172 | subs cntin, cntin, #32 above. */ | |
a1a638dd AZ |
173 | |
174 | ||
175 | .Lmasklast: | |
176 | /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ | |
177 | neg cntin, cntin | |
178 | lsl synd, synd, cntin | |
179 | lsrs synd, synd, cntin | |
180 | it eq | |
181 | moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ | |
182 | ||
183 | ||
184 | .Ltail: | |
185 | /* Count the trailing zeros using bit reversing */ | |
186 | rbit synd, synd | |
187 | /* Compensate the last post-increment */ | |
188 | sub src, src, #32 | |
189 | /* Count the leading zeros */ | |
190 | clz synd, synd | |
191 | /* Compute the potential result and return */ | |
192 | add result, src, synd | |
193 | bx lr | |
194 | ||
195 | ||
196 | .Lnotfound: | |
197 | /* Set result to NULL if not found and return */ | |
198 | mov result, #0 | |
199 | bx lr | |
200 | ||
201 | END(memchr) | |
202 | libc_hidden_builtin_def (memchr) |