/* Copyright (C) 2013-2023 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

19 | #include <sysdep.h> | |
20 | ||
21 | /* Assumptions: | |
22 | * | |
23 | * ARMv8-a, AArch64 | |
24 | */ | |
25 | ||
26 | #define REP8_01 0x0101010101010101 | |
27 | #define REP8_7f 0x7f7f7f7f7f7f7f7f | |
a0b1cd88 MS |
28 | |
29 | /* Parameters and result. */ | |
30 | #define src1 x0 | |
31 | #define src2 x1 | |
32 | #define limit x2 | |
33 | #define result x0 | |
34 | ||
35 | /* Internal variables. */ | |
36 | #define data1 x3 | |
37 | #define data1w w3 | |
38 | #define data2 x4 | |
39 | #define data2w w4 | |
40 | #define has_nul x5 | |
41 | #define diff x6 | |
42 | #define syndrome x7 | |
43 | #define tmp1 x8 | |
44 | #define tmp2 x9 | |
45 | #define tmp3 x10 | |
46 | #define zeroones x11 | |
47 | #define pos x12 | |
03e1378f AB |
48 | #define mask x13 |
49 | #define endloop x14 | |
7108f1f9 | 50 | #define count mask |
03e1378f AB |
51 | #define offset pos |
52 | #define neg_offset x15 | |
a0b1cd88 | 53 | |
03e1378f AB |
54 | /* Define endian dependent shift operations. |
55 | On big-endian early bytes are at MSB and on little-endian LSB. | |
56 | LS_FW means shifting towards early bytes. | |
57 | LS_BK means shifting towards later bytes. | |
58 | */ | |
59 | #ifdef __AARCH64EB__ | |
60 | #define LS_FW lsl | |
61 | #define LS_BK lsr | |
62 | #else | |
63 | #define LS_FW lsr | |
64 | #define LS_BK lsl | |
65 | #endif | |
66 | ||
67 | .text | |
68 | .p2align 6 | |
69 | .rep 9 | |
70 | nop /* Pad so that the loop below fits a cache line. */ | |
71 | .endr | |
72 | ENTRY_ALIGN (strncmp, 0) | |
a0b1cd88 MS |
73 | cbz limit, L(ret0) |
74 | eor tmp1, src1, src2 | |
75 | mov zeroones, #REP8_01 | |
76 | tst tmp1, #7 | |
7108f1f9 | 77 | and count, src1, #7 |
a0b1cd88 | 78 | b.ne L(misaligned8) |
7108f1f9 | 79 | cbnz count, L(mutual_align) |
a0b1cd88 MS |
80 | |
81 | /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 | |
82 | (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and | |
83 | can be done in parallel across the entire word. */ | |
84 | /* Start of performance-critical section -- one 64B cache line. */ | |
85 | L(loop_aligned): | |
86 | ldr data1, [src1], #8 | |
87 | ldr data2, [src2], #8 | |
88 | L(start_realigned): | |
03e1378f | 89 | subs limit, limit, #8 |
a0b1cd88 MS |
90 | sub tmp1, data1, zeroones |
91 | orr tmp2, data1, #REP8_7f | |
92 | eor diff, data1, data2 /* Non-zero if differences found. */ | |
03e1378f | 93 | csinv endloop, diff, xzr, hi /* Last Dword or differences. */ |
a0b1cd88 MS |
94 | bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ |
95 | ccmp endloop, #0, #0, eq | |
96 | b.eq L(loop_aligned) | |
97 | /* End of performance-critical section -- one 64B cache line. */ | |
98 | ||
03e1378f AB |
99 | L(full_check): |
100 | #ifndef __AARCH64EB__ | |
a0b1cd88 | 101 | orr syndrome, diff, has_nul |
03e1378f AB |
102 | add limit, limit, 8 /* Rewind limit to before last subs. */ |
103 | L(syndrome_check): | |
104 | /* Limit was reached. Check if the NUL byte or the difference | |
105 | is before the limit. */ | |
a0b1cd88 MS |
106 | rev syndrome, syndrome |
107 | rev data1, data1 | |
a0b1cd88 MS |
108 | clz pos, syndrome |
109 | rev data2, data2 | |
110 | lsl data1, data1, pos | |
03e1378f | 111 | cmp limit, pos, lsr #3 |
a0b1cd88 MS |
112 | lsl data2, data2, pos |
113 | /* But we need to zero-extend (char is unsigned) the value and then | |
114 | perform a signed 32-bit subtraction. */ | |
115 | lsr data1, data1, #56 | |
116 | sub result, data1, data2, lsr #56 | |
03e1378f AB |
117 | csel result, result, xzr, hi |
118 | ret | |
a0b1cd88 | 119 | #else |
03e1378f AB |
120 | /* Not reached the limit, must have found the end or a diff. */ |
121 | tbz limit, #63, L(not_limit) | |
122 | add tmp1, limit, 8 | |
123 | cbz limit, L(not_limit) | |
124 | ||
125 | lsl limit, tmp1, #3 /* Bits -> bytes. */ | |
126 | mov mask, #~0 | |
127 | lsr mask, mask, limit | |
128 | bic data1, data1, mask | |
129 | bic data2, data2, mask | |
130 | ||
131 | /* Make sure that the NUL byte is marked in the syndrome. */ | |
132 | orr has_nul, has_nul, mask | |
133 | ||
134 | L(not_limit): | |
a0b1cd88 MS |
135 | /* For big-endian we cannot use the trick with the syndrome value |
136 | as carry-propagation can corrupt the upper bits if the trailing | |
137 | bytes in the string contain 0x01. */ | |
138 | /* However, if there is no NUL byte in the dword, we can generate | |
139 | the result directly. We can't just subtract the bytes as the | |
140 | MSB might be significant. */ | |
141 | cbnz has_nul, 1f | |
142 | cmp data1, data2 | |
143 | cset result, ne | |
144 | cneg result, result, lo | |
03e1378f | 145 | ret |
a0b1cd88 MS |
146 | 1: |
147 | /* Re-compute the NUL-byte detection, using a byte-reversed value. */ | |
148 | rev tmp3, data1 | |
149 | sub tmp1, tmp3, zeroones | |
150 | orr tmp2, tmp3, #REP8_7f | |
151 | bic has_nul, tmp1, tmp2 | |
152 | rev has_nul, has_nul | |
153 | orr syndrome, diff, has_nul | |
154 | clz pos, syndrome | |
03e1378f AB |
155 | /* The most-significant-non-zero bit of the syndrome marks either the |
156 | first bit that is different, or the top bit of the first zero byte. | |
a0b1cd88 MS |
157 | Shifting left now will bring the critical information into the |
158 | top bits. */ | |
03e1378f | 159 | L(end_quick): |
a0b1cd88 MS |
160 | lsl data1, data1, pos |
161 | lsl data2, data2, pos | |
162 | /* But we need to zero-extend (char is unsigned) the value and then | |
163 | perform a signed 32-bit subtraction. */ | |
164 | lsr data1, data1, #56 | |
165 | sub result, data1, data2, lsr #56 | |
03e1378f | 166 | ret |
a0b1cd88 MS |
167 | #endif |
168 | ||
169 | L(mutual_align): | |
170 | /* Sources are mutually aligned, but are not currently at an | |
171 | alignment boundary. Round down the addresses and then mask off | |
172 | the bytes that precede the start point. | |
173 | We also need to adjust the limit calculations, but without | |
174 | overflowing if the limit is near ULONG_MAX. */ | |
175 | bic src1, src1, #7 | |
176 | bic src2, src2, #7 | |
177 | ldr data1, [src1], #8 | |
7108f1f9 | 178 | neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ |
a0b1cd88 MS |
179 | ldr data2, [src2], #8 |
180 | mov tmp2, #~0 | |
03e1378f AB |
181 | LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ |
182 | /* Adjust the limit and ensure it doesn't overflow. */ | |
183 | adds limit, limit, count | |
184 | csinv limit, limit, xzr, lo | |
a0b1cd88 MS |
185 | orr data1, data1, tmp2 |
186 | orr data2, data2, tmp2 | |
a0b1cd88 MS |
187 | b L(start_realigned) |
188 | ||
a0b1cd88 | 189 | .p2align 6 |
7108f1f9 | 190 | /* Don't bother with dwords for up to 16 bytes. */ |
a0b1cd88 | 191 | L(misaligned8): |
7108f1f9 SP |
192 | cmp limit, #16 |
193 | b.hs L(try_misaligned_words) | |
194 | ||
195 | L(byte_loop): | |
a0b1cd88 MS |
196 | /* Perhaps we can do better than this. */ |
197 | ldrb data1w, [src1], #1 | |
198 | ldrb data2w, [src2], #1 | |
199 | subs limit, limit, #1 | |
7108f1f9 | 200 | ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ |
a0b1cd88 | 201 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ |
7108f1f9 SP |
202 | b.eq L(byte_loop) |
203 | L(done): | |
a0b1cd88 | 204 | sub result, data1, data2 |
03e1378f | 205 | ret |
7108f1f9 SP |
206 | /* Align the SRC1 to a dword by doing a bytewise compare and then do |
207 | the dword loop. */ | |
208 | L(try_misaligned_words): | |
03e1378f | 209 | cbz count, L(src1_aligned) |
7108f1f9 SP |
210 | |
211 | neg count, count | |
212 | and count, count, #7 | |
213 | sub limit, limit, count | |
7108f1f9 SP |
214 | |
215 | L(page_end_loop): | |
216 | ldrb data1w, [src1], #1 | |
217 | ldrb data2w, [src2], #1 | |
218 | cmp data1w, #1 | |
219 | ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ | |
220 | b.ne L(done) | |
221 | subs count, count, #1 | |
222 | b.hi L(page_end_loop) | |
223 | ||
03e1378f AB |
224 | /* The following diagram explains the comparison of misaligned strings. |
225 | The bytes are shown in natural order. For little-endian, it is | |
226 | reversed in the registers. The "x" bytes are before the string. | |
227 | The "|" separates data that is loaded at one time. | |
228 | src1 | a a a a a a a a | b b b c c c c c | . . . | |
229 | src2 | x x x x x a a a a a a a a b b b | c c c c c . . . | |
230 | After shifting in each step, the data looks like this: | |
231 | STEP_A STEP_B STEP_C | |
232 | data1 a a a a a a a a b b b c c c c c b b b c c c c c | |
233 | data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c | |
234 | The bytes with "0" are eliminated from the syndrome via mask. | |
235 | Align SRC2 down to 16 bytes. This way we can read 16 bytes at a | |
236 | time from SRC2. The comparison happens in 3 steps. After each step | |
237 | the loop can exit, or read from SRC1 or SRC2. */ | |
238 | L(src1_aligned): | |
239 | /* Calculate offset from 8 byte alignment to string start in bits. No | |
240 | need to mask offset since shifts are ignoring upper bits. */ | |
241 | lsl offset, src2, #3 | |
242 | bic src2, src2, #0xf | |
243 | mov mask, -1 | |
244 | neg neg_offset, offset | |
245 | ldr data1, [src1], #8 | |
246 | ldp tmp1, tmp2, [src2], #16 | |
247 | LS_BK mask, mask, neg_offset | |
248 | and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ | |
249 | /* Skip the first compare if data in tmp1 is irrelevant. */ | |
250 | tbnz offset, 6, L(misaligned_mid_loop) | |
251 | ||
7108f1f9 | 252 | L(loop_misaligned): |
03e1378f AB |
253 | /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ |
254 | LS_FW data2, tmp1, offset | |
255 | LS_BK tmp1, tmp2, neg_offset | |
256 | subs limit, limit, #8 | |
257 | orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ | |
258 | sub has_nul, data1, zeroones | |
259 | eor diff, data1, data2 /* Non-zero if differences found. */ | |
260 | orr tmp3, data1, #REP8_7f | |
261 | csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ | |
262 | bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ | |
263 | orr tmp3, endloop, has_nul | |
264 | cbnz tmp3, L(full_check) | |
7108f1f9 SP |
265 | |
266 | ldr data1, [src1], #8 | |
03e1378f AB |
267 | L(misaligned_mid_loop): |
268 | /* STEP_B: Compare first part of data1 to second part of tmp2. */ | |
269 | LS_FW data2, tmp2, offset | |
270 | #ifdef __AARCH64EB__ | |
271 | /* For big-endian we do a byte reverse to avoid carry-propagation | |
272 | problem described above. This way we can reuse the has_nul in the | |
273 | next step and also use syndrome value trick at the end. */ | |
274 | rev tmp3, data1 | |
275 | #define data1_fixed tmp3 | |
276 | #else | |
277 | #define data1_fixed data1 | |
278 | #endif | |
279 | sub has_nul, data1_fixed, zeroones | |
280 | orr tmp3, data1_fixed, #REP8_7f | |
281 | eor diff, data2, data1 /* Non-zero if differences found. */ | |
282 | bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ | |
283 | #ifdef __AARCH64EB__ | |
284 | rev has_nul, has_nul | |
285 | #endif | |
286 | cmp limit, neg_offset, lsr #3 | |
287 | orr syndrome, diff, has_nul | |
288 | bic syndrome, syndrome, mask /* Ignore later bytes. */ | |
289 | csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ | |
290 | cbnz tmp3, L(syndrome_check) | |
7108f1f9 | 291 | |
03e1378f AB |
292 | /* STEP_C: Compare second part of data1 to first part of tmp1. */ |
293 | ldp tmp1, tmp2, [src2], #16 | |
294 | cmp limit, #8 | |
295 | LS_BK data2, tmp1, neg_offset | |
296 | eor diff, data2, data1 /* Non-zero if differences found. */ | |
297 | orr syndrome, diff, has_nul | |
298 | and syndrome, syndrome, mask /* Ignore earlier bytes. */ | |
299 | csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ | |
300 | cbnz tmp3, L(syndrome_check) | |
301 | ||
302 | ldr data1, [src1], #8 | |
303 | sub limit, limit, #8 | |
304 | b L(loop_misaligned) | |
305 | ||
306 | #ifdef __AARCH64EB__ | |
307 | L(syndrome_check): | |
308 | clz pos, syndrome | |
309 | cmp pos, limit, lsl #3 | |
310 | b.lo L(end_quick) | |
311 | #endif | |
7108f1f9 SP |
312 | |
313 | L(ret0): | |
314 | mov result, #0 | |
03e1378f | 315 | ret |
7108f1f9 | 316 | |
a0b1cd88 MS |
317 | END (strncmp) |
318 | libc_hidden_builtin_def (strncmp) |