/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK	1

	.syntax unified

/* S2LO/S2HI: shift a word towards the string's low/high-address bytes;
   MSB/LSB: mask for the byte at the highest/lowest string address within
   a loaded word; BYTEn_OFFSET: bit offset of string byte n in a word.
   All four depend on the target's byte order.  */
#ifdef __ARM_BIG_ENDIAN
# define S2LO		lsl
# define S2LOEQ		lsleq
# define S2HI		lsr
# define MSB		0x000000ff
# define LSB		0xff000000
# define BYTE0_OFFSET	24
# define BYTE1_OFFSET	16
# define BYTE2_OFFSET	8
# define BYTE3_OFFSET	0
#else /* not  __ARM_BIG_ENDIAN */
# define S2LO		lsr
# define S2LOEQ		lsreq
# define S2HI		lsl
# define BYTE0_OFFSET	0
# define BYTE1_OFFSET	8
# define BYTE2_OFFSET	16
# define BYTE3_OFFSET	24
# define MSB		0xff000000
# define LSB		0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12	/* Holds 0xffffffff throughout.  */

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2

0a982a29 RM |
86 | .thumb |
87 | ||
88 | /* In Thumb code we can't use MVN with a register shift, but we do have ORN. */ | |
89 | .macro prepare_mask mask_reg, nbits_reg | |
90 | S2HI \mask_reg, const_m1, \nbits_reg | |
91 | .endm | |
92 | .macro apply_mask data_reg, mask_reg | |
93 | orn \data_reg, \data_reg, \mask_reg | |
94 | .endm | |
0a982a29 | 95 | |
/* Macro to compute and return the result value for word-aligned
   cases.
   \synd: syndrome word — 0x80 set in each byte that terminated the
	  compare (NUL in data1, via uadd8/sel) or differed (via eor/sel).
   \d1, \d2: the corresponding data words from src1/src2.
   \restore_r6: nonzero when r6/r7 were pushed and must be reloaded.
   Pops the r4-r7 save area and returns in `result' (r0).  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsl	\d1, \d1, tmp1
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   that's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7		/* Round down to a byte boundary.  */
	lsr	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1

	bx	lr
#endif
	.endm

147 | .text | |
148 | .p2align 5 | |
149 | .Lstrcmp_start_addr: | |
150 | #if STRCMP_PRECHECK == 1 | |
151 | .Lfastpath_exit: | |
152 | sub r0, r2, r3 | |
153 | bx lr | |
154 | nop | |
155 | #endif | |
156 | ENTRY (strcmp) | |
157 | #if STRCMP_PRECHECK == 1 | |
81cb7a0b ZW |
158 | ldrb r2, [src1] |
159 | ldrb r3, [src2] | |
f63c86fe WN |
160 | cmp r2, #1 |
161 | it cs | |
162 | cmpcs r2, r3 | |
163 | bne .Lfastpath_exit | |
164 | #endif | |
165 | strd r4, r5, [sp, #-16]! | |
166 | cfi_def_cfa_offset (16) | |
167 | cfi_offset (r4, -16) | |
168 | cfi_offset (r5, -12) | |
169 | orr tmp1, src1, src2 | |
170 | strd r6, r7, [sp, #8] | |
171 | cfi_offset (r6, -8) | |
172 | cfi_offset (r7, -4) | |
173 | mvn const_m1, #0 | |
174 | lsl r2, tmp1, #29 | |
175 | cbz r2, .Lloop_aligned8 | |
176 | ||
177 | .Lnot_aligned: | |
178 | eor tmp1, src1, src2 | |
179 | tst tmp1, #7 | |
180 | bne .Lmisaligned8 | |
181 | ||
182 | /* Deal with mutual misalignment by aligning downwards and then | |
183 | masking off the unwanted loaded data to prevent a difference. */ | |
184 | and tmp1, src1, #7 | |
185 | bic src1, src1, #7 | |
186 | and tmp2, tmp1, #3 | |
187 | bic src2, src2, #7 | |
188 | lsl tmp2, tmp2, #3 /* Bytes -> bits. */ | |
81cb7a0b | 189 | ldrd data1a, data1b, [src1], #16 |
f63c86fe | 190 | tst tmp1, #4 |
81cb7a0b | 191 | ldrd data2a, data2b, [src2], #16 |
0a982a29 RM |
192 | prepare_mask tmp1, tmp2 |
193 | apply_mask data1a, tmp1 | |
194 | apply_mask data2a, tmp1 | |
f63c86fe | 195 | beq .Lstart_realigned8 |
0a982a29 | 196 | apply_mask data1b, tmp1 |
f63c86fe | 197 | mov data1a, const_m1 |
0a982a29 | 198 | apply_mask data2b, tmp1 |
f63c86fe WN |
199 | mov data2a, const_m1 |
200 | b .Lstart_realigned8 | |
201 | ||
202 | /* Unwind the inner loop by a factor of 2, giving 16 bytes per | |
203 | pass. */ | |
204 | .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ | |
205 | .p2align 2 /* Always word aligned. */ | |
206 | .Lloop_aligned8: | |
81cb7a0b ZW |
207 | ldrd data1a, data1b, [src1], #16 |
208 | ldrd data2a, data2b, [src2], #16 | |
f63c86fe WN |
209 | .Lstart_realigned8: |
210 | uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ | |
211 | eor syndrome_a, data1a, data2a | |
212 | sel syndrome_a, syndrome_a, const_m1 | |
213 | cbnz syndrome_a, .Ldiff_in_a | |
214 | uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ | |
215 | eor syndrome_b, data1b, data2b | |
216 | sel syndrome_b, syndrome_b, const_m1 | |
217 | cbnz syndrome_b, .Ldiff_in_b | |
218 | ||
81cb7a0b ZW |
219 | ldrd data1a, data1b, [src1, #-8] |
220 | ldrd data2a, data2b, [src2, #-8] | |
f63c86fe WN |
221 | uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ |
222 | eor syndrome_a, data1a, data2a | |
223 | sel syndrome_a, syndrome_a, const_m1 | |
224 | uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ | |
225 | eor syndrome_b, data1b, data2b | |
226 | sel syndrome_b, syndrome_b, const_m1 | |
227 | /* Can't use CBZ for backwards branch. */ | |
228 | orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ | |
229 | beq .Lloop_aligned8 | |
230 | ||
231 | .Ldiff_found: | |
232 | cbnz syndrome_a, .Ldiff_in_a | |
233 | ||
234 | .Ldiff_in_b: | |
235 | strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 | |
236 | ||
237 | .Ldiff_in_a: | |
238 | cfi_restore_state | |
239 | strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 | |
240 | ||
241 | cfi_restore_state | |
242 | .Lmisaligned8: | |
243 | tst tmp1, #3 | |
244 | bne .Lmisaligned4 | |
245 | ands tmp1, src1, #3 | |
246 | bne .Lmutual_align4 | |
247 | ||
248 | /* Unrolled by a factor of 2, to reduce the number of post-increment | |
249 | operations. */ | |
250 | .Lloop_aligned4: | |
81cb7a0b ZW |
251 | ldr data1, [src1], #8 |
252 | ldr data2, [src2], #8 | |
f63c86fe WN |
253 | .Lstart_realigned4: |
254 | uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ | |
255 | eor syndrome, data1, data2 | |
256 | sel syndrome, syndrome, const_m1 | |
257 | cbnz syndrome, .Laligned4_done | |
81cb7a0b ZW |
258 | ldr data1, [src1, #-4] |
259 | ldr data2, [src2, #-4] | |
f63c86fe WN |
260 | uadd8 syndrome, data1, const_m1 |
261 | eor syndrome, data1, data2 | |
262 | sel syndrome, syndrome, const_m1 | |
263 | cmp syndrome, #0 | |
264 | beq .Lloop_aligned4 | |
265 | ||
266 | .Laligned4_done: | |
267 | strcmp_epilogue_aligned syndrome, data1, data2, 0 | |
268 | ||
269 | .Lmutual_align4: | |
270 | cfi_restore_state | |
271 | /* Deal with mutual misalignment by aligning downwards and then | |
272 | masking off the unwanted loaded data to prevent a difference. */ | |
273 | lsl tmp1, tmp1, #3 /* Bytes -> bits. */ | |
274 | bic src1, src1, #3 | |
81cb7a0b | 275 | ldr data1, [src1], #8 |
f63c86fe | 276 | bic src2, src2, #3 |
81cb7a0b | 277 | ldr data2, [src2], #8 |
f63c86fe | 278 | |
0a982a29 RM |
279 | prepare_mask tmp1, tmp1 |
280 | apply_mask data1, tmp1 | |
281 | apply_mask data2, tmp1 | |
f63c86fe WN |
282 | b .Lstart_realigned4 |
283 | ||
284 | .Lmisaligned4: | |
285 | ands tmp1, src1, #3 | |
286 | beq .Lsrc1_aligned | |
287 | sub src2, src2, tmp1 | |
288 | bic src1, src1, #3 | |
289 | lsls tmp1, tmp1, #31 | |
81cb7a0b | 290 | ldr data1, [src1], #4 |
f63c86fe WN |
291 | beq .Laligned_m2 |
292 | bcs .Laligned_m1 | |
293 | ||
294 | #if STRCMP_PRECHECK == 0 | |
81cb7a0b | 295 | ldrb data2, [src2, #1] |
f63c86fe WN |
296 | uxtb tmp1, data1, ror #BYTE1_OFFSET |
297 | subs tmp1, tmp1, data2 | |
298 | bne .Lmisaligned_exit | |
299 | cbz data2, .Lmisaligned_exit | |
300 | ||
301 | .Laligned_m2: | |
81cb7a0b | 302 | ldrb data2, [src2, #2] |
f63c86fe WN |
303 | uxtb tmp1, data1, ror #BYTE2_OFFSET |
304 | subs tmp1, tmp1, data2 | |
305 | bne .Lmisaligned_exit | |
306 | cbz data2, .Lmisaligned_exit | |
307 | ||
308 | .Laligned_m1: | |
81cb7a0b | 309 | ldrb data2, [src2, #3] |
f63c86fe WN |
310 | uxtb tmp1, data1, ror #BYTE3_OFFSET |
311 | subs tmp1, tmp1, data2 | |
312 | bne .Lmisaligned_exit | |
313 | add src2, src2, #4 | |
314 | cbnz data2, .Lsrc1_aligned | |
315 | #else /* STRCMP_PRECHECK */ | |
316 | /* If we've done the pre-check, then we don't need to check the | |
317 | first byte again here. */ | |
81cb7a0b | 318 | ldrb data2, [src2, #2] |
f63c86fe WN |
319 | uxtb tmp1, data1, ror #BYTE2_OFFSET |
320 | subs tmp1, tmp1, data2 | |
321 | bne .Lmisaligned_exit | |
322 | cbz data2, .Lmisaligned_exit | |
323 | ||
324 | .Laligned_m2: | |
81cb7a0b | 325 | ldrb data2, [src2, #3] |
f63c86fe WN |
326 | uxtb tmp1, data1, ror #BYTE3_OFFSET |
327 | subs tmp1, tmp1, data2 | |
328 | bne .Lmisaligned_exit | |
329 | cbnz data2, .Laligned_m1 | |
330 | #endif | |
331 | ||
332 | .Lmisaligned_exit: | |
333 | mov result, tmp1 | |
334 | ldr r4, [sp], #16 | |
335 | cfi_remember_state | |
336 | cfi_def_cfa_offset (0) | |
337 | cfi_restore (r4) | |
338 | cfi_restore (r5) | |
339 | cfi_restore (r6) | |
340 | cfi_restore (r7) | |
341 | bx lr | |
342 | ||
343 | #if STRCMP_PRECHECK == 1 | |
344 | .Laligned_m1: | |
345 | add src2, src2, #4 | |
346 | #endif | |
347 | .Lsrc1_aligned: | |
348 | cfi_restore_state | |
349 | /* src1 is word aligned, but src2 has no common alignment | |
350 | with it. */ | |
81cb7a0b | 351 | ldr data1, [src1], #4 |
f63c86fe WN |
352 | lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ |
353 | ||
354 | bic src2, src2, #3 | |
81cb7a0b | 355 | ldr data2, [src2], #4 |
f63c86fe WN |
356 | bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ |
357 | bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ | |
358 | ||
359 | /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ | |
360 | .Loverlap3: | |
361 | bic tmp1, data1, #MSB | |
362 | uadd8 syndrome, data1, const_m1 | |
363 | eors syndrome, tmp1, data2, S2LO #8 | |
364 | sel syndrome, syndrome, const_m1 | |
365 | bne 4f | |
366 | cbnz syndrome, 5f | |
81cb7a0b | 367 | ldr data2, [src2], #4 |
f63c86fe WN |
368 | eor tmp1, tmp1, data1 |
369 | cmp tmp1, data2, S2HI #24 | |
370 | bne 6f | |
81cb7a0b | 371 | ldr data1, [src1], #4 |
f63c86fe WN |
372 | b .Loverlap3 |
373 | 4: | |
374 | S2LO data2, data2, #8 | |
375 | b .Lstrcmp_tail | |
376 | ||
377 | 5: | |
378 | bics syndrome, syndrome, #MSB | |
379 | bne .Lstrcmp_done_equal | |
380 | ||
381 | /* We can only get here if the MSB of data1 contains 0, so | |
382 | fast-path the exit. */ | |
81cb7a0b | 383 | ldrb result, [src2] |
f63c86fe WN |
384 | ldrd r4, r5, [sp], #16 |
385 | cfi_remember_state | |
386 | cfi_def_cfa_offset (0) | |
387 | cfi_restore (r4) | |
388 | cfi_restore (r5) | |
389 | /* R6/7 Not used in this sequence. */ | |
390 | cfi_restore (r6) | |
391 | cfi_restore (r7) | |
392 | neg result, result | |
393 | bx lr | |
394 | ||
395 | 6: | |
396 | cfi_restore_state | |
397 | S2LO data1, data1, #24 | |
398 | and data2, data2, #LSB | |
399 | b .Lstrcmp_tail | |
400 | ||
401 | .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ | |
402 | .Loverlap2: | |
403 | and tmp1, data1, const_m1, S2LO #16 | |
404 | uadd8 syndrome, data1, const_m1 | |
405 | eors syndrome, tmp1, data2, S2LO #16 | |
406 | sel syndrome, syndrome, const_m1 | |
407 | bne 4f | |
408 | cbnz syndrome, 5f | |
81cb7a0b | 409 | ldr data2, [src2], #4 |
f63c86fe WN |
410 | eor tmp1, tmp1, data1 |
411 | cmp tmp1, data2, S2HI #16 | |
412 | bne 6f | |
81cb7a0b | 413 | ldr data1, [src1], #4 |
f63c86fe WN |
414 | b .Loverlap2 |
415 | 4: | |
416 | S2LO data2, data2, #16 | |
417 | b .Lstrcmp_tail | |
418 | 5: | |
419 | ands syndrome, syndrome, const_m1, S2LO #16 | |
420 | bne .Lstrcmp_done_equal | |
421 | ||
81cb7a0b | 422 | ldrh data2, [src2] |
f63c86fe WN |
423 | S2LO data1, data1, #16 |
424 | #ifdef __ARM_BIG_ENDIAN | |
425 | lsl data2, data2, #16 | |
426 | #endif | |
427 | b .Lstrcmp_tail | |
428 | ||
429 | 6: | |
430 | S2LO data1, data1, #16 | |
431 | and data2, data2, const_m1, S2LO #16 | |
432 | b .Lstrcmp_tail | |
433 | ||
434 | .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ | |
435 | .Loverlap1: | |
436 | and tmp1, data1, #LSB | |
437 | uadd8 syndrome, data1, const_m1 | |
438 | eors syndrome, tmp1, data2, S2LO #24 | |
439 | sel syndrome, syndrome, const_m1 | |
440 | bne 4f | |
441 | cbnz syndrome, 5f | |
81cb7a0b | 442 | ldr data2, [src2], #4 |
f63c86fe WN |
443 | eor tmp1, tmp1, data1 |
444 | cmp tmp1, data2, S2HI #8 | |
445 | bne 6f | |
81cb7a0b | 446 | ldr data1, [src1], #4 |
f63c86fe WN |
447 | b .Loverlap1 |
448 | 4: | |
449 | S2LO data2, data2, #24 | |
450 | b .Lstrcmp_tail | |
451 | 5: | |
452 | tst syndrome, #LSB | |
453 | bne .Lstrcmp_done_equal | |
81cb7a0b | 454 | ldr data2, [src2] |
f63c86fe WN |
455 | 6: |
456 | S2LO data1, data1, #8 | |
457 | bic data2, data2, #MSB | |
458 | b .Lstrcmp_tail | |
459 | ||
460 | .Lstrcmp_done_equal: | |
461 | mov result, #0 | |
462 | ldrd r4, r5, [sp], #16 | |
463 | cfi_remember_state | |
464 | cfi_def_cfa_offset (0) | |
465 | cfi_restore (r4) | |
466 | cfi_restore (r5) | |
467 | /* R6/7 not used in this sequence. */ | |
468 | cfi_restore (r6) | |
469 | cfi_restore (r7) | |
470 | bx lr | |
471 | ||
472 | .Lstrcmp_tail: | |
473 | cfi_restore_state | |
474 | #ifndef __ARM_BIG_ENDIAN | |
475 | rev data1, data1 | |
476 | rev data2, data2 | |
477 | /* Now everything looks big-endian... */ | |
478 | #endif | |
479 | uadd8 tmp1, data1, const_m1 | |
480 | eor tmp1, data1, data2 | |
481 | sel syndrome, tmp1, const_m1 | |
482 | clz tmp1, syndrome | |
483 | lsl data1, data1, tmp1 | |
484 | lsl data2, data2, tmp1 | |
485 | lsr result, data1, #24 | |
486 | ldrd r4, r5, [sp], #16 | |
487 | cfi_def_cfa_offset (0) | |
488 | cfi_restore (r4) | |
489 | cfi_restore (r5) | |
490 | /* R6/7 not used in this sequence. */ | |
491 | cfi_restore (r6) | |
492 | cfi_restore (r7) | |
493 | sub result, result, data2, lsr #24 | |
494 | bx lr | |
495 | END (strcmp) | |
496 | libc_hidden_builtin_def (strcmp) |