/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
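
/* Throughout, a non-zero "syndrome" word flags either a difference
   between the two source words or a NUL byte in the first one.
   UADD8 adds the four bytes of each operand in parallel, setting one
   GE flag per byte; adding 0xff to a byte carries (sets GE) exactly
   when that byte is non-zero.  SEL then builds the syndrome by picking
   each byte of the EORed difference where GE is set and 0xff where it
   is clear (i.e. where data1 has a NUL byte).  A rough C sketch of the
   per-word test this implements (an illustration only, not the code
   the assembler sees) is:

     int must_stop (uint32_t w1, uint32_t w2)
     {
       uint32_t has_nul = (w1 - 0x01010101u) & ~w1 & 0x80808080u;
       return has_nul != 0 || w1 != w2;
     }
 */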

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK	1
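
/* With the pre-check enabled, the entry sequence of strcmp below is
   roughly equivalent to (illustrative C, not generated code):

     c1 = *src1; c2 = *src2;
     if (c1 == 0 || c1 != c2)
       return c1 - c2;

   so strings that already differ in their first byte return without
   touching the stack.  */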

	.syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2

#ifndef NO_THUMB
/* This code is best on Thumb.  */
	.thumb

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
	.macro prepare_mask mask_reg, nbits_reg
	S2HI	\mask_reg, const_m1, \nbits_reg
	.endm
	.macro apply_mask data_reg, mask_reg
	orn	\data_reg, \data_reg, \mask_reg
	.endm
#else
/* In ARM code we don't have ORN, but we can use MVN with a register shift.  */
	.macro prepare_mask mask_reg, nbits_reg
	mvn	\mask_reg, const_m1, S2HI \nbits_reg
	.endm
	.macro apply_mask data_reg, mask_reg
	orr	\data_reg, \data_reg, \mask_reg
	.endm

/* These clobber the condition codes, which the real Thumb cbz/cbnz
   instructions do not.  But it doesn't matter for any of the uses here.  */
	.macro cbz reg, label
	cmp	\reg, #0
	beq	\label
	.endm
	.macro cbnz reg, label
	cmp	\reg, #0
	bne	\label
	.endm
#endif
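
/* Taken together, prepare_mask and apply_mask force the bytes of a
   loaded word that precede the true start of a string (after the
   pointer has been rounded down to a load boundary) to 0xff: the same
   junk bytes then compare equal in both inputs and, being non-zero,
   can never be mistaken for the terminating NUL.  The two variants
   above differ only in whether the mask or its complement is built,
   depending on whether ORN or a register-shifted MVN is available.  */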

/* Macro to compute and return the result value for word-aligned
   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsl	\d1, \d1, tmp1
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  Byte-reversing the syndrome and
	   rounding CLZ down to a byte boundary instead gives the offset of
	   the first differing or NUL byte directly.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	cfi_def_cfa_offset (16)
	cfi_offset (r4, -16)
	cfi_offset (r5, -12)
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	cfi_offset (r6, -8)
	cfi_offset (r7, -4)
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
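	/* The mask covers at most three bytes, so if the mutual
	   misalignment is four bytes or more the entire first pair of
	   words lies before the strings: in that case mask the second
	   pair instead and replace the first pair with all-ones (the
	   TST #4 below selects between the two cases).  */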
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	prepare_mask tmp1, tmp2
	apply_mask data1a, tmp1
	apply_mask data2a, tmp1
	beq	.Lstart_realigned8
	apply_mask data1b, tmp1
	mov	data1a, const_m1
	apply_mask data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b
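
	/* Both pointers were advanced by 16 when the first pair of words
	   was loaded, so the second pair of this iteration is at offset -8.  */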
	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	prepare_mask tmp1, tmp1
	apply_mask data1, tmp1
	apply_mask data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1
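	/* LSLS #31 has put the two low bits of src1's misalignment into the
	   flags: Z set means src1 was 2 bytes past a word boundary, C set
	   means 3 bytes, and falling through means 1 byte.  The corresponding
	   leading bytes (less the one already covered by the pre-check, when
	   that is enabled) are compared individually below until src1 is
	   word aligned.  */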

#if STRCMP_PRECHECK == 0
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else /* STRCMP_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	mov	result, tmp1
	ldr	r4, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
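	/* In .LoverlapN, N is the number of bytes of each newly loaded,
	   word-aligned src2 word that line up with the current src1 word:
	   each iteration checks those N bytes against the first N bytes of
	   data1, then the remaining 4 - N bytes of data1 against the
	   leading bytes of the next src2 word.  */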
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	neg	result, result
	bx	lr

6:
	cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr

.Lstrcmp_tail:
	cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
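	/* data1/data2 now hold the bytes still to be decided, in big-endian
	   order.  Find the first difference or NUL byte via the syndrome and
	   CLZ, shift it into the top byte of each word and return the byte
	   difference, as in the big-endian branch of strcmp_epilogue_aligned.  */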
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, data2, lsr #24
	bx	lr
END (strcmp)
libc_hidden_builtin_def (strcmp)