]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/arm/armv7/strcmp.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / arm / armv7 / strcmp.S
1 /* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
2 Copyright (C) 2012-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <arm-features.h>
20 #include <sysdep.h>
21
22 /* Implementation of strcmp for ARMv7 when DSP instructions are
23 available. Use ldrd to support wider loads, provided the data
24 is sufficiently aligned. Use saturating arithmetic to optimize
25 the compares. */
26
27 /* Build Options:
28 STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
29 string. If comparing completely random strings the pre-check will
30 save time, since there is a very high probability of a mismatch in
31 the first character: we save significant overhead if this is the
32 common case. However, if strings are likely to be identical (e.g.
33 because we're verifying a hit in a hash table), then this check
34 is largely redundant. */
35
36 #define STRCMP_PRECHECK 1
37
38 .syntax unified
39
40 #ifdef __ARM_BIG_ENDIAN
41 # define S2LO lsl
42 # define S2LOEQ lsleq
43 # define S2HI lsr
44 # define MSB 0x000000ff
45 # define LSB 0xff000000
46 # define BYTE0_OFFSET 24
47 # define BYTE1_OFFSET 16
48 # define BYTE2_OFFSET 8
49 # define BYTE3_OFFSET 0
50 #else /* not __ARM_BIG_ENDIAN */
51 # define S2LO lsr
52 # define S2LOEQ lsreq
53 # define S2HI lsl
54 # define BYTE0_OFFSET 0
55 # define BYTE1_OFFSET 8
56 # define BYTE2_OFFSET 16
57 # define BYTE3_OFFSET 24
58 # define MSB 0xff000000
59 # define LSB 0x000000ff
60 #endif /* not __ARM_BIG_ENDIAN */
61
62 /* Parameters and result. */
63 #define src1 r0
64 #define src2 r1
65 #define result r0 /* Overlaps src1. */
66
67 /* Internal variables. */
68 #define tmp1 r4
69 #define tmp2 r5
70 #define const_m1 r12
71
72 /* Additional internal variables for 64-bit aligned data. */
73 #define data1a r2
74 #define data1b r3
75 #define data2a r6
76 #define data2b r7
77 #define syndrome_a tmp1
78 #define syndrome_b tmp2
79
80 /* Additional internal variables for 32-bit aligned data. */
81 #define data1 r2
82 #define data2 r3
83 #define syndrome tmp2
84
85
86 #ifndef NO_THUMB
87 /* This code is best on Thumb. */
88 .thumb
89
90 /* In Thumb code we can't use MVN with a register shift, but we do have ORN. */
91 .macro prepare_mask mask_reg, nbits_reg
92 S2HI \mask_reg, const_m1, \nbits_reg
93 .endm
94 .macro apply_mask data_reg, mask_reg
95 orn \data_reg, \data_reg, \mask_reg
96 .endm
97 #else
98 /* In ARM code we don't have ORN, but we can use MVN with a register shift. */
99 .macro prepare_mask mask_reg, nbits_reg
100 mvn \mask_reg, const_m1, S2HI \nbits_reg
101 .endm
102 .macro apply_mask data_reg, mask_reg
103 orr \data_reg, \data_reg, \mask_reg
104 .endm
105
106 /* These clobber the condition codes, which the real Thumb cbz/cbnz
107 instructions do not. But it doesn't matter for any of the uses here. */
108 .macro cbz reg, label
109 cmp \reg, #0
110 beq \label
111 .endm
112 .macro cbnz reg, label
113 cmp \reg, #0
114 bne \label
115 .endm
116 #endif
117
118
119 /* Macro to compute and return the result value for word-aligned
120 cases. */
121 .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
122 #ifdef __ARM_BIG_ENDIAN
123 /* If data1 contains a zero byte, then syndrome will contain a 1 in
124 bit 7 of that byte. Otherwise, the highest set bit in the
125 syndrome will highlight the first different bit. It is therefore
126 sufficient to extract the eight bits starting with the syndrome
127 bit. */
128 clz tmp1, \synd
129 lsl r1, \d2, tmp1
130 .if \restore_r6
131 ldrd r6, r7, [sp, #8]
132 .endif
133 lsl \d1, \d1, tmp1
134 lsr result, \d1, #24
135 ldrd r4, r5, [sp], #16
136 cfi_remember_state
137 cfi_def_cfa_offset (0)
138 cfi_restore (r4)
139 cfi_restore (r5)
140 cfi_restore (r6)
141 cfi_restore (r7)
142 sub result, result, r1, lsr #24
143 bx lr
144 #else
145 /* To use the big-endian trick we'd have to reverse all three words.
146 that's slower than this approach. */
147 rev \synd, \synd
148 clz tmp1, \synd
149 bic tmp1, tmp1, #7
150 lsr r1, \d2, tmp1
151 .if \restore_r6
152 ldrd r6, r7, [sp, #8]
153 .endif
154 lsr \d1, \d1, tmp1
155 and result, \d1, #255
156 and r1, r1, #255
157 ldrd r4, r5, [sp], #16
158 cfi_remember_state
159 cfi_def_cfa_offset (0)
160 cfi_restore (r4)
161 cfi_restore (r5)
162 cfi_restore (r6)
163 cfi_restore (r7)
164 sub result, result, r1
165
166 bx lr
167 #endif
168 .endm
169
170 .text
171 .p2align 5
172 .Lstrcmp_start_addr:
173 #if STRCMP_PRECHECK == 1
174 .Lfastpath_exit:
175 sub r0, r2, r3
176 bx lr
177 nop
178 #endif
179 ENTRY (strcmp)
180 #if STRCMP_PRECHECK == 1
181 sfi_breg src1, \
182 ldrb r2, [\B]
183 sfi_breg src2, \
184 ldrb r3, [\B]
185 cmp r2, #1
186 it cs
187 cmpcs r2, r3
188 bne .Lfastpath_exit
189 #endif
190 strd r4, r5, [sp, #-16]!
191 cfi_def_cfa_offset (16)
192 cfi_offset (r4, -16)
193 cfi_offset (r5, -12)
194 orr tmp1, src1, src2
195 strd r6, r7, [sp, #8]
196 cfi_offset (r6, -8)
197 cfi_offset (r7, -4)
198 mvn const_m1, #0
199 lsl r2, tmp1, #29
200 cbz r2, .Lloop_aligned8
201
202 .Lnot_aligned:
203 eor tmp1, src1, src2
204 tst tmp1, #7
205 bne .Lmisaligned8
206
207 /* Deal with mutual misalignment by aligning downwards and then
208 masking off the unwanted loaded data to prevent a difference. */
209 and tmp1, src1, #7
210 bic src1, src1, #7
211 and tmp2, tmp1, #3
212 bic src2, src2, #7
213 lsl tmp2, tmp2, #3 /* Bytes -> bits. */
214 sfi_breg src1, \
215 ldrd data1a, data1b, [\B], #16
216 tst tmp1, #4
217 sfi_breg src2, \
218 ldrd data2a, data2b, [\B], #16
219 prepare_mask tmp1, tmp2
220 apply_mask data1a, tmp1
221 apply_mask data2a, tmp1
222 beq .Lstart_realigned8
223 apply_mask data1b, tmp1
224 mov data1a, const_m1
225 apply_mask data2b, tmp1
226 mov data2a, const_m1
227 b .Lstart_realigned8
228
229 /* Unwind the inner loop by a factor of 2, giving 16 bytes per
230 pass. */
231 .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
232 .p2align 2 /* Always word aligned. */
233 .Lloop_aligned8:
234 sfi_breg src1, \
235 ldrd data1a, data1b, [\B], #16
236 sfi_breg src2, \
237 ldrd data2a, data2b, [\B], #16
238 .Lstart_realigned8:
239 uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
240 eor syndrome_a, data1a, data2a
241 sel syndrome_a, syndrome_a, const_m1
242 cbnz syndrome_a, .Ldiff_in_a
243 uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
244 eor syndrome_b, data1b, data2b
245 sel syndrome_b, syndrome_b, const_m1
246 cbnz syndrome_b, .Ldiff_in_b
247
248 sfi_breg src1, \
249 ldrd data1a, data1b, [\B, #-8]
250 sfi_breg src2, \
251 ldrd data2a, data2b, [\B, #-8]
252 uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
253 eor syndrome_a, data1a, data2a
254 sel syndrome_a, syndrome_a, const_m1
255 uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
256 eor syndrome_b, data1b, data2b
257 sel syndrome_b, syndrome_b, const_m1
258 /* Can't use CBZ for backwards branch. */
259 orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
260 beq .Lloop_aligned8
261
262 .Ldiff_found:
263 cbnz syndrome_a, .Ldiff_in_a
264
265 .Ldiff_in_b:
266 strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
267
268 .Ldiff_in_a:
269 cfi_restore_state
270 strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
271
272 cfi_restore_state
273 .Lmisaligned8:
274 tst tmp1, #3
275 bne .Lmisaligned4
276 ands tmp1, src1, #3
277 bne .Lmutual_align4
278
279 /* Unrolled by a factor of 2, to reduce the number of post-increment
280 operations. */
281 .Lloop_aligned4:
282 sfi_breg src1, \
283 ldr data1, [\B], #8
284 sfi_breg src2, \
285 ldr data2, [\B], #8
286 .Lstart_realigned4:
287 uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
288 eor syndrome, data1, data2
289 sel syndrome, syndrome, const_m1
290 cbnz syndrome, .Laligned4_done
291 sfi_breg src1, \
292 ldr data1, [\B, #-4]
293 sfi_breg src2, \
294 ldr data2, [\B, #-4]
295 uadd8 syndrome, data1, const_m1
296 eor syndrome, data1, data2
297 sel syndrome, syndrome, const_m1
298 cmp syndrome, #0
299 beq .Lloop_aligned4
300
301 .Laligned4_done:
302 strcmp_epilogue_aligned syndrome, data1, data2, 0
303
304 .Lmutual_align4:
305 cfi_restore_state
306 /* Deal with mutual misalignment by aligning downwards and then
307 masking off the unwanted loaded data to prevent a difference. */
308 lsl tmp1, tmp1, #3 /* Bytes -> bits. */
309 bic src1, src1, #3
310 sfi_breg src1, \
311 ldr data1, [\B], #8
312 bic src2, src2, #3
313 sfi_breg src2, \
314 ldr data2, [\B], #8
315
316 prepare_mask tmp1, tmp1
317 apply_mask data1, tmp1
318 apply_mask data2, tmp1
319 b .Lstart_realigned4
320
321 .Lmisaligned4:
322 ands tmp1, src1, #3
323 beq .Lsrc1_aligned
324 sub src2, src2, tmp1
325 bic src1, src1, #3
326 lsls tmp1, tmp1, #31
327 sfi_breg src1, \
328 ldr data1, [\B], #4
329 beq .Laligned_m2
330 bcs .Laligned_m1
331
332 #if STRCMP_PRECHECK == 0
333 sfi_breg src2, \
334 ldrb data2, [\B, #1]
335 uxtb tmp1, data1, ror #BYTE1_OFFSET
336 subs tmp1, tmp1, data2
337 bne .Lmisaligned_exit
338 cbz data2, .Lmisaligned_exit
339
340 .Laligned_m2:
341 sfi_breg src2, \
342 ldrb data2, [\B, #2]
343 uxtb tmp1, data1, ror #BYTE2_OFFSET
344 subs tmp1, tmp1, data2
345 bne .Lmisaligned_exit
346 cbz data2, .Lmisaligned_exit
347
348 .Laligned_m1:
349 sfi_breg src2, \
350 ldrb data2, [\B, #3]
351 uxtb tmp1, data1, ror #BYTE3_OFFSET
352 subs tmp1, tmp1, data2
353 bne .Lmisaligned_exit
354 add src2, src2, #4
355 cbnz data2, .Lsrc1_aligned
356 #else /* STRCMP_PRECHECK */
357 /* If we've done the pre-check, then we don't need to check the
358 first byte again here. */
359 sfi_breg src2, \
360 ldrb data2, [\B, #2]
361 uxtb tmp1, data1, ror #BYTE2_OFFSET
362 subs tmp1, tmp1, data2
363 bne .Lmisaligned_exit
364 cbz data2, .Lmisaligned_exit
365
366 .Laligned_m2:
367 sfi_breg src2, \
368 ldrb data2, [\B, #3]
369 uxtb tmp1, data1, ror #BYTE3_OFFSET
370 subs tmp1, tmp1, data2
371 bne .Lmisaligned_exit
372 cbnz data2, .Laligned_m1
373 #endif
374
375 .Lmisaligned_exit:
376 mov result, tmp1
377 ldr r4, [sp], #16
378 cfi_remember_state
379 cfi_def_cfa_offset (0)
380 cfi_restore (r4)
381 cfi_restore (r5)
382 cfi_restore (r6)
383 cfi_restore (r7)
384 bx lr
385
386 #if STRCMP_PRECHECK == 1
387 .Laligned_m1:
388 add src2, src2, #4
389 #endif
390 .Lsrc1_aligned:
391 cfi_restore_state
392 /* src1 is word aligned, but src2 has no common alignment
393 with it. */
394 sfi_breg src1, \
395 ldr data1, [\B], #4
396 lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */
397
398 bic src2, src2, #3
399 sfi_breg src2, \
400 ldr data2, [\B], #4
401 bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
402 bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
403
404 /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
405 .Loverlap3:
406 bic tmp1, data1, #MSB
407 uadd8 syndrome, data1, const_m1
408 eors syndrome, tmp1, data2, S2LO #8
409 sel syndrome, syndrome, const_m1
410 bne 4f
411 cbnz syndrome, 5f
412 sfi_breg src2, \
413 ldr data2, [\B], #4
414 eor tmp1, tmp1, data1
415 cmp tmp1, data2, S2HI #24
416 bne 6f
417 sfi_breg src1, \
418 ldr data1, [\B], #4
419 b .Loverlap3
420 4:
421 S2LO data2, data2, #8
422 b .Lstrcmp_tail
423
424 5:
425 bics syndrome, syndrome, #MSB
426 bne .Lstrcmp_done_equal
427
428 /* We can only get here if the MSB of data1 contains 0, so
429 fast-path the exit. */
430 sfi_breg src2, \
431 ldrb result, [\B]
432 ldrd r4, r5, [sp], #16
433 cfi_remember_state
434 cfi_def_cfa_offset (0)
435 cfi_restore (r4)
436 cfi_restore (r5)
437 /* R6/7 Not used in this sequence. */
438 cfi_restore (r6)
439 cfi_restore (r7)
440 neg result, result
441 bx lr
442
443 6:
444 cfi_restore_state
445 S2LO data1, data1, #24
446 and data2, data2, #LSB
447 b .Lstrcmp_tail
448
449 .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
450 .Loverlap2:
451 and tmp1, data1, const_m1, S2LO #16
452 uadd8 syndrome, data1, const_m1
453 eors syndrome, tmp1, data2, S2LO #16
454 sel syndrome, syndrome, const_m1
455 bne 4f
456 cbnz syndrome, 5f
457 sfi_breg src2, \
458 ldr data2, [\B], #4
459 eor tmp1, tmp1, data1
460 cmp tmp1, data2, S2HI #16
461 bne 6f
462 sfi_breg src1, \
463 ldr data1, [\B], #4
464 b .Loverlap2
465 4:
466 S2LO data2, data2, #16
467 b .Lstrcmp_tail
468 5:
469 ands syndrome, syndrome, const_m1, S2LO #16
470 bne .Lstrcmp_done_equal
471
472 sfi_breg src2, \
473 ldrh data2, [\B]
474 S2LO data1, data1, #16
475 #ifdef __ARM_BIG_ENDIAN
476 lsl data2, data2, #16
477 #endif
478 b .Lstrcmp_tail
479
480 6:
481 S2LO data1, data1, #16
482 and data2, data2, const_m1, S2LO #16
483 b .Lstrcmp_tail
484
485 .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
486 .Loverlap1:
487 and tmp1, data1, #LSB
488 uadd8 syndrome, data1, const_m1
489 eors syndrome, tmp1, data2, S2LO #24
490 sel syndrome, syndrome, const_m1
491 bne 4f
492 cbnz syndrome, 5f
493 sfi_breg src2, \
494 ldr data2, [\B], #4
495 eor tmp1, tmp1, data1
496 cmp tmp1, data2, S2HI #8
497 bne 6f
498 sfi_breg src1, \
499 ldr data1, [\B], #4
500 b .Loverlap1
501 4:
502 S2LO data2, data2, #24
503 b .Lstrcmp_tail
504 5:
505 tst syndrome, #LSB
506 bne .Lstrcmp_done_equal
507 sfi_breg src2, \
508 ldr data2, [\B]
509 6:
510 S2LO data1, data1, #8
511 bic data2, data2, #MSB
512 b .Lstrcmp_tail
513
514 .Lstrcmp_done_equal:
515 mov result, #0
516 ldrd r4, r5, [sp], #16
517 cfi_remember_state
518 cfi_def_cfa_offset (0)
519 cfi_restore (r4)
520 cfi_restore (r5)
521 /* R6/7 not used in this sequence. */
522 cfi_restore (r6)
523 cfi_restore (r7)
524 bx lr
525
526 .Lstrcmp_tail:
527 cfi_restore_state
528 #ifndef __ARM_BIG_ENDIAN
529 rev data1, data1
530 rev data2, data2
531 /* Now everything looks big-endian... */
532 #endif
533 uadd8 tmp1, data1, const_m1
534 eor tmp1, data1, data2
535 sel syndrome, tmp1, const_m1
536 clz tmp1, syndrome
537 lsl data1, data1, tmp1
538 lsl data2, data2, tmp1
539 lsr result, data1, #24
540 ldrd r4, r5, [sp], #16
541 cfi_def_cfa_offset (0)
542 cfi_restore (r4)
543 cfi_restore (r5)
544 /* R6/7 not used in this sequence. */
545 cfi_restore (r6)
546 cfi_restore (r7)
547 sub result, result, data2, lsr #24
548 bx lr
549 END (strcmp)
550 libc_hidden_builtin_def (strcmp)