/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
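/* Returns zero if the first SIZE bytes of s1 and s2 are equal,
   otherwise a positive or negative value according to the first
   differing byte, compared as unsigned char (standard memcmp
   semantics).  */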
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power7
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN		r3
#define rSTR1		r3	/* first string arg */
#define rSTR2		r4	/* second string arg */
#define rN		r5	/* max string length */
#define rWORD1		r6	/* current word in s1 */
#define rWORD2		r7	/* current word in s2 */
#define rWORD3		r8	/* next word in s1 */
#define rWORD4		r9	/* next word in s2 */
#define rWORD5		r10	/* next word in s1 */
#define rWORD6		r11	/* next word in s2 */

#define rOFF8		r20	/* 8 bytes offset.  */
#define rOFF16		r21	/* 16 bytes offset.  */
#define rOFF24		r22	/* 24 bytes offset.  */
#define rOFF32		r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rWORD7		r30	/* next word in s1 */
#define rWORD8		r31	/* next word in s2 */

#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif
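/* On little-endian the byte-reversed load (ldbrx) places the byte at
   the lowest address in the most significant byte of the register, so
   an unsigned doubleword compare (cmpld) orders exactly as a
   byte-by-byte unsigned compare would.  A plain ldx already has this
   property on big-endian.  */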

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 12 bytes, use the byte compare loop, which handles
   any alignment.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8, 8
	li	rOFF16, 16
	li	rOFF24, 24
	li	rOFF32, 32

	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
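/* For example, if rSTR1 ends in 0b101 (r12 == 5), both addresses are
   rounded down 5 bytes, rN grows by 5, and the first DW pair is
   shifted left by 40 bits (rWORD6 = r12 * 8 below) so the 5 bytes
   preceding the true start drop out of the compare.  */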
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
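/* (In the PowerPC64 ELF ABIs r0 and r3-r12 are volatile across calls,
   while r14-r31 must be preserved; that is why rWORD5/rWORD6 (r10/r11)
   can be used here without the save/restore that rWORD7/rWORD8
   (r30/r31) would require.)  */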
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
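/* Each path into L(d00) has set rN to 64 - (remainder * 8) via the
   preceding subfic, so e.g. a 3-byte remainder shifts both doublewords
   right by 40 bits, leaving only the 3 significant bytes (in compare
   order) for the final cmpld.  */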
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands;
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
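/* The three byte streams rotate through cr7, cr1 and cr6: each dispatch
   group loads one byte pair and branches on a compare issued in an
   earlier group, hiding the load-to-compare-to-branch latency.  */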

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested).  We must then complete the pending operations before
   returning.  */
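/* L(b1i), L(b2i) and L(b3i) below drain the one or two compares still
   in flight for bytes known to be within the length before falling
   through to compute the final byte difference.  */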
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word aligned and we
   can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and special
   case this first DW using shift left to eliminate bits preceding the
   first byte.  Since we want to join the normal (DWaligned) compare
   loop, starting at the second double word, we need to adjust the
   length (rN) and special case the loop versioning for the first DW.
   This ensures that the loop count is correct and the first DW
   (shifted) is in the expected register pair.  */
L(unaligned):
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2, which may lie beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   to it, we do not need to load rWORD2 (all significant bits are
   already in rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, rWORD8SAVE(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)