/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif

#ifndef __LITTLE_ENDIAN__
	.machine power4
#else
/* Little endian is only available since POWER8, so it's safe to
   specify .machine as power8 (or older), even though this is a POWER4
   file.  Since the little-endian code uses 'ldbrx', power7 is enough.  */
	.machine power7
#endif
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, -8(r1)
	std	rWORD7, -16(r1)
	cfi_offset(rWORD8, -8)
	cfi_offset(rWORD7, -16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
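/* Illustrative sketch (not part of the original source): the first-DW
   trick described above, written as C for a hypothetical big-endian
   machine.  Both strings share the same low 3 bits (r12); the bytes that
   precede the true start are shifted out of each doubleword before the
   compare, and the length is adjusted so the main loop count stays
   correct.

     r12 = (uintptr_t) s1 & 7;                  // shared misalignment
     p1  = (const uint64_t *) ((uintptr_t) s1 & ~7UL);
     p2  = (const uint64_t *) ((uintptr_t) s2 & ~7UL);
     n  += r12;                                 // add rN, rN, r12
     w1  = p1[0] << (8 * r12);                  // sld rWORD5, rWORD1, rWORD6
     w2  = p2[0] << (8 * r12);                  // sld rWORD6, rWORD2, rWORD6
     if (w1 != w2)
       return w1 > w2 ? 1 : -1;                 // cr5 early-exit path  */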
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
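/* Illustrative sketch (not part of the original source): the aligned tail
   compare below in C, assuming big-endian byte order and rem = 1..7
   remaining bytes.  rN already holds the shift count 64 - 8 * rem, so the
   bytes past the compare length fall off the low end of each word.

     w1 = *tail1 >> (64 - 8 * rem);    // srd rWORD1, rWORD1, rN
     w2 = *tail2 >> (64 - 8 * rem);    // srd rWORD2, rWORD2, rN
     if (w1 != w2)
       return w1 > w2 ? 1 : -1;
     return 0;  */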
L(d00):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr7x):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr1x):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr6x):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
#if 0
/* Huh?  We've already branched on cr6!  */
	beq-	cr6, L(zeroLength)
#endif

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
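/* Illustrative sketch (not part of the original source): the shape of the
   software-pipelined byte loop below, simplified to a two-deep pipeline in
   C-like form (the real loop keeps three compares in flight and is bounded
   by the CTR).  Loads for the next pair issue before the branch that tests
   the previous pair, hiding the load-compare-branch latency.

     c1 = s1[0]; c2 = s2[0];            // priming loads, no test yet
     for (i = 1; i < n; i++) {
       n1 = s1[i]; n2 = s2[i];          // load ahead
       if (c1 != c2)                    // test the pair loaded earlier
         return c1 - c2;
       c1 = n1; c2 = n2;
     }
     return c1 - c2;                    // last pair is still pending  */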

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous bytes.
   But we must avoid overrunning the length (in the ctr) to prevent these
   speculative loads from causing a segfault.  In that case the loop will
   exit early (before all pending bytes are tested), and we must complete
   the pending operations before returning.  */
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not already DW aligned yet.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
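/* Illustrative sketch (not part of the original source): how the
   srd/sld/or sequences below rebuild one logical doubleword of the
   unaligned rSTR2 from two aligned loads.  Big-endian order is assumed;
   shl is the bit offset of the logical start of rSTR2 within its aligned
   doubleword and shr = 64 - shl.

     prev = p2[i];                        // earlier aligned doubleword
     next = p2[i + 1];                    // following aligned doubleword
     w2   = (prev << shl) | (next >> shr);
     // asm: sld rWORD6_SHIFT, rWORD6, rSHL; srd r12, rWORD8, rSHR;
     //      or rWORD8, r12, rWORD6_SHIFT  */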
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
L(unaligned):
	std	rSHL, -24(r1)
	cfi_offset(rSHL, -24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	rSHR, -32(r1)
	cfi_offset(rSHR, -32)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, -56(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 8
#endif
	sld	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, -40(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, -56(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD8_SHIFT, -40)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	sldi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD7, 0(rSTR1)
#endif
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD3, 0(rSTR1)
#endif
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD1, 0(rSTR1)
#endif
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
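/* Illustrative sketch (not part of the original source) of the guard
   described above, with rem_bits = 8 * remaining bytes (rN after the
   sldi) and shr = rSHR.  The next doubleword of rSTR2 is loaded only
   when some of its bits are actually needed, so the code never reads
   past the end of the string.

     hi = 0;                             // li r0, 0
     if (rem_bits > shr)                 // otherwise ble cr7, L(dutrim)
       hi = p2[i + 1] >> shr;            // srd r0, rWORD2, rSHR
     w2 = hi | w2_left;                  // w2_left plays rWORD8_SHIFT  */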
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
#else
	ld	rWORD1, 8(rSTR1)
#endif
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, -16(r1)
	ld	rSHL, -24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, -32(r1)
	ld	rWORD8_SHIFT, -40(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, -64(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dureturn29):
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, -40(r1)
L(dureturn26):
	ld	rWORD2_SHIFT, -48(r1)
L(dureturn25):
	ld	rWORD4_SHIFT, -56(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, -64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)