/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, -8(r1)
	cfi_offset(rWORD8, -8)
	std	rWORD7, -16(r1)
	cfi_offset(rWORD7, -16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
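/* Worked example (added for illustration): if both strings begin 3
   bytes into a DW (r12 == 3), the addresses are rounded down by 3 and
   the length is increased by 3 so the loop still covers the same last
   byte.  The first (now aligned) DW of each string is shifted left by
   r12 * 8 == 24 bits, discarding the 3 bytes that precede the actual
   data, and the remainder-8/16/24/0 entry points below pick the loop
   version matching the DW remainder of the adjusted length.  */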
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
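/* Added note: in the PowerPC64 ELF ABI, r0 and r3-r12 are volatile
   while r14-r31 are preserved across calls; rWORD7/rWORD8 map to
   r30/r31, which is why they were saved earlier and must be restored
   on any path that actually used them.  */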
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
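/* Example (added for illustration): with 3 tail bytes, r12 above was
   3 * 8 == 24, so rN now holds 64 - 24 == 40; the srd below shifts
   each DW right by 40 bits, leaving only the 24 bits (3 bytes) that
   fall inside the compare length.  */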
L(d00):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr7x):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr1x):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr6x):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
#if 0
/* Huh?  We've already branched on cr6!  */
	beq-	cr6, L(zeroLength)
#endif

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
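/* Added sketch of the resulting software pipeline: each byte pair is
   loaded in one dispatch group, compared in a later group, and only
   branched on in a group after that, so three pairs are in flight at
   once and the load-to-branch latency of each is hidden behind the
   work done on the other two.  */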

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all the pending bytes are
   tested).  In that case we must complete the pending operations
   before returning.  */
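/* Added note: at each of these exits the branches for the last
   compares are still outstanding, and the most recently loaded byte
   pair has not been compared at all; L(b1i), L(b2i) and L(b3i) drain
   the outstanding condition codes in order and then return the
   difference of the uncompared pair.  */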
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
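/* Added sketch of the merge scheme: each aligned DW loaded from rSTR2
   is split in two.  Its bits shifted right by rSHR complete the
   current logical DW (combined with the previously parked bits), and
   its bits shifted left by rSHL, parked in one of the *_SHIFT
   registers, become the high part of the next logical DW, so one
   rSTR2 load per DW suffices.  */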
L(unaligned):
	std	rSHL, -24(r1)
	cfi_offset(rSHL, -24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	rSHR, -32(r1)
	cfi_offset(rSHR, -32)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	cfi_offset(rWORD2_SHIFT, -48)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, -56(r1)
	cfi_offset(rWORD4_SHIFT, -56)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD6_SHIFT, -64)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
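/* Added note: cr5 above compares the logical start of rSTR2 (rSTR2
   minus rSTR1's offset) with the DW-aligned rSTR2 address.  When the
   logical start falls below the aligned address, the leading DW would
   hold no rSTR2 bytes at all, so rWORD8 is zeroed instead of loaded
   to avoid touching a page the string may not own.  */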
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 8
#endif
	sld	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	cfi_offset(rWORD2_SHIFT, -48)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, -56(r1)
	cfi_offset(rWORD4_SHIFT, -56)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD6_SHIFT, -64)
	sldi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
#endif
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD7, 0(rSTR1)
#endif
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD3, 0(rSTR1)
#endif
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 8
#else
	ld	rWORD1, 0(rSTR1)
#endif
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
#endif
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
#endif
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#else
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
#endif
	cmpld	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or
   equal to the shift count, we do not need to load rWORD2 (all
   significant bits are already in rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
#else
	ld	rWORD2, 8(rSTR2)
#endif
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
#else
	ld	rWORD1, 8(rSTR1)
#endif
	ld	rWORD8, -8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, -16(r1)
	ld	rSHL, -24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, -32(r1)
	ld	rWORD8_SHIFT, -40(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, -64(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, -8(r1)
	ld	rWORD7, -16(r1)
L(dureturn29):
	ld	rSHL, -24(r1)
	ld	rSHR, -32(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, -40(r1)
L(dureturn26):
	ld	rWORD2_SHIFT, -48(r1)
L(dureturn25):
	ld	rWORD4_SHIFT, -56(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, -64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (memcmp)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)