/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
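
/* Illustrative note, not part of the original source or the build: the
   routine below computes a result with the same sign as this plain C
   reference, which is the memcmp contract of comparing unsigned bytes:

     #include <stddef.h>

     int
     memcmp_ref (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       for (; n != 0; n--, p1++, p2++)
	 if (*p1 != *p2)
	   return *p1 - *p2;	// sign of the first differing byte
       return 0;
     }

   The assembly below gains speed by comparing a word (4 bytes) at a time
   once both pointers can be treated as word aligned.  */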

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	r0, r0, 30
	clrlwi	r12, rSTR1, 30
	cmplwi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1, -64(r1)
	cfi_adjust_cfa_offset(64)
	stw	rWORD8, 48(r1)
	cfi_offset(rWORD8, (48-64))
	stw	rWORD7, 44(r1)
	cfi_offset(rWORD7, (44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
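
/* Illustrative sketch of the first-word trick just described; a hedged C
   rendering for the big-endian layout, not part of the build.  Both
   pointers are rounded down to a word boundary and the loaded words are
   shifted left by 8 bits per byte of misalignment, so the bytes that
   precede the real start of the strings fall off the top and cannot
   affect the compare:

     #include <stdint.h>

     static uint32_t
     first_word_be (const unsigned char *s)
     {
       unsigned misalign = (uintptr_t) s & 3;	// low 2 bits, as in r12
       const uint32_t *w = (const uint32_t *) ((uintptr_t) s & ~(uintptr_t) 3);
       return *w << (8 * misalign);		// discard leading bytes
     }

   Both strings have the same misalignment on this path, so the same shift
   is applied to each side and comparing the shifted words is valid.  */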
	.align	4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the word remainder */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align	3
L(dsP1):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align	4
L(dPs2):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align	4
L(dPs3):
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD2, rWORD6
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD2, rWORD6
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Waligned):
	andi.	r12, rN, 12	/* Get the word remainder */
	srwi	r0, rN, 4	/* Divide by 16 */
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is that the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	.align	3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmplw	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
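
/* Illustrative sketch of the trailing-byte handling used at L(d00) below;
   a hedged C rendering for the big-endian layout, not part of the build.
   With 1 to 3 bytes left and both pointers word aligned it is safe to load
   one more whole word from each string and shift right so that only the
   remaining bytes take part in the subtraction:

     #include <stdint.h>

     static int
     tail_compare_be (uint32_t w1, uint32_t w2, unsigned remaining_bytes)
     {
       unsigned discard = 32 - 8 * remaining_bytes;	// like "32 - (rN * 8)"
       w1 >>= discard;					// keep only live bytes
       w2 >>= discard;
       return (w1 > w2) - (w1 < w2);	// the asm returns the raw difference,
					// which has the same sign here
     }
*/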
L(d00):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr7):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr7x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr1):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr1x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr6):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr6x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr5):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr5x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
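
/* Illustrative sketch, not part of the build: logically the byte loop
   below is just the straightforward compare shown here.  The assembly
   spreads it over three register pairs and issues each pair of loads one
   iteration ahead of the branch that consumes them, which is why pending
   compares still have to be resolved when the counter runs out:

     #include <stddef.h>

     static int
     byte_compare (const unsigned char *s1, const unsigned char *s2, size_t n)
     {
       for (size_t i = 0; i < n; i++)
	 if (s1[i] != s2[i])
	   return s1[i] - s2[i];
       return 0;
     }
*/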

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmplw	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested), so we must complete the pending operations
   before returning.  */
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
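
/* Illustrative sketch of the merge used throughout the unaligned loops
   below; a hedged C rendering, not part of the build.  Each word of rSTR2
   that lines up with a word of rSTR1 is rebuilt from two adjacent word
   loads: the earlier memory word is shifted left and the later one is
   shifted right by the byte offset between the strings (rSHL/rSHR hold
   those counts in bits), then the two halves are OR'ed together:

     #include <stdint.h>

     static uint32_t
     merge_be (uint32_t prev_word, uint32_t next_word,
	       unsigned shl, unsigned shr)
     {
       // shl = 8 * byte_offset, shr = 32 - shl; byte_offset is 1..3 here,
       // so neither shift count is 0 or 32
       return (prev_word << shl) | (next_word >> shr);
     }
*/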
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
	cfi_adjust_cfa_offset(64)
L(unaligned):
	stw	rSHL, 40(r1)
	cfi_offset(rSHL, (40-64))
	clrlwi	rSHL, rSTR2, 30
	stw	rSHR, 36(r1)
	cfi_offset(rSHR, (36-64))
	beq	cr5, L(Wunaligned)
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	clrlwi	rSHL, rWORD8_SHIFT, 30
	clrrwi	rSTR1, rSTR1, 2
	stw	rWORD4_SHIFT, 24(r1)
	cfi_offset(rWORD4_SHIFT, (24-64))
	slwi	rSHL, rSHL, 3
	cmplw	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD6_SHIFT, (20-64))
	subfic	rSHR, rSHL, 32
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
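
/* Illustrative sketch of the guard just described; a hedged C rendering,
   not part of the build.  A word that lies entirely before the first real
   byte of rSTR2 must not be loaded at all (it could sit on an unmapped
   page), so a zero word is substituted; its bits are shifted out before
   they can influence the compare:

     #include <stdint.h>

     static uint32_t
     safe_lead_word (const unsigned char *logical_start,
		     const uint32_t *word_addr)
     {
       // only dereference the aligned word if at least one of its four
       // bytes belongs to the string
       if ((const unsigned char *) word_addr + 4 <= logical_start)
	 return 0;
       return *word_addr;
     }
*/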
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 4
#endif
	slw	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	srw	r12, rWORD2, rSHR
	clrlwi	rN, rN, 30
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 4 */
	.align	4
L(dusP1):
	slw	rWORD8_SHIFT, rWORD2, rSHL
	slw	rWORD7, rWORD1, rWORD6
	slw	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duPs2):
	slw	rWORD6_SHIFT, rWORD2, rSHL
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 12 */
	.align	4
L(duPs3):
	slw	rWORD4_SHIFT, rWORD2, rSHL
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	slw	rWORD2_SHIFT, rWORD2, rSHL
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Wunaligned):
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
	srwi	r0, rN, 4	/* Divide by 16 */
	stw	rWORD4_SHIFT, 24(r1)
	cfi_offset(rWORD4_SHIFT, (24-64))
	andi.	r12, rN, 12	/* Get the W remainder */
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD6_SHIFT, (20-64))
	slwi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 4
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD6, 0(rSTR2)
	lwzu	rWORD8, 4(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	subfic	rSHR, rSHL, 32
	slw	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 4 */
	.align	4
L(duP1):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD7, 0(rSTR1)
#endif
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmplw	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 8(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duP2):
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	slw	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	cmplw	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmplw	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 12 */
	.align	4
L(duP3):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD3, 0(rSTR1)
#endif
	slw	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD1, 0(rSTR1)
#endif
	slw	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmplw	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2, which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   to it, we do not need to load rWORD2 (all significant bits are already
   in rWORD8_SHIFT).  */
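
/* Illustrative sketch of that decision; a hedged C rendering, not part of
   the build.  The remaining 1 to 3 bytes of rSTR2 may already sit entirely
   inside the last word that was loaded (held shifted in rWORD8_SHIFT), in
   which case reading another word could run past the end of the string:

     #include <stdint.h>

     static uint32_t
     tail_word_src2 (uint32_t last_word_shifted,	// rWORD8_SHIFT
		     const uint32_t *next_word,		// next aligned word of s2
		     unsigned remainder_bits,		// 8 * bytes left
		     unsigned shr)			// rSHR
     {
       uint32_t extra = 0;
       if (remainder_bits > shr)	// only then are more source bits needed
	 extra = *next_word >> shr;
       return extra | last_word_shifted;
     }
*/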
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
#else
	lwz	rWORD1, 4(rSTR1)
#endif
	lwz	rWORD8, 48(r1)
	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	lwz	rWORD7, 44(r1)
	lwz	rSHL, 40(r1)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	lwz	rSHR, 36(r1)
	lwz	rWORD8_SHIFT, 32(r1)
	sub	rRTN, rWORD1, rWORD2
	b	L(dureturn26)
	.align	4
L(duLcr7):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
L(dureturn29):
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
L(dureturn27):
	lwz	rWORD8_SHIFT, 32(r1)
L(dureturn26):
	lwz	rWORD2_SHIFT, 28(r1)
L(dureturn25):
	lwz	rWORD4_SHIFT, 24(r1)
	lwz	rWORD6_SHIFT, 20(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	blr
END (memcmp)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)