]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/powerpc64/power7/memcmp.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / memcmp.S
CommitLineData
158db122 1/* Optimized memcmp implementation for POWER7/PowerPC64.
04277e02 2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
158db122
LM
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
158db122
LM
18
#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */

#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power7
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */

#define rOFF8	r20	/* 8 bytes offset.  */
#define rOFF16	r21	/* 16 bytes offset.  */
#define rOFF24	r22	/* 24 bytes offset.  */
#define rOFF32	r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rSHL	r29	/* Unaligned shift left count.  */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

/* Save slots for the non-volatile registers above, as negative offsets
   from r1.  NOTE(review): no stack frame is established before these
   slots are written -- this assumes the PPC64 ABI's protected area
   below the stack pointer; confirm against the target ABI document.  */
#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

/* LD loads a doubleword in big-endian byte order so the unsigned
   doubleword compares below order bytes the way memcmp requires:
   byte-reversed load on little-endian, plain indexed load otherwise.  */
#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif

/* Entry dispatch: r0 = (s1 ^ s2) & 7 is zero iff both strings have the
   same doubleword alignment; r12 = s1 & 7 (cr5 holds r12 == 0).  */
	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8, 8
	li	rOFF16, 16
	li	rOFF24, 24
	li	rOFF32, 32

	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

/* Loop epilogue: drain the compares still pending from the software
   pipeline, then fall through to the sub-doubleword tail.  */
L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Difference found: restore the non-volatile registers and return
   1 or -1 according to the (unsigned, big-endian order) compare in
   the indicated condition register field.  The ...x entry points are
   for paths that never touched rWORD7/rWORD8.  */
	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested).  In this case we must complete the pending operations
   before returning.  */
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rStr1 is double word
   aligned and can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not already DW aligned yet.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
L(unaligned):
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

/* Loop epilogue: drain the compares still pending from the software
   pipeline.  */
L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, -8(r1)	/* i.e. rWORD8SAVE(r1).  */
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
/* Unaligned-path difference exits: restore non-volatiles and return
   1/-1 from the compare recorded in the named cr field.  */
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
/* Tiered restore sequence; entry points skip the registers a given
   exit path never clobbered.  */
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)