]>
Commit | Line | Data |
---|---|---|
158db122 | 1 | /* Optimized memcmp implementation for POWER7/PowerPC64. |
04277e02 | 2 | Copyright (C) 2010-2019 Free Software Foundation, Inc. |
158db122 LM |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
158db122 LM |
18 | |
19 | #include <sysdep.h> | |
158db122 LM |
20 | |
21 | /* int [r3] memcmp (const char *s1 [r3], | |
22 | const char *s2 [r4], | |
23 | size_t size [r5]) */ | |
b6a66222 WSM |
24 | #ifndef MEMCMP |
25 | # define MEMCMP memcmp | |
26 | #endif | |
158db122 | 27 | .machine power7 |
d5b41185 | 28 | ENTRY_TOCLESS (MEMCMP, 4) |
158db122 LM |
29 | CALL_MCOUNT 3 |
30 | ||
ce6615c9 AZ |
31 | #define rRTN r3 |
32 | #define rSTR1 r3 /* first string arg */ | |
33 | #define rSTR2 r4 /* second string arg */ | |
34 | #define rN r5 /* max string length */ | |
35 | #define rWORD1 r6 /* current word in s1 */ | |
36 | #define rWORD2 r7 /* current word in s2 */ | |
37 | #define rWORD3 r8 /* next word in s1 */ | |
38 | #define rWORD4 r9 /* next word in s2 */ | |
39 | #define rWORD5 r10 /* next word in s1 */ | |
40 | #define rWORD6 r11 /* next word in s2 */ | |
41 | ||
42 | #define rOFF8 r20 /* 8 bytes offset. */ | |
43 | #define rOFF16 r21 /* 16 bytes offset. */ | |
44 | #define rOFF24 r22 /* 24 bytes offset. */ | |
45 | #define rOFF32 r23 /* 32 bytes offset. */ | |
46 | #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ | |
47 | #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ | |
48 | #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ | |
49 | #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ | |
50 | #define rSHR r28 /* Unaligned shift right count. */ | |
51 | #define rSHL r29 /* Unaligned shift left count. */ | |
52 | #define rWORD7 r30 /* next word in s1 */ | |
53 | #define rWORD8 r31 /* next word in s2 */ | |
54 | ||
55 | #define rWORD8SAVE (-8) | |
56 | #define rWORD7SAVE (-16) | |
57 | #define rOFF8SAVE (-24) | |
58 | #define rOFF16SAVE (-32) | |
59 | #define rOFF24SAVE (-40) | |
60 | #define rOFF32SAVE (-48) | |
61 | #define rSHRSAVE (-56) | |
62 | #define rSHLSAVE (-64) | |
63 | #define rWORD8SHIFTSAVE (-72) | |
64 | #define rWORD2SHIFTSAVE (-80) | |
65 | #define rWORD4SHIFTSAVE (-88) | |
66 | #define rWORD6SHIFTSAVE (-96) | |
67 | ||
68 | #ifdef __LITTLE_ENDIAN__ | |
69 | # define LD ldbrx | |
70 | #else | |
71 | # define LD ldx | |
72 | #endif | |
158db122 | 73 | |
fe6e95d7 AM |
74 | xor r0, rSTR2, rSTR1 |
75 | cmpldi cr6, rN, 0 | |
76 | cmpldi cr1, rN, 12 | |
77 | clrldi. r0, r0, 61 | |
78 | clrldi r12, rSTR1, 61 | |
79 | cmpldi cr5, r12, 0 | |
80 | beq- cr6, L(zeroLength) | |
81 | dcbt 0, rSTR1 | |
82 | dcbt 0, rSTR2 | |
2ccdea26 | 83 | /* If the compare length is less than 12 bytes, use the
158db122 | 84 | byte loop. */ |
fe6e95d7 | 85 | blt cr1, L(bytealigned) |
ce6615c9 | 86 | std rWORD8, rWORD8SAVE(r1) |
ce6615c9 | 87 | std rWORD7, rWORD7SAVE(r1) |
ce6615c9 | 88 | std rOFF8, rOFF8SAVE(r1) |
ce6615c9 | 89 | std rOFF16, rOFF16SAVE(r1) |
ce6615c9 | 90 | std rOFF24, rOFF24SAVE(r1) |
ce6615c9 | 91 | std rOFF32, rOFF32SAVE(r1) |
869d7180 RS |
92 | cfi_offset(rWORD8, rWORD8SAVE) |
93 | cfi_offset(rWORD7, rWORD7SAVE) | |
94 | cfi_offset(rOFF8, rOFF8SAVE) | |
95 | cfi_offset(rOFF16, rOFF16SAVE) | |
96 | cfi_offset(rOFF24, rOFF24SAVE) | |
97 | cfi_offset(rOFF32, rOFF32SAVE) | |
ce6615c9 AZ |
98 | |
99 | li rOFF8,8 | |
100 | li rOFF16,16 | |
101 | li rOFF24,24 | |
102 | li rOFF32,32 | |
103 | ||
158db122 LM |
104 | bne L(unaligned) |
105 | /* At this point we know both strings have the same alignment and the | |
fe6e95d7 | 106 | compare length is at least 8 bytes. r12 contains the low order |
158db122 | 107 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 AM |
108 | of r12 to 0. If r12 == 0 then we are already double word |
109 | aligned and can perform the DW aligned loop. | |
1dc24235 | 110 | |
158db122 | 111 | Otherwise we know the two strings have the same alignment (but not |
fe6e95d7 AM |
112 | yet DW). So we force the string addresses to the next lower DW |
113 | boundary and special case this first DW using shift left to | |
2ccdea26 | 114 | eliminate bits preceding the first byte. Since we want to join the |
fe6e95d7 | 115 | normal (DW aligned) compare loop, starting at the second double word, |
158db122 | 116 | we need to adjust the length (rN) and special case the loop |
fe6e95d7 AM |
117 | versioning for the first DW. This ensures that the loop count is |
118 | correct and the first DW (shifted) is in the expected register pair. */ | |
158db122 LM |
119 | .align 4 |
120 | L(samealignment): | |
fe6e95d7 AM |
121 | clrrdi rSTR1, rSTR1, 3 |
122 | clrrdi rSTR2, rSTR2, 3 | |
123 | beq cr5, L(DWaligned) | |
124 | add rN, rN, r12 | |
125 | sldi rWORD6, r12, 3 | |
126 | srdi r0, rN, 5 /* Divide by 32 */ | |
127 | andi. r12, rN, 24 /* Get the DW remainder */ | |
ce6615c9 AZ |
128 | LD rWORD1, 0, rSTR1 |
129 | LD rWORD2, 0, rSTR2 | |
fe6e95d7 AM |
130 | cmpldi cr1, r12, 16 |
131 | cmpldi cr7, rN, 32 | |
132 | clrldi rN, rN, 61 | |
158db122 | 133 | beq L(dPs4) |
fe6e95d7 AM |
134 | mtctr r0 |
135 | bgt cr1, L(dPs3) | |
136 | beq cr1, L(dPs2) | |
158db122 LM |
137 | |
138 | /* Remainder is 8 */ | |
139 | .align 3 | |
140 | L(dsP1): | |
fe6e95d7 AM |
141 | sld rWORD5, rWORD1, rWORD6 |
142 | sld rWORD6, rWORD2, rWORD6 | |
143 | cmpld cr5, rWORD5, rWORD6 | |
144 | blt cr7, L(dP1x) | |
158db122 | 145 | /* Do something useful in this cycle since we have to branch anyway. */ |
ce6615c9 AZ |
146 | LD rWORD1, rOFF8, rSTR1 |
147 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 | 148 | cmpld cr7, rWORD1, rWORD2 |
158db122 LM |
149 | b L(dP1e) |
150 | /* Remainder is 16 */ | |
151 | .align 4 | |
152 | L(dPs2): | |
fe6e95d7 AM |
153 | sld rWORD5, rWORD1, rWORD6 |
154 | sld rWORD6, rWORD2, rWORD6 | |
155 | cmpld cr6, rWORD5, rWORD6 | |
156 | blt cr7, L(dP2x) | |
158db122 | 157 | /* Do something useful in this cycle since we have to branch anyway. */ |
ce6615c9 AZ |
158 | LD rWORD7, rOFF8, rSTR1 |
159 | LD rWORD8, rOFF8, rSTR2 | |
fe6e95d7 | 160 | cmpld cr5, rWORD7, rWORD8 |
158db122 LM |
161 | b L(dP2e) |
162 | /* Remainder is 24 */ | |
163 | .align 4 | |
164 | L(dPs3): | |
fe6e95d7 AM |
165 | sld rWORD3, rWORD1, rWORD6 |
166 | sld rWORD4, rWORD2, rWORD6 | |
167 | cmpld cr1, rWORD3, rWORD4 | |
158db122 LM |
168 | b L(dP3e) |
169 | /* Count is a multiple of 32, remainder is 0 */ | |
170 | .align 4 | |
171 | L(dPs4): | |
fe6e95d7 AM |
172 | mtctr r0 |
173 | sld rWORD1, rWORD1, rWORD6 | |
174 | sld rWORD2, rWORD2, rWORD6 | |
175 | cmpld cr7, rWORD1, rWORD2 | |
158db122 LM |
176 | b L(dP4e) |
177 | ||
178 | /* At this point we know both strings are double word aligned and the | |
179 | compare length is at least 8 bytes. */ | |
180 | .align 4 | |
181 | L(DWaligned): | |
fe6e95d7 AM |
182 | andi. r12, rN, 24 /* Get the DW remainder */ |
183 | srdi r0, rN, 5 /* Divide by 32 */ | |
184 | cmpldi cr1, r12, 16 | |
185 | cmpldi cr7, rN, 32 | |
186 | clrldi rN, rN, 61 | |
158db122 | 187 | beq L(dP4) |
fe6e95d7 AM |
188 | bgt cr1, L(dP3) |
189 | beq cr1, L(dP2) | |
158db122 LM |
190 | |
191 | /* Remainder is 8 */ | |
192 | .align 4 | |
193 | L(dP1): | |
fe6e95d7 | 194 | mtctr r0 |
158db122 | 195 | /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early |
2ccdea26 AB |
196 | (8-15 byte compare), we want to use only volatile registers. This |
197 | means we can avoid restoring non-volatile registers since we did not | |
158db122 | 198 | change any on the early exit path. The key here is the non-early |
1dc24235 | 199 | exit path only cares about the condition code (cr5), not about which |
158db122 | 200 | register pair was used. */ |
ce6615c9 AZ |
201 | LD rWORD5, 0, rSTR1 |
202 | LD rWORD6, 0, rSTR2 | |
fe6e95d7 AM |
203 | cmpld cr5, rWORD5, rWORD6 |
204 | blt cr7, L(dP1x) | |
ce6615c9 AZ |
205 | LD rWORD1, rOFF8, rSTR1 |
206 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 | 207 | cmpld cr7, rWORD1, rWORD2 |
158db122 | 208 | L(dP1e): |
ce6615c9 AZ |
209 | LD rWORD3, rOFF16, rSTR1 |
210 | LD rWORD4, rOFF16, rSTR2 | |
fe6e95d7 | 211 | cmpld cr1, rWORD3, rWORD4 |
ce6615c9 AZ |
212 | LD rWORD5, rOFF24, rSTR1 |
213 | LD rWORD6, rOFF24, rSTR2 | |
fe6e95d7 AM |
214 | cmpld cr6, rWORD5, rWORD6 |
215 | bne cr5, L(dLcr5x) | |
216 | bne cr7, L(dLcr7x) | |
217 | ||
ce6615c9 AZ |
218 | LD rWORD7, rOFF32, rSTR1 |
219 | LD rWORD8, rOFF32, rSTR2 | |
220 | addi rSTR1, rSTR1, 32 | |
221 | addi rSTR2, rSTR2, 32 | |
fe6e95d7 AM |
222 | bne cr1, L(dLcr1) |
223 | cmpld cr5, rWORD7, rWORD8 | |
158db122 | 224 | bdnz L(dLoop) |
fe6e95d7 | 225 | bne cr6, L(dLcr6) |
ce6615c9 AZ |
226 | ld rWORD8, rWORD8SAVE(r1) |
227 | ld rWORD7, rWORD7SAVE(r1) | |
158db122 LM |
228 | .align 3 |
229 | L(dP1x): | |
fe6e95d7 AM |
230 | sldi. r12, rN, 3 |
231 | bne cr5, L(dLcr5x) | |
232 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ | |
158db122 | 233 | bne L(d00) |
ce6615c9 AZ |
234 | ld rOFF8, rOFF8SAVE(r1) |
235 | ld rOFF16, rOFF16SAVE(r1) | |
236 | ld rOFF24, rOFF24SAVE(r1) | |
237 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 238 | li rRTN, 0 |
158db122 LM |
239 | blr |
240 | ||
241 | /* Remainder is 16 */ | |
242 | .align 4 | |
243 | L(dP2): | |
fe6e95d7 | 244 | mtctr r0 |
ce6615c9 AZ |
245 | LD rWORD5, 0, rSTR1 |
246 | LD rWORD6, 0, rSTR2 | |
fe6e95d7 AM |
247 | cmpld cr6, rWORD5, rWORD6 |
248 | blt cr7, L(dP2x) | |
ce6615c9 AZ |
249 | LD rWORD7, rOFF8, rSTR1 |
250 | LD rWORD8, rOFF8, rSTR2 | |
fe6e95d7 | 251 | cmpld cr5, rWORD7, rWORD8 |
158db122 | 252 | L(dP2e): |
ce6615c9 AZ |
253 | LD rWORD1, rOFF16, rSTR1 |
254 | LD rWORD2, rOFF16, rSTR2 | |
fe6e95d7 | 255 | cmpld cr7, rWORD1, rWORD2 |
ce6615c9 AZ |
256 | LD rWORD3, rOFF24, rSTR1 |
257 | LD rWORD4, rOFF24, rSTR2 | |
fe6e95d7 | 258 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
259 | addi rSTR1, rSTR1, 8 |
260 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 AM |
261 | bne cr6, L(dLcr6) |
262 | bne cr5, L(dLcr5) | |
158db122 | 263 | b L(dLoop2) |
158db122 LM |
264 | .align 4 |
265 | L(dP2x): | |
ce6615c9 AZ |
266 | LD rWORD3, rOFF8, rSTR1 |
267 | LD rWORD4, rOFF8, rSTR2 | |
fe6e95d7 AM |
268 | cmpld cr1, rWORD3, rWORD4 |
269 | sldi. r12, rN, 3 | |
270 | bne cr6, L(dLcr6x) | |
fe6e95d7 AM |
271 | addi rSTR1, rSTR1, 8 |
272 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 AM |
273 | bne cr1, L(dLcr1x) |
274 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ | |
158db122 | 275 | bne L(d00) |
ce6615c9 AZ |
276 | ld rOFF8, rOFF8SAVE(r1) |
277 | ld rOFF16, rOFF16SAVE(r1) | |
278 | ld rOFF24, rOFF24SAVE(r1) | |
279 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 280 | li rRTN, 0 |
158db122 LM |
281 | blr |
282 | ||
283 | /* Remainder is 24 */ | |
284 | .align 4 | |
285 | L(dP3): | |
fe6e95d7 | 286 | mtctr r0 |
ce6615c9 AZ |
287 | LD rWORD3, 0, rSTR1 |
288 | LD rWORD4, 0, rSTR2 | |
fe6e95d7 | 289 | cmpld cr1, rWORD3, rWORD4 |
158db122 | 290 | L(dP3e): |
ce6615c9 AZ |
291 | LD rWORD5, rOFF8, rSTR1 |
292 | LD rWORD6, rOFF8, rSTR2 | |
fe6e95d7 AM |
293 | cmpld cr6, rWORD5, rWORD6 |
294 | blt cr7, L(dP3x) | |
ce6615c9 AZ |
295 | LD rWORD7, rOFF16, rSTR1 |
296 | LD rWORD8, rOFF16, rSTR2 | |
fe6e95d7 | 297 | cmpld cr5, rWORD7, rWORD8 |
ce6615c9 AZ |
298 | LD rWORD1, rOFF24, rSTR1 |
299 | LD rWORD2, rOFF24, rSTR2 | |
fe6e95d7 | 300 | cmpld cr7, rWORD1, rWORD2 |
fe6e95d7 AM |
301 | addi rSTR1, rSTR1, 16 |
302 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
303 | bne cr1, L(dLcr1) |
304 | bne cr6, L(dLcr6) | |
158db122 LM |
305 | b L(dLoop1) |
306 | /* Again we are on an early exit path (24-31 byte compare), we want to | |
2ccdea26 | 307 | only use volatile registers and avoid restoring non-volatile |
158db122 LM |
308 | registers. */ |
309 | .align 4 | |
310 | L(dP3x): | |
ce6615c9 AZ |
311 | LD rWORD1, rOFF16, rSTR1 |
312 | LD rWORD2, rOFF16, rSTR2 | |
fe6e95d7 AM |
313 | cmpld cr7, rWORD1, rWORD2 |
314 | sldi. r12, rN, 3 | |
315 | bne cr1, L(dLcr1x) | |
fe6e95d7 AM |
316 | addi rSTR1, rSTR1, 16 |
317 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
318 | bne cr6, L(dLcr6x) |
319 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ | |
320 | bne cr7, L(dLcr7x) | |
158db122 | 321 | bne L(d00) |
ce6615c9 AZ |
322 | ld rOFF8, rOFF8SAVE(r1) |
323 | ld rOFF16, rOFF16SAVE(r1) | |
324 | ld rOFF24, rOFF24SAVE(r1) | |
325 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 326 | li rRTN, 0 |
158db122 LM |
327 | blr |
328 | ||
329 | /* Count is a multiple of 32, remainder is 0 */ | |
330 | .align 4 | |
331 | L(dP4): | |
fe6e95d7 | 332 | mtctr r0 |
ce6615c9 AZ |
333 | LD rWORD1, 0, rSTR1 |
334 | LD rWORD2, 0, rSTR2 | |
fe6e95d7 | 335 | cmpld cr7, rWORD1, rWORD2 |
158db122 | 336 | L(dP4e): |
ce6615c9 AZ |
337 | LD rWORD3, rOFF8, rSTR1 |
338 | LD rWORD4, rOFF8, rSTR2 | |
fe6e95d7 | 339 | cmpld cr1, rWORD3, rWORD4 |
ce6615c9 AZ |
340 | LD rWORD5, rOFF16, rSTR1 |
341 | LD rWORD6, rOFF16, rSTR2 | |
fe6e95d7 | 342 | cmpld cr6, rWORD5, rWORD6 |
ce6615c9 AZ |
343 | LD rWORD7, rOFF24, rSTR1 |
344 | LD rWORD8, rOFF24, rSTR2 | |
345 | addi rSTR1, rSTR1, 24 | |
346 | addi rSTR2, rSTR2, 24 | |
fe6e95d7 AM |
347 | cmpld cr5, rWORD7, rWORD8 |
348 | bne cr7, L(dLcr7) | |
349 | bne cr1, L(dLcr1) | |
158db122 LM |
350 | bdz- L(d24) /* Adjust CTR as we start with +4 */ |
351 | /* This is the primary loop */ | |
352 | .align 4 | |
353 | L(dLoop): | |
ce6615c9 AZ |
354 | LD rWORD1, rOFF8, rSTR1 |
355 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 AM |
356 | cmpld cr1, rWORD3, rWORD4 |
357 | bne cr6, L(dLcr6) | |
158db122 | 358 | L(dLoop1): |
ce6615c9 AZ |
359 | LD rWORD3, rOFF16, rSTR1 |
360 | LD rWORD4, rOFF16, rSTR2 | |
fe6e95d7 AM |
361 | cmpld cr6, rWORD5, rWORD6 |
362 | bne cr5, L(dLcr5) | |
158db122 | 363 | L(dLoop2): |
ce6615c9 AZ |
364 | LD rWORD5, rOFF24, rSTR1 |
365 | LD rWORD6, rOFF24, rSTR2 | |
fe6e95d7 AM |
366 | cmpld cr5, rWORD7, rWORD8 |
367 | bne cr7, L(dLcr7) | |
158db122 | 368 | L(dLoop3): |
ce6615c9 AZ |
369 | LD rWORD7, rOFF32, rSTR1 |
370 | LD rWORD8, rOFF32, rSTR2 | |
371 | addi rSTR1, rSTR1, 32 | |
372 | addi rSTR2, rSTR2, 32 | |
fe6e95d7 AM |
373 | bne cr1, L(dLcr1) |
374 | cmpld cr7, rWORD1, rWORD2 | |
158db122 LM |
375 | bdnz L(dLoop) |
376 | ||
377 | L(dL4): | |
fe6e95d7 AM |
378 | cmpld cr1, rWORD3, rWORD4 |
379 | bne cr6, L(dLcr6) | |
380 | cmpld cr6, rWORD5, rWORD6 | |
381 | bne cr5, L(dLcr5) | |
382 | cmpld cr5, rWORD7, rWORD8 | |
158db122 | 383 | L(d44): |
fe6e95d7 | 384 | bne cr7, L(dLcr7) |
158db122 | 385 | L(d34): |
fe6e95d7 | 386 | bne cr1, L(dLcr1) |
158db122 | 387 | L(d24): |
fe6e95d7 | 388 | bne cr6, L(dLcr6) |
158db122 | 389 | L(d14): |
fe6e95d7 AM |
390 | sldi. r12, rN, 3 |
391 | bne cr5, L(dLcr5) | |
158db122 | 392 | L(d04): |
ce6615c9 AZ |
393 | ld rWORD8, rWORD8SAVE(r1) |
394 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 | 395 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
ce6615c9 | 396 | beq L(duzeroLength) |
158db122 LM |
397 | /* At this point we have a remainder of 1 to 7 bytes to compare. Since |
398 | we are aligned it is safe to load the whole double word, and use | |
2ccdea26 | 399 | shift right double to eliminate bits beyond the compare length. */ |
158db122 | 400 | L(d00): |
ce6615c9 AZ |
401 | LD rWORD1, rOFF8, rSTR1 |
402 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 AM |
403 | srd rWORD1, rWORD1, rN |
404 | srd rWORD2, rWORD2, rN | |
405 | cmpld cr7, rWORD1, rWORD2 | |
406 | bne cr7, L(dLcr7x) | |
ce6615c9 AZ |
407 | ld rOFF8, rOFF8SAVE(r1) |
408 | ld rOFF16, rOFF16SAVE(r1) | |
409 | ld rOFF24, rOFF24SAVE(r1) | |
410 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 411 | li rRTN, 0 |
158db122 | 412 | blr |
fe6e95d7 | 413 | |
158db122 | 414 | .align 4 |
fe6e95d7 | 415 | L(dLcr7): |
ce6615c9 AZ |
416 | ld rWORD8, rWORD8SAVE(r1) |
417 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 | 418 | L(dLcr7x): |
ce6615c9 AZ |
419 | ld rOFF8, rOFF8SAVE(r1) |
420 | ld rOFF16, rOFF16SAVE(r1) | |
421 | ld rOFF24, rOFF24SAVE(r1) | |
422 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 AM |
423 | li rRTN, 1 |
424 | bgtlr cr7 | |
425 | li rRTN, -1 | |
158db122 LM |
426 | blr |
427 | .align 4 | |
428 | L(dLcr1): | |
ce6615c9 AZ |
429 | ld rWORD8, rWORD8SAVE(r1) |
430 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 | 431 | L(dLcr1x): |
ce6615c9 AZ |
432 | ld rOFF8, rOFF8SAVE(r1) |
433 | ld rOFF16, rOFF16SAVE(r1) | |
434 | ld rOFF24, rOFF24SAVE(r1) | |
435 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 436 | li rRTN, 1 |
158db122 | 437 | bgtlr cr1 |
fe6e95d7 | 438 | li rRTN, -1 |
158db122 LM |
439 | blr |
440 | .align 4 | |
441 | L(dLcr6): | |
ce6615c9 AZ |
442 | ld rWORD8, rWORD8SAVE(r1) |
443 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 | 444 | L(dLcr6x): |
ce6615c9 AZ |
445 | ld rOFF8, rOFF8SAVE(r1) |
446 | ld rOFF16, rOFF16SAVE(r1) | |
447 | ld rOFF24, rOFF24SAVE(r1) | |
448 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 449 | li rRTN, 1 |
158db122 | 450 | bgtlr cr6 |
fe6e95d7 | 451 | li rRTN, -1 |
158db122 LM |
452 | blr |
453 | .align 4 | |
454 | L(dLcr5): | |
ce6615c9 AZ |
455 | ld rWORD8, rWORD8SAVE(r1) |
456 | ld rWORD7, rWORD7SAVE(r1) | |
158db122 | 457 | L(dLcr5x): |
ce6615c9 AZ |
458 | ld rOFF8, rOFF8SAVE(r1) |
459 | ld rOFF16, rOFF16SAVE(r1) | |
460 | ld rOFF24, rOFF24SAVE(r1) | |
461 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 462 | li rRTN, 1 |
158db122 | 463 | bgtlr cr5 |
fe6e95d7 | 464 | li rRTN, -1 |
158db122 LM |
465 | blr |
466 | ||
467 | .align 4 | |
468 | L(bytealigned): | |
469 | mtctr rN | |
158db122 LM |
470 | |
471 | /* We need to prime this loop. This loop is swing modulo scheduled | |
1dc24235 | 472 | to avoid pipe delays. The dependent instruction latencies (load to |
158db122 LM |
473 | compare to conditional branch) are 2 to 3 cycles. In this loop each |
474 | dispatch group ends in a branch and takes 1 cycle. Effectively | |
1dc24235 UD |
475 | the first iteration of the loop only serves to load operands and |
476 | branches based on compares are delayed until the next loop. | |
158db122 LM |
477 | |
478 | So we must precondition some registers and condition codes so that | |
479 | we don't exit the loop early on the first iteration. */ | |
1dc24235 | 480 | |
fe6e95d7 AM |
481 | lbz rWORD1, 0(rSTR1) |
482 | lbz rWORD2, 0(rSTR2) | |
158db122 | 483 | bdz L(b11) |
fe6e95d7 AM |
484 | cmpld cr7, rWORD1, rWORD2 |
485 | lbz rWORD3, 1(rSTR1) | |
486 | lbz rWORD4, 1(rSTR2) | |
158db122 | 487 | bdz L(b12) |
fe6e95d7 AM |
488 | cmpld cr1, rWORD3, rWORD4 |
489 | lbzu rWORD5, 2(rSTR1) | |
490 | lbzu rWORD6, 2(rSTR2) | |
158db122 LM |
491 | bdz L(b13) |
492 | .align 4 | |
493 | L(bLoop): | |
fe6e95d7 AM |
494 | lbzu rWORD1, 1(rSTR1) |
495 | lbzu rWORD2, 1(rSTR2) | |
496 | bne cr7, L(bLcr7) | |
158db122 | 497 | |
fe6e95d7 | 498 | cmpld cr6, rWORD5, rWORD6 |
158db122 LM |
499 | bdz L(b3i) |
500 | ||
fe6e95d7 AM |
501 | lbzu rWORD3, 1(rSTR1) |
502 | lbzu rWORD4, 1(rSTR2) | |
503 | bne cr1, L(bLcr1) | |
158db122 | 504 | |
fe6e95d7 | 505 | cmpld cr7, rWORD1, rWORD2 |
158db122 LM |
506 | bdz L(b2i) |
507 | ||
fe6e95d7 AM |
508 | lbzu rWORD5, 1(rSTR1) |
509 | lbzu rWORD6, 1(rSTR2) | |
510 | bne cr6, L(bLcr6) | |
158db122 | 511 | |
fe6e95d7 | 512 | cmpld cr1, rWORD3, rWORD4 |
158db122 LM |
513 | bdnz L(bLoop) |
514 | ||
515 | /* We are speculatively loading bytes before we have tested the previous | |
516 | bytes. But we must avoid overrunning the length (in the ctr) to | |
1dc24235 | 517 | prevent these speculative loads from causing a segfault. In this |
158db122 LM |
518 | case the loop will exit early (before all the pending bytes are |
519 | tested). In this case we must complete the pending operations | |
520 | before returning. */ | |
521 | L(b1i): | |
fe6e95d7 AM |
522 | bne cr7, L(bLcr7) |
523 | bne cr1, L(bLcr1) | |
158db122 LM |
524 | b L(bx56) |
525 | .align 4 | |
526 | L(b2i): | |
fe6e95d7 AM |
527 | bne cr6, L(bLcr6) |
528 | bne cr7, L(bLcr7) | |
158db122 LM |
529 | b L(bx34) |
530 | .align 4 | |
531 | L(b3i): | |
fe6e95d7 AM |
532 | bne cr1, L(bLcr1) |
533 | bne cr6, L(bLcr6) | |
158db122 LM |
534 | b L(bx12) |
535 | .align 4 | |
fe6e95d7 AM |
536 | L(bLcr7): |
537 | li rRTN, 1 | |
538 | bgtlr cr7 | |
539 | li rRTN, -1 | |
158db122 LM |
540 | blr |
541 | L(bLcr1): | |
fe6e95d7 | 542 | li rRTN, 1 |
158db122 | 543 | bgtlr cr1 |
fe6e95d7 | 544 | li rRTN, -1 |
158db122 LM |
545 | blr |
546 | L(bLcr6): | |
fe6e95d7 | 547 | li rRTN, 1 |
158db122 | 548 | bgtlr cr6 |
fe6e95d7 | 549 | li rRTN, -1 |
158db122 LM |
550 | blr |
551 | ||
552 | L(b13): | |
fe6e95d7 AM |
553 | bne cr7, L(bx12) |
554 | bne cr1, L(bx34) | |
158db122 | 555 | L(bx56): |
fe6e95d7 | 556 | sub rRTN, rWORD5, rWORD6 |
158db122 LM |
557 | blr |
558 | nop | |
559 | L(b12): | |
fe6e95d7 | 560 | bne cr7, L(bx12) |
158db122 | 561 | L(bx34): |
fe6e95d7 | 562 | sub rRTN, rWORD3, rWORD4 |
158db122 LM |
563 | blr |
564 | L(b11): | |
565 | L(bx12): | |
fe6e95d7 | 566 | sub rRTN, rWORD1, rWORD2 |
158db122 | 567 | blr |
ce6615c9 | 568 | |
1dc24235 | 569 | .align 4 |
158db122 | 570 | L(zeroLength): |
fe6e95d7 | 571 | li rRTN, 0 |
158db122 LM |
572 | blr |
573 | ||
574 | .align 4 | |
575 | /* At this point we know the strings have different alignment and the | |
fe6e95d7 | 576 | compare length is at least 8 bytes. r12 contains the low order |
158db122 | 577 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 | 578 | of r12 to 0. If r12 == 0 then rSTR1 is double word
158db122 | 579 | aligned and we can perform the DWunaligned loop.
1dc24235 | 580 | |
2ccdea26 | 581 | Otherwise we know that rSTR1 is not yet DW aligned.
158db122 | 582 | So we can force the string addresses to the next lower DW |
fe6e95d7 | 583 | boundary and special case this first DW using shift left to |
2ccdea26 | 584 | eliminate bits preceding the first byte. Since we want to join the |
158db122 LM |
585 | normal (DWaligned) compare loop, starting at the second double word, |
586 | we need to adjust the length (rN) and special case the loop | |
fe6e95d7 | 587 | versioning for the first DW. This ensures that the loop count is |
158db122 | 588 | correct and the first DW (shifted) is in the expected register pair. */
158db122 | 589 | L(unaligned): |
ce6615c9 AZ |
590 | std rSHL, rSHLSAVE(r1) |
591 | cfi_offset(rSHL, rSHLSAVE) | |
fe6e95d7 AM |
592 | clrldi rSHL, rSTR2, 61 |
593 | beq cr6, L(duzeroLength) | |
ce6615c9 AZ |
594 | std rSHR, rSHRSAVE(r1) |
595 | cfi_offset(rSHR, rSHRSAVE) | |
fe6e95d7 | 596 | beq cr5, L(DWunaligned) |
ce6615c9 AZ |
597 | std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
598 | cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) | |
fe6e95d7 | 599 | /* Adjust the logical start of rSTR2 to compensate for the extra bits |
158db122 | 600 | in the 1st rSTR1 DW. */ |
fe6e95d7 | 601 | sub rWORD8_SHIFT, rSTR2, r12 |
158db122 LM |
602 | /* But do not attempt to address the DW before the DW that contains |
603 | the actual start of rSTR2. */ | |
fe6e95d7 | 604 | clrrdi rSTR2, rSTR2, 3 |
ce6615c9 | 605 | std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
2ccdea26 | 606 | /* Compute the left/right shift counts for the unaligned rSTR2, |
1dc24235 | 607 | compensating for the logical (DW aligned) start of rSTR1. */ |
fe6e95d7 AM |
608 | clrldi rSHL, rWORD8_SHIFT, 61 |
609 | clrrdi rSTR1, rSTR1, 3 | |
ce6615c9 | 610 | std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
fe6e95d7 AM |
611 | sldi rSHL, rSHL, 3 |
612 | cmpld cr5, rWORD8_SHIFT, rSTR2 | |
613 | add rN, rN, r12 | |
614 | sldi rWORD6, r12, 3 | |
ce6615c9 | 615 | std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
869d7180 RS |
616 | cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) |
617 | cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) | |
ce6615c9 | 618 | cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) |
fe6e95d7 AM |
619 | subfic rSHR, rSHL, 64 |
620 | srdi r0, rN, 5 /* Divide by 32 */ | |
621 | andi. r12, rN, 24 /* Get the DW remainder */ | |
158db122 LM |
622 | /* We normally need to load 2 DWs to start the unaligned rSTR2, but in |
623 | this special case those bits may be discarded anyway. Also we | |
624 | must avoid loading a DW where none of the bits are part of rSTR2 as | |
625 | this may cross a page boundary and cause a page fault. */ | |
fe6e95d7 AM |
626 | li rWORD8, 0 |
627 | blt cr5, L(dus0) | |
ce6615c9 | 628 | LD rWORD8, 0, rSTR2 |
fe6e95d7 | 629 | addi rSTR2, rSTR2, 8 |
fe6e95d7 | 630 | sld rWORD8, rWORD8, rSHL |
158db122 LM |
631 | |
632 | L(dus0): | |
ce6615c9 AZ |
633 | LD rWORD1, 0, rSTR1 |
634 | LD rWORD2, 0, rSTR2 | |
fe6e95d7 AM |
635 | cmpldi cr1, r12, 16 |
636 | cmpldi cr7, rN, 32 | |
637 | srd r12, rWORD2, rSHR | |
638 | clrldi rN, rN, 61 | |
158db122 | 639 | beq L(duPs4) |
fe6e95d7 AM |
640 | mtctr r0 |
641 | or rWORD8, r12, rWORD8 | |
642 | bgt cr1, L(duPs3) | |
643 | beq cr1, L(duPs2) | |
158db122 LM |
644 | |
645 | /* Remainder is 8 */ | |
646 | .align 4 | |
647 | L(dusP1): | |
fe6e95d7 AM |
648 | sld rWORD8_SHIFT, rWORD2, rSHL |
649 | sld rWORD7, rWORD1, rWORD6 | |
650 | sld rWORD8, rWORD8, rWORD6 | |
651 | bge cr7, L(duP1e) | |
158db122 LM |
652 | /* At this point we exit early with the first double word compare |
653 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
654 | how we handle the remaining bytes. */ | |
fe6e95d7 AM |
655 | cmpld cr5, rWORD7, rWORD8 |
656 | sldi. rN, rN, 3 | |
657 | bne cr5, L(duLcr5) | |
658 | cmpld cr7, rN, rSHR | |
158db122 | 659 | beq L(duZeroReturn) |
fe6e95d7 AM |
660 | li r0, 0 |
661 | ble cr7, L(dutrim) | |
ce6615c9 | 662 | LD rWORD2, rOFF8, rSTR2 |
fe6e95d7 | 663 | srd r0, rWORD2, rSHR |
158db122 LM |
664 | b L(dutrim) |
665 | /* Remainder is 16 */ | |
666 | .align 4 | |
667 | L(duPs2): | |
fe6e95d7 AM |
668 | sld rWORD6_SHIFT, rWORD2, rSHL |
669 | sld rWORD5, rWORD1, rWORD6 | |
670 | sld rWORD6, rWORD8, rWORD6 | |
158db122 LM |
671 | b L(duP2e) |
672 | /* Remainder is 24 */ | |
673 | .align 4 | |
674 | L(duPs3): | |
fe6e95d7 AM |
675 | sld rWORD4_SHIFT, rWORD2, rSHL |
676 | sld rWORD3, rWORD1, rWORD6 | |
677 | sld rWORD4, rWORD8, rWORD6 | |
158db122 LM |
678 | b L(duP3e) |
679 | /* Count is a multiple of 32, remainder is 0 */ | |
680 | .align 4 | |
681 | L(duPs4): | |
fe6e95d7 AM |
682 | mtctr r0 |
683 | or rWORD8, r12, rWORD8 | |
684 | sld rWORD2_SHIFT, rWORD2, rSHL | |
685 | sld rWORD1, rWORD1, rWORD6 | |
686 | sld rWORD2, rWORD8, rWORD6 | |
158db122 LM |
687 | b L(duP4e) |
688 | ||
689 | /* At this point we know rSTR1 is double word aligned and the | |
690 | compare length is at least 8 bytes. */ | |
691 | .align 4 | |
692 | L(DWunaligned): | |
ce6615c9 | 693 | std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
fe6e95d7 | 694 | clrrdi rSTR2, rSTR2, 3 |
ce6615c9 | 695 | std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
fe6e95d7 | 696 | srdi r0, rN, 5 /* Divide by 32 */ |
ce6615c9 | 697 | std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) |
fe6e95d7 | 698 | andi. r12, rN, 24 /* Get the DW remainder */ |
ce6615c9 | 699 | std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
869d7180 RS |
700 | cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) |
701 | cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) | |
702 | cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) | |
ce6615c9 | 703 | cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) |
fe6e95d7 | 704 | sldi rSHL, rSHL, 3 |
ce6615c9 AZ |
705 | LD rWORD6, 0, rSTR2 |
706 | LD rWORD8, rOFF8, rSTR2 | |
fe6e95d7 | 707 | addi rSTR2, rSTR2, 8 |
fe6e95d7 AM |
708 | cmpldi cr1, r12, 16 |
709 | cmpldi cr7, rN, 32 | |
710 | clrldi rN, rN, 61 | |
711 | subfic rSHR, rSHL, 64 | |
712 | sld rWORD6_SHIFT, rWORD6, rSHL | |
158db122 | 713 | beq L(duP4) |
fe6e95d7 AM |
714 | mtctr r0 |
715 | bgt cr1, L(duP3) | |
716 | beq cr1, L(duP2) | |
158db122 LM |
717 | |
718 | /* Remainder is 8 */ | |
719 | .align 4 | |
720 | L(duP1): | |
fe6e95d7 | 721 | srd r12, rWORD8, rSHR |
ce6615c9 | 722 | LD rWORD7, 0, rSTR1 |
fe6e95d7 AM |
723 | sld rWORD8_SHIFT, rWORD8, rSHL |
724 | or rWORD8, r12, rWORD6_SHIFT | |
725 | blt cr7, L(duP1x) | |
158db122 | 726 | L(duP1e): |
ce6615c9 AZ |
727 | LD rWORD1, rOFF8, rSTR1 |
728 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 AM |
729 | cmpld cr5, rWORD7, rWORD8 |
730 | srd r0, rWORD2, rSHR | |
731 | sld rWORD2_SHIFT, rWORD2, rSHL | |
732 | or rWORD2, r0, rWORD8_SHIFT | |
ce6615c9 AZ |
733 | LD rWORD3, rOFF16, rSTR1 |
734 | LD rWORD4, rOFF16, rSTR2 | |
fe6e95d7 AM |
735 | cmpld cr7, rWORD1, rWORD2 |
736 | srd r12, rWORD4, rSHR | |
737 | sld rWORD4_SHIFT, rWORD4, rSHL | |
738 | bne cr5, L(duLcr5) | |
739 | or rWORD4, r12, rWORD2_SHIFT | |
ce6615c9 AZ |
740 | LD rWORD5, rOFF24, rSTR1 |
741 | LD rWORD6, rOFF24, rSTR2 | |
fe6e95d7 AM |
742 | cmpld cr1, rWORD3, rWORD4 |
743 | srd r0, rWORD6, rSHR | |
744 | sld rWORD6_SHIFT, rWORD6, rSHL | |
745 | bne cr7, L(duLcr7) | |
746 | or rWORD6, r0, rWORD4_SHIFT | |
747 | cmpld cr6, rWORD5, rWORD6 | |
158db122 LM |
748 | b L(duLoop3) |
749 | .align 4 | |
750 | /* At this point we exit early with the first double word compare | |
751 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
752 | how we handle the remaining bytes. */ | |
753 | L(duP1x): | |
fe6e95d7 AM |
754 | cmpld cr5, rWORD7, rWORD8 |
755 | sldi. rN, rN, 3 | |
756 | bne cr5, L(duLcr5) | |
757 | cmpld cr7, rN, rSHR | |
158db122 | 758 | beq L(duZeroReturn) |
fe6e95d7 AM |
759 | li r0, 0 |
760 | ble cr7, L(dutrim) | |
ce6615c9 | 761 | LD rWORD2, rOFF8, rSTR2 |
fe6e95d7 | 762 | srd r0, rWORD2, rSHR |
158db122 LM |
763 | b L(dutrim) |
764 | /* Remainder is 16 */ | |
765 | .align 4 | |
766 | L(duP2): | |
fe6e95d7 | 767 | srd r0, rWORD8, rSHR |
ce6615c9 | 768 | LD rWORD5, 0, rSTR1 |
fe6e95d7 AM |
769 | or rWORD6, r0, rWORD6_SHIFT |
770 | sld rWORD6_SHIFT, rWORD8, rSHL | |
158db122 | 771 | L(duP2e): |
ce6615c9 AZ |
772 | LD rWORD7, rOFF8, rSTR1 |
773 | LD rWORD8, rOFF8, rSTR2 | |
fe6e95d7 AM |
774 | cmpld cr6, rWORD5, rWORD6 |
775 | srd r12, rWORD8, rSHR | |
776 | sld rWORD8_SHIFT, rWORD8, rSHL | |
777 | or rWORD8, r12, rWORD6_SHIFT | |
778 | blt cr7, L(duP2x) | |
ce6615c9 AZ |
779 | LD rWORD1, rOFF16, rSTR1 |
780 | LD rWORD2, rOFF16, rSTR2 | |
fe6e95d7 AM |
781 | cmpld cr5, rWORD7, rWORD8 |
782 | bne cr6, L(duLcr6) | |
783 | srd r0, rWORD2, rSHR | |
784 | sld rWORD2_SHIFT, rWORD2, rSHL | |
785 | or rWORD2, r0, rWORD8_SHIFT | |
ce6615c9 AZ |
786 | LD rWORD3, rOFF24, rSTR1 |
787 | LD rWORD4, rOFF24, rSTR2 | |
fe6e95d7 AM |
788 | cmpld cr7, rWORD1, rWORD2 |
789 | bne cr5, L(duLcr5) | |
790 | srd r12, rWORD4, rSHR | |
791 | sld rWORD4_SHIFT, rWORD4, rSHL | |
792 | or rWORD4, r12, rWORD2_SHIFT | |
fe6e95d7 AM |
793 | addi rSTR1, rSTR1, 8 |
794 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 795 | cmpld cr1, rWORD3, rWORD4 |
158db122 LM |
796 | b L(duLoop2) |
797 | .align 4 | |
798 | L(duP2x): | |
fe6e95d7 | 799 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
800 | addi rSTR1, rSTR1, 8 |
801 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 AM |
802 | bne cr6, L(duLcr6) |
803 | sldi. rN, rN, 3 | |
804 | bne cr5, L(duLcr5) | |
805 | cmpld cr7, rN, rSHR | |
158db122 | 806 | beq L(duZeroReturn) |
fe6e95d7 AM |
807 | li r0, 0 |
808 | ble cr7, L(dutrim) | |
ce6615c9 | 809 | LD rWORD2, rOFF8, rSTR2 |
fe6e95d7 | 810 | srd r0, rWORD2, rSHR |
158db122 LM |
811 | b L(dutrim) |
812 | ||
813 | /* Remainder is 24 */ | |
814 | .align 4 | |
815 | L(duP3): | |
fe6e95d7 | 816 | srd r12, rWORD8, rSHR |
ce6615c9 | 817 | LD rWORD3, 0, rSTR1 |
fe6e95d7 AM |
818 | sld rWORD4_SHIFT, rWORD8, rSHL |
819 | or rWORD4, r12, rWORD6_SHIFT | |
158db122 | 820 | L(duP3e): |
ce6615c9 AZ |
821 | LD rWORD5, rOFF8, rSTR1 |
822 | LD rWORD6, rOFF8, rSTR2 | |
fe6e95d7 AM |
823 | cmpld cr1, rWORD3, rWORD4 |
824 | srd r0, rWORD6, rSHR | |
825 | sld rWORD6_SHIFT, rWORD6, rSHL | |
826 | or rWORD6, r0, rWORD4_SHIFT | |
ce6615c9 AZ |
827 | LD rWORD7, rOFF16, rSTR1 |
828 | LD rWORD8, rOFF16, rSTR2 | |
fe6e95d7 AM |
829 | cmpld cr6, rWORD5, rWORD6 |
830 | bne cr1, L(duLcr1) | |
831 | srd r12, rWORD8, rSHR | |
832 | sld rWORD8_SHIFT, rWORD8, rSHL | |
833 | or rWORD8, r12, rWORD6_SHIFT | |
834 | blt cr7, L(duP3x) | |
ce6615c9 AZ |
835 | LD rWORD1, rOFF24, rSTR1 |
836 | LD rWORD2, rOFF24, rSTR2 | |
fe6e95d7 AM |
837 | cmpld cr5, rWORD7, rWORD8 |
838 | bne cr6, L(duLcr6) | |
839 | srd r0, rWORD2, rSHR | |
840 | sld rWORD2_SHIFT, rWORD2, rSHL | |
841 | or rWORD2, r0, rWORD8_SHIFT | |
fe6e95d7 AM |
842 | addi rSTR1, rSTR1, 16 |
843 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 | 844 | cmpld cr7, rWORD1, rWORD2 |
158db122 LM |
845 | b L(duLoop1) |
846 | .align 4 | |
847 | L(duP3x): | |
fe6e95d7 AM |
848 | addi rSTR1, rSTR1, 16 |
849 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
850 | cmpld cr5, rWORD7, rWORD8 |
851 | bne cr6, L(duLcr6) | |
852 | sldi. rN, rN, 3 | |
853 | bne cr5, L(duLcr5) | |
854 | cmpld cr7, rN, rSHR | |
158db122 | 855 | beq L(duZeroReturn) |
fe6e95d7 AM |
856 | li r0, 0 |
857 | ble cr7, L(dutrim) | |
ce6615c9 | 858 | LD rWORD2, rOFF8, rSTR2 |
fe6e95d7 | 859 | srd r0, rWORD2, rSHR |
158db122 LM |
860 | b L(dutrim) |
861 | ||
862 | /* Count is a multiple of 32, remainder is 0 */ | |
863 | .align 4 | |
864 | L(duP4): | |
fe6e95d7 AM |
865 | mtctr r0 |
866 | srd r0, rWORD8, rSHR | |
ce6615c9 | 867 | LD rWORD1, 0, rSTR1 |
fe6e95d7 AM |
868 | sld rWORD2_SHIFT, rWORD8, rSHL |
869 | or rWORD2, r0, rWORD6_SHIFT | |
158db122 | 870 | L(duP4e): |
ce6615c9 AZ |
871 | LD rWORD3, rOFF8, rSTR1 |
872 | LD rWORD4, rOFF8, rSTR2 | |
fe6e95d7 AM |
873 | cmpld cr7, rWORD1, rWORD2 |
874 | srd r12, rWORD4, rSHR | |
875 | sld rWORD4_SHIFT, rWORD4, rSHL | |
876 | or rWORD4, r12, rWORD2_SHIFT | |
ce6615c9 AZ |
877 | LD rWORD5, rOFF16, rSTR1 |
878 | LD rWORD6, rOFF16, rSTR2 | |
fe6e95d7 AM |
879 | cmpld cr1, rWORD3, rWORD4 |
880 | bne cr7, L(duLcr7) | |
881 | srd r0, rWORD6, rSHR | |
882 | sld rWORD6_SHIFT, rWORD6, rSHL | |
883 | or rWORD6, r0, rWORD4_SHIFT | |
ce6615c9 AZ |
884 | LD rWORD7, rOFF24, rSTR1 |
885 | LD rWORD8, rOFF24, rSTR2 | |
886 | addi rSTR1, rSTR1, 24 | |
887 | addi rSTR2, rSTR2, 24 | |
fe6e95d7 AM |
888 | cmpld cr6, rWORD5, rWORD6 |
889 | bne cr1, L(duLcr1) | |
890 | srd r12, rWORD8, rSHR | |
891 | sld rWORD8_SHIFT, rWORD8, rSHL | |
892 | or rWORD8, r12, rWORD6_SHIFT | |
893 | cmpld cr5, rWORD7, rWORD8 | |
158db122 LM |
894 | bdz L(du24) /* Adjust CTR as we start with +4 */ |
895 | /* This is the primary loop */ | |
896 | .align 4 | |
897 | L(duLoop): | |
ce6615c9 AZ |
898 | LD rWORD1, rOFF8, rSTR1 |
899 | LD rWORD2, rOFF8, rSTR2 | |
fe6e95d7 AM |
900 | cmpld cr1, rWORD3, rWORD4 |
901 | bne cr6, L(duLcr6) | |
902 | srd r0, rWORD2, rSHR | |
903 | sld rWORD2_SHIFT, rWORD2, rSHL | |
904 | or rWORD2, r0, rWORD8_SHIFT | |
158db122 | 905 | L(duLoop1): |
ce6615c9 AZ |
906 | LD rWORD3, rOFF16, rSTR1 |
907 | LD rWORD4, rOFF16, rSTR2 | |
fe6e95d7 AM |
908 | cmpld cr6, rWORD5, rWORD6 |
909 | bne cr5, L(duLcr5) | |
910 | srd r12, rWORD4, rSHR | |
911 | sld rWORD4_SHIFT, rWORD4, rSHL | |
912 | or rWORD4, r12, rWORD2_SHIFT | |
158db122 | 913 | L(duLoop2): |
ce6615c9 AZ |
914 | LD rWORD5, rOFF24, rSTR1 |
915 | LD rWORD6, rOFF24, rSTR2 | |
fe6e95d7 AM |
916 | cmpld cr5, rWORD7, rWORD8 |
917 | bne cr7, L(duLcr7) | |
918 | srd r0, rWORD6, rSHR | |
919 | sld rWORD6_SHIFT, rWORD6, rSHL | |
920 | or rWORD6, r0, rWORD4_SHIFT | |
158db122 | 921 | L(duLoop3): |
ce6615c9 AZ |
922 | LD rWORD7, rOFF32, rSTR1 |
923 | LD rWORD8, rOFF32, rSTR2 | |
924 | addi rSTR1, rSTR1, 32 | |
925 | addi rSTR2, rSTR2, 32 | |
fe6e95d7 AM |
926 | cmpld cr7, rWORD1, rWORD2 |
927 | bne cr1, L(duLcr1) | |
928 | srd r12, rWORD8, rSHR | |
929 | sld rWORD8_SHIFT, rWORD8, rSHL | |
930 | or rWORD8, r12, rWORD6_SHIFT | |
158db122 LM |
931 | bdnz L(duLoop) |
932 | ||
933 | L(duL4): | |
fe6e95d7 AM |
934 | cmpld cr1, rWORD3, rWORD4 |
935 | bne cr6, L(duLcr6) | |
936 | cmpld cr6, rWORD5, rWORD6 | |
937 | bne cr5, L(duLcr5) | |
938 | cmpld cr5, rWORD7, rWORD8 | |
158db122 | 939 | L(du44): |
fe6e95d7 | 940 | bne cr7, L(duLcr7) |
158db122 | 941 | L(du34): |
fe6e95d7 | 942 | bne cr1, L(duLcr1) |
158db122 | 943 | L(du24): |
fe6e95d7 | 944 | bne cr6, L(duLcr6) |
158db122 | 945 | L(du14): |
fe6e95d7 AM |
946 | sldi. rN, rN, 3 |
947 | bne cr5, L(duLcr5) | |
158db122 | 948 | /* At this point we have a remainder of 1 to 7 bytes to compare. We use |
2ccdea26 | 949 | shift right doubleword (srd) to eliminate bits beyond the compare length. |
158db122 | 950 | |
1dc24235 | 951 | However it may not be safe to load rWORD2 which may be beyond the |
158db122 LM |
952 | string length. So we compare the bit length of the remainder to |
953 | the right shift count (rSHR). If the bit count is less than or equal to rSHR | |
954 | we do not need to load rWORD2 (all significant bits are already in | |
fe6e95d7 AM |
955 | rWORD8_SHIFT). */ |
956 | cmpld cr7, rN, rSHR | |
158db122 | 957 | beq L(duZeroReturn) |
fe6e95d7 AM |
958 | li r0, 0 |
959 | ble cr7, L(dutrim) | |
ce6615c9 | 960 | LD rWORD2, rOFF8, rSTR2 |
fe6e95d7 | 961 | srd r0, rWORD2, rSHR |
158db122 LM |
962 | .align 4 |
963 | L(dutrim): | |
ce6615c9 | 964 | LD rWORD1, rOFF8, rSTR1 |
fe6e95d7 AM |
965 | ld rWORD8, -8(r1) |
966 | subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ | |
967 | or rWORD2, r0, rWORD8_SHIFT | |
ce6615c9 AZ |
968 | ld rWORD7, rWORD7SAVE(r1) |
969 | ld rSHL, rSHLSAVE(r1) | |
fe6e95d7 AM |
970 | srd rWORD1, rWORD1, rN |
971 | srd rWORD2, rWORD2, rN | |
ce6615c9 AZ |
972 | ld rSHR, rSHRSAVE(r1) |
973 | ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) | |
fe6e95d7 AM |
974 | li rRTN, 0 |
975 | cmpld cr7, rWORD1, rWORD2 | |
ce6615c9 AZ |
976 | ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) |
977 | ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) | |
fe6e95d7 AM |
978 | beq cr7, L(dureturn24) |
979 | li rRTN, 1 | |
ce6615c9 AZ |
980 | ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
981 | ld rOFF8, rOFF8SAVE(r1) | |
982 | ld rOFF16, rOFF16SAVE(r1) | |
983 | ld rOFF24, rOFF24SAVE(r1) | |
984 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 AM |
985 | bgtlr cr7 |
986 | li rRTN, -1 | |
158db122 LM |
987 | blr |
988 | .align 4 | |
fe6e95d7 | 989 | L(duLcr7): |
ce6615c9 AZ |
990 | ld rWORD8, rWORD8SAVE(r1) |
991 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 AM |
992 | li rRTN, 1 |
993 | bgt cr7, L(dureturn29) | |
ce6615c9 AZ |
994 | ld rSHL, rSHLSAVE(r1) |
995 | ld rSHR, rSHRSAVE(r1) | |
fe6e95d7 | 996 | li rRTN, -1 |
158db122 LM |
997 | b L(dureturn27) |
998 | .align 4 | |
999 | L(duLcr1): | |
ce6615c9 AZ |
1000 | ld rWORD8, rWORD8SAVE(r1) |
1001 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 AM |
1002 | li rRTN, 1 |
1003 | bgt cr1, L(dureturn29) | |
ce6615c9 AZ |
1004 | ld rSHL, rSHLSAVE(r1) |
1005 | ld rSHR, rSHRSAVE(r1) | |
fe6e95d7 | 1006 | li rRTN, -1 |
158db122 LM |
1007 | b L(dureturn27) |
1008 | .align 4 | |
1009 | L(duLcr6): | |
ce6615c9 AZ |
1010 | ld rWORD8, rWORD8SAVE(r1) |
1011 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 AM |
1012 | li rRTN, 1 |
1013 | bgt cr6, L(dureturn29) | |
ce6615c9 AZ |
1014 | ld rSHL, rSHLSAVE(r1) |
1015 | ld rSHR, rSHRSAVE(r1) | |
fe6e95d7 | 1016 | li rRTN, -1 |
158db122 LM |
1017 | b L(dureturn27) |
1018 | .align 4 | |
1019 | L(duLcr5): | |
ce6615c9 AZ |
1020 | ld rWORD8, rWORD8SAVE(r1) |
1021 | ld rWORD7, rWORD7SAVE(r1) | |
fe6e95d7 AM |
1022 | li rRTN, 1 |
1023 | bgt cr5, L(dureturn29) | |
ce6615c9 AZ |
1024 | ld rSHL, rSHLSAVE(r1) |
1025 | ld rSHR, rSHRSAVE(r1) | |
fe6e95d7 | 1026 | li rRTN, -1 |
158db122 | 1027 | b L(dureturn27) |
ce6615c9 | 1028 | |
158db122 LM |
1029 | .align 3 |
1030 | L(duZeroReturn): | |
fe6e95d7 | 1031 | li rRTN, 0 |
158db122 LM |
1032 | .align 4 |
1033 | L(dureturn): | |
ce6615c9 AZ |
1034 | ld rWORD8, rWORD8SAVE(r1) |
1035 | ld rWORD7, rWORD7SAVE(r1) | |
158db122 | 1036 | L(dureturn29): |
ce6615c9 AZ |
1037 | ld rSHL, rSHLSAVE(r1) |
1038 | ld rSHR, rSHRSAVE(r1) | |
158db122 | 1039 | L(dureturn27): |
ce6615c9 AZ |
1040 | ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) |
1041 | ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) | |
1042 | ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) | |
158db122 | 1043 | L(dureturn24): |
ce6615c9 AZ |
1044 | ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) |
1045 | ld rOFF8, rOFF8SAVE(r1) | |
1046 | ld rOFF16, rOFF16SAVE(r1) | |
1047 | ld rOFF24, rOFF24SAVE(r1) | |
1048 | ld rOFF32, rOFF32SAVE(r1) | |
158db122 | 1049 | blr |
ce6615c9 | 1050 | |
158db122 | 1051 | L(duzeroLength): |
ce6615c9 AZ |
1052 | ld rOFF8, rOFF8SAVE(r1) |
1053 | ld rOFF16, rOFF16SAVE(r1) | |
1054 | ld rOFF24, rOFF24SAVE(r1) | |
1055 | ld rOFF32, rOFF32SAVE(r1) | |
fe6e95d7 | 1056 | li rRTN, 0 |
158db122 LM |
1057 | blr |
1058 | ||
b6a66222 | 1059 | END (MEMCMP) |
158db122 | 1060 | libc_hidden_builtin_def (memcmp) |
fe6e95d7 | 1061 | weak_alias (memcmp, bcmp) |