/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
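
/* memcmp compares rN bytes a word (4 bytes) at a time where possible,
   unrolled to cover 16 bytes per loop iteration.  Three main paths are
   used: a word aligned loop when both strings share the same word
   alignment, a shift-and-merge loop when the alignments differ, and a
   simple byte loop for short (less than 12 byte) compares.  */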

EALIGN (BP_SYM(memcmp), 4, 0)
        CALL_MCOUNT

#define rTMP    r0
#define rRTN    r3
#define rSTR1   r3      /* first string arg */
#define rSTR2   r4      /* second string arg */
#define rN      r5      /* max string length */
#define rWORD1  r6      /* current word in s1 */
#define rWORD2  r7      /* current word in s2 */
#define rWORD3  r8      /* next word in s1 */
#define rWORD4  r9      /* next word in s2 */
#define rWORD5  r10     /* next word in s1 */
#define rWORD6  r11     /* next word in s2 */
#define rBITDIF r12     /* bits that differ in s1 & s2 words */
#define rWORD7  r30     /* next word in s1 */
#define rWORD8  r31     /* next word in s2 */

        xor rTMP, rSTR2, rSTR1
        cmplwi cr6, rN, 0
        cmplwi cr1, rN, 12
        clrlwi. rTMP, rTMP, 30
        clrlwi rBITDIF, rSTR1, 30
        cmplwi cr5, rBITDIF, 0
        beq- cr6, L(zeroLength)
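/* Prefetch (touch) the first cache line of each string.  */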
        dcbt 0,rSTR1
        dcbt 0,rSTR2
/* If the compare length is less than 12 bytes, use the byte compare
   loop.  */
        blt cr1, L(bytealigned)
        stwu 1,-64(1)
        cfi_adjust_cfa_offset(64)
        stw r31,48(1)
        cfi_offset(31,(48-64))
        stw r30,44(1)
        cfi_offset(30,(44-64))
        bne L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
        .align 4
L(samealignment):
        clrrwi rSTR1, rSTR1, 2
        clrrwi rSTR2, rSTR2, 2
        beq cr5, L(Waligned)
        add rN, rN, rBITDIF
        slwi r11, rBITDIF, 3
        srwi rTMP, rN, 4 /* Divide by 16 */
        andi. rBITDIF, rN, 12 /* Get the word remainder */
        lwz rWORD1, 0(rSTR1)
        lwz rWORD2, 0(rSTR2)
        cmplwi cr1, rBITDIF, 8
        cmplwi cr7, rN, 16
        clrlwi rN, rN, 30
        beq L(dPs4)
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        bgt cr1, L(dPs3)
        beq cr1, L(dPs2)

/* Remainder is 4 */
        .align 3
L(dsP1):
        slw rWORD5, rWORD1, r11
        slw rWORD6, rWORD2, r11
        cmplw cr5, rWORD5, rWORD6
        blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        b L(dP1e)
/* Remainder is 8 */
        .align 4
L(dPs2):
        slw rWORD5, rWORD1, r11
        slw rWORD6, rWORD2, r11
        cmplw cr6, rWORD5, rWORD6
        blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
        lwz rWORD7, 4(rSTR1)
        lwz rWORD8, 4(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        b L(dP2e)
/* Remainder is 12 */
        .align 4
L(dPs3):
        slw rWORD3, rWORD1, r11
        slw rWORD4, rWORD2, r11
        cmplw cr1, rWORD3, rWORD4
        b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
        .align 4
L(dPs4):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        slw rWORD1, rWORD1, r11
        slw rWORD2, rWORD2, r11
        cmplw cr0, rWORD1, rWORD2
        b L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
        .align 4
L(Waligned):
        andi. rBITDIF, rN, 12 /* Get the word remainder */
        srwi rTMP, rN, 4 /* Divide by 16 */
        cmplwi cr1, rBITDIF, 8
        cmplwi cr7, rN, 16
        clrlwi rN, rN, 30
        beq L(dP4)
        bgt cr1, L(dP3)
        beq cr1, L(dP2)

/* Remainder is 4 */
        .align 4
L(dP1):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
        lwz rWORD5, 0(rSTR1)
        lwz rWORD6, 0(rSTR2)
        cmplw cr5, rWORD5, rWORD6
        blt cr7, L(dP1x)
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr0, rWORD1, rWORD2
L(dP1e):
        lwz rWORD3, 8(rSTR1)
        lwz rWORD4, 8(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        bne cr5, L(dLcr5)
        bne cr0, L(dLcr0)

        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        bne cr1, L(dLcr1)
        cmplw cr5, rWORD7, rWORD8
        bdnz L(dLoop)
        bne cr6, L(dLcr6)
        lwz r30,44(1)
        lwz r31,48(1)
        .align 3
L(dP1x):
        slwi. r12, rN, 3
        bne cr5, L(dLcr5)
        subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
        lwz 1,0(1)
        bne L(d00)
        li rRTN, 0
        blr

/* Remainder is 8 */
        .align 4
L(dP2):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        lwz rWORD5, 0(rSTR1)
        lwz rWORD6, 0(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        blt cr7, L(dP2x)
        lwz rWORD7, 4(rSTR1)
        lwz rWORD8, 4(rSTR2)
        cmplw cr5, rWORD7, rWORD8
L(dP2e):
        lwz rWORD1, 8(rSTR1)
        lwz rWORD2, 8(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        lwz rWORD3, 12(rSTR1)
        lwz rWORD4, 12(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        bne cr6, L(dLcr6)
        bne cr5, L(dLcr5)
        b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
        .align 4
L(dP2x):
        lwz rWORD3, 4(rSTR1)
        lwz rWORD4, 4(rSTR2)
        cmplw cr5, rWORD3, rWORD4
        slwi. r12, rN, 3
        bne cr6, L(dLcr6)
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        bne cr5, L(dLcr5)
        subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
        lwz 1,0(1)
        bne L(d00)
        li rRTN, 0
        blr

/* Remainder is 12 */
        .align 4
L(dP3):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        lwz rWORD3, 0(rSTR1)
        lwz rWORD4, 0(rSTR2)
        cmplw cr1, rWORD3, rWORD4
L(dP3e):
        lwz rWORD5, 4(rSTR1)
        lwz rWORD6, 4(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        blt cr7, L(dP3x)
        lwz rWORD7, 8(rSTR1)
        lwz rWORD8, 8(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        lwz rWORD1, 12(rSTR1)
        lwz rWORD2, 12(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
        bne cr1, L(dLcr1)
        bne cr6, L(dLcr6)
        b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
        .align 4
L(dP3x):
        lwz rWORD1, 8(rSTR1)
        lwz rWORD2, 8(rSTR2)
        cmplw cr5, rWORD1, rWORD2
        slwi. r12, rN, 3
        bne cr1, L(dLcr1)
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
        bne cr6, L(dLcr6)
        subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
        bne cr5, L(dLcr5)
        lwz 1,0(1)
        bne L(d00)
        li rRTN, 0
        blr

/* Count is a multiple of 16, remainder is 0 */
        .align 4
L(dP4):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        lwz rWORD1, 0(rSTR1)
        lwz rWORD2, 0(rSTR2)
        cmplw cr0, rWORD1, rWORD2
L(dP4e):
        lwz rWORD3, 4(rSTR1)
        lwz rWORD4, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        lwz rWORD5, 8(rSTR1)
        lwz rWORD6, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        lwzu rWORD7, 12(rSTR1)
        lwzu rWORD8, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bne cr0, L(dLcr0)
        bne cr1, L(dLcr1)
        bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
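/* Each pass compares four word pairs (16 bytes).  The loads for one
   group are issued ahead of the compares and branches for the words
   loaded on the previous pass, hiding the load-to-compare latency.  */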
        .align 4
L(dLoop):
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        bne cr6, L(dLcr6)
L(dLoop1):
        lwz rWORD3, 8(rSTR1)
        lwz rWORD4, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        bne cr5, L(dLcr5)
L(dLoop2):
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bne cr0, L(dLcr0)
L(dLoop3):
        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        bne- cr1, L(dLcr1)
        cmplw cr0, rWORD1, rWORD2
        bdnz+ L(dLoop)

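/* The loop exits with up to four word compares still outstanding;
   finish them here, testing the words in order.  */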
L(dL4):
        cmplw cr1, rWORD3, rWORD4
        bne cr6, L(dLcr6)
        cmplw cr6, rWORD5, rWORD6
        bne cr5, L(dLcr5)
        cmplw cr5, rWORD7, rWORD8
L(d44):
        bne cr0, L(dLcr0)
L(d34):
        bne cr1, L(dLcr1)
L(d24):
        bne cr6, L(dLcr6)
L(d14):
        slwi. r12, rN, 3
        bne cr5, L(dLcr5)
L(d04):
        lwz r30,44(1)
        lwz r31,48(1)
        lwz 1,0(1)
        subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
        beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
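/* For example, with 2 bytes remaining, r12 is 16 and rN becomes
   32 - 16 = 16, so the srw below discards the low-order 16 bits and
   only the two remaining (big-endian high-order) bytes take part in
   the unsigned compare.  */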
L(d00):
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        srw rWORD1, rWORD1, rN
        srw rWORD2, rWORD2, rN
        cmplw rWORD1,rWORD2
        li rRTN,0
        beqlr
        li rRTN,1
        bgtlr
        li rRTN,-1
        blr

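/* Each L(dLcr*) exit converts the unsigned word compare recorded in
   its CR field into the memcmp result: 1 if the s1 word compared
   greater, otherwise -1.  The saved non-volatile registers are
   restored and the stack frame is released before returning.  */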
        .align 4
L(dLcr0):
        lwz r30,44(1)
        lwz r31,48(1)
        li rRTN, 1
        lwz 1,0(1)
        bgtlr cr0
        li rRTN, -1
        blr
        .align 4
L(dLcr1):
        lwz r30,44(1)
        lwz r31,48(1)
        li rRTN, 1
        lwz 1,0(1)
        bgtlr cr1
        li rRTN, -1
        blr
        .align 4
L(dLcr6):
        lwz r30,44(1)
        lwz r31,48(1)
        li rRTN, 1
        lwz 1,0(1)
        bgtlr cr6
        li rRTN, -1
        blr
        .align 4
L(dLcr5):
        lwz r30,44(1)
        lwz r31,48(1)
L(dLcr5x):
        li rRTN, 1
        lwz 1,0(1)
        bgtlr cr5
        li rRTN, -1
        blr

        .align 4
L(bytealigned):
        cfi_adjust_cfa_offset(-64)
        mtctr rN /* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

        lbz rWORD1, 0(rSTR1)
        lbz rWORD2, 0(rSTR2)
        bdz- L(b11)
        cmplw cr0, rWORD1, rWORD2
        lbz rWORD3, 1(rSTR1)
        lbz rWORD4, 1(rSTR2)
        bdz- L(b12)
        cmplw cr1, rWORD3, rWORD4
        lbzu rWORD5, 2(rSTR1)
        lbzu rWORD6, 2(rSTR2)
        bdz- L(b13)
        .align 4
L(bLoop):
        lbzu rWORD1, 1(rSTR1)
        lbzu rWORD2, 1(rSTR2)
        bne- cr0, L(bLcr0)

        cmplw cr6, rWORD5, rWORD6
        bdz- L(b3i)

        lbzu rWORD3, 1(rSTR1)
        lbzu rWORD4, 1(rSTR2)
        bne- cr1, L(bLcr1)

        cmplw cr0, rWORD1, rWORD2
        bdz- L(b2i)

        lbzu rWORD5, 1(rSTR1)
        lbzu rWORD6, 1(rSTR2)
        bne- cr6, L(bLcr6)

        cmplw cr1, rWORD3, rWORD4
        bdnz+ L(bLoop)

/* We speculatively load bytes before we have tested the preceding
   bytes.  But we must avoid overrunning the length (in the CTR) so
   that these speculative loads cannot cause a segfault.  In that case
   the loop exits early, before all pending bytes have been tested,
   and we must complete the pending compares before returning.  */
L(b1i):
        bne- cr0, L(bLcr0)
        bne- cr1, L(bLcr1)
        b L(bx56)
        .align 4
L(b2i):
        bne- cr6, L(bLcr6)
        bne- cr0, L(bLcr0)
        b L(bx34)
        .align 4
L(b3i):
        bne- cr1, L(bLcr1)
        bne- cr6, L(bLcr6)
        b L(bx12)
        .align 4
L(bLcr0):
        li rRTN, 1
        bgtlr cr0
        li rRTN, -1
        blr
L(bLcr1):
        li rRTN, 1
        bgtlr cr1
        li rRTN, -1
        blr
L(bLcr6):
        li rRTN, 1
        bgtlr cr6
        li rRTN, -1
        blr

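/* These tails return the byte difference directly.  Since lbz
   zero-extends, subtracting the two byte values always yields the
   correctly signed memcmp result.  */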
L(b13):
        bne- cr0, L(bx12)
        bne- cr1, L(bx34)
L(bx56):
        sub rRTN, rWORD5, rWORD6
        blr
        nop
L(b12):
        bne- cr0, L(bx12)
L(bx34):
        sub rRTN, rWORD3, rWORD4
        blr

L(b11):
L(bx12):
        sub rRTN, rWORD1, rWORD2
        blr

        .align 4
L(zeroLengthReturn):

L(zeroLength):
        li rRTN, 0
        blr

        cfi_adjust_cfa_offset(64)
        .align 4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is word aligned and we
   can perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.  So we can
   force the string addresses to the next lower word boundary and
   special case this first word using shift left to eliminate bits
   preceding the first byte.  Since we want to join the normal
   (Wunaligned) compare loop, starting at the second word, we need to
   adjust the length (rN) and special case the loop versioning for the
   first W.  This ensures that the loop count is correct and the first
   W (shifted) is in the expected register pair.  */
#define rSHL    r29     /* Unaligned shift left count. */
#define rSHR    r28     /* Unaligned shift right count. */
#define rB      r27     /* Left rotation temp for rWORD2. */
#define rD      r26     /* Left rotation temp for rWORD4. */
#define rF      r25     /* Left rotation temp for rWORD6. */
#define rH      r24     /* Left rotation temp for rWORD8. */
#define rA      r0      /* Right rotation temp for rWORD2. */
#define rC      r12     /* Right rotation temp for rWORD4. */
#define rE      r0      /* Right rotation temp for rWORD6. */
#define rG      r12     /* Right rotation temp for rWORD8. */
L(unaligned):
        stw r29,40(r1)
        cfi_offset(r29,(40-64))
        clrlwi rSHL, rSTR2, 30
        stw r28,36(r1)
        cfi_offset(r28,(36-64))
        beq cr5, L(Wunaligned)
        stw r27,32(r1)
        cfi_offset(r27,(32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
        sub r27, rSTR2, rBITDIF
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
        clrrwi rSTR2, rSTR2, 2
        stw r26,28(r1)
        cfi_offset(r26,(28-64))
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
        clrlwi rSHL, r27, 30
        clrrwi rSTR1, rSTR1, 2
        stw r25,24(r1)
        cfi_offset(r25,(24-64))
        slwi rSHL, rSHL, 3
        cmplw cr5, r27, rSTR2
        add rN, rN, rBITDIF
        slwi r11, rBITDIF, 3
        stw r24,20(r1)
        cfi_offset(r24,(20-64))
        subfic rSHR, rSHL, 32
        srwi rTMP, rN, 4 /* Divide by 16 */
        andi. rBITDIF, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
        li rWORD8, 0
        blt cr5, L(dus0)
        lwz rWORD8, 0(rSTR2)
        la rSTR2, 4(rSTR2)
        slw rWORD8, rWORD8, rSHL

L(dus0):
        lwz rWORD1, 0(rSTR1)
        lwz rWORD2, 0(rSTR2)
        cmplwi cr1, rBITDIF, 8
        cmplwi cr7, rN, 16
        srw rG, rWORD2, rSHR
        clrlwi rN, rN, 30
        beq L(duPs4)
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        or rWORD8, rG, rWORD8
        bgt cr1, L(duPs3)
        beq cr1, L(duPs2)

/* Remainder is 4 */
        .align 4
L(dusP1):
        slw rB, rWORD2, rSHL
        slw rWORD7, rWORD1, r11
        slw rWORD8, rWORD8, r11
        bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
        cmplw cr5, rWORD7, rWORD8
        slwi. rN, rN, 3
        bne cr5, L(duLcr5)
        cmplw cr7, rN, rSHR
        beq L(duZeroReturn)
        li rA, 0
        ble cr7, L(dutrim)
        lwz rWORD2, 4(rSTR2)
        srw rA, rWORD2, rSHR
        b L(dutrim)
/* Remainder is 8 */
        .align 4
L(duPs2):
        slw rH, rWORD2, rSHL
        slw rWORD5, rWORD1, r11
        slw rWORD6, rWORD8, r11
        b L(duP2e)
/* Remainder is 12 */
        .align 4
L(duPs3):
        slw rF, rWORD2, rSHL
        slw rWORD3, rWORD1, r11
        slw rWORD4, rWORD8, r11
        b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
        .align 4
L(duPs4):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        or rWORD8, rG, rWORD8
        slw rD, rWORD2, rSHL
        slw rWORD1, rWORD1, r11
        slw rWORD2, rWORD8, r11
        b L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
        .align 4
L(Wunaligned):
        stw r27,32(r1)
        cfi_offset(r27,(32-64))
        clrrwi rSTR2, rSTR2, 2
        stw r26,28(r1)
        cfi_offset(r26,(28-64))
        srwi rTMP, rN, 4 /* Divide by 16 */
        stw r25,24(r1)
        cfi_offset(r25,(24-64))
        andi. rBITDIF, rN, 12 /* Get the W remainder */
        stw r24,20(r1)
        cfi_offset(r24,(20-64))
        slwi rSHL, rSHL, 3
        lwz rWORD6, 0(rSTR2)
        lwzu rWORD8, 4(rSTR2)
        cmplwi cr1, rBITDIF, 8
        cmplwi cr7, rN, 16
        clrlwi rN, rN, 30
        subfic rSHR, rSHL, 32
        slw rH, rWORD6, rSHL
        beq L(duP4)
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        bgt cr1, L(duP3)
        beq cr1, L(duP2)

/* Remainder is 4 */
        .align 4
L(duP1):
        srw rG, rWORD8, rSHR
        lwz rWORD7, 0(rSTR1)
        slw rB, rWORD8, rSHL
        or rWORD8, rG, rH
        blt cr7, L(duP1x)
L(duP1e):
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        srw rA, rWORD2, rSHR
        slw rD, rWORD2, rSHL
        or rWORD2, rA, rB
        lwz rWORD3, 8(rSTR1)
        lwz rWORD4, 8(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        srw rC, rWORD4, rSHR
        slw rF, rWORD4, rSHL
        bne cr5, L(duLcr5)
        or rWORD4, rC, rD
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        srw rE, rWORD6, rSHR
        slw rH, rWORD6, rSHL
        bne cr0, L(duLcr0)
        or rWORD6, rE, rF
        cmplw cr6, rWORD5, rWORD6
        b L(duLoop3)
        .align 4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
        cmplw cr5, rWORD7, rWORD8
        slwi. rN, rN, 3
        bne cr5, L(duLcr5)
        cmplw cr7, rN, rSHR
        beq L(duZeroReturn)
        li rA, 0
        ble cr7, L(dutrim)
        lwz rWORD2, 4(rSTR2)
        srw rA, rWORD2, rSHR
        b L(dutrim)
/* Remainder is 8 */
        .align 4
L(duP2):
        srw rE, rWORD8, rSHR
        lwz rWORD5, 0(rSTR1)
        or rWORD6, rE, rH
        slw rH, rWORD8, rSHL
L(duP2e):
        lwz rWORD7, 4(rSTR1)
        lwz rWORD8, 4(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        srw rG, rWORD8, rSHR
        slw rB, rWORD8, rSHL
        or rWORD8, rG, rH
        blt cr7, L(duP2x)
        lwz rWORD1, 8(rSTR1)
        lwz rWORD2, 8(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bne cr6, L(duLcr6)
        srw rA, rWORD2, rSHR
        slw rD, rWORD2, rSHL
        or rWORD2, rA, rB
        lwz rWORD3, 12(rSTR1)
        lwz rWORD4, 12(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        bne cr5, L(duLcr5)
        srw rC, rWORD4, rSHR
        slw rF, rWORD4, rSHL
        or rWORD4, rC, rD
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        cmplw cr1, rWORD3, rWORD4
        b L(duLoop2)
        .align 4
L(duP2x):
        cmplw cr5, rWORD7, rWORD8
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        bne cr6, L(duLcr6)
        slwi. rN, rN, 3
        bne cr5, L(duLcr5)
        cmplw cr7, rN, rSHR
        beq L(duZeroReturn)
        li rA, 0
        ble cr7, L(dutrim)
        lwz rWORD2, 4(rSTR2)
        srw rA, rWORD2, rSHR
        b L(dutrim)

/* Remainder is 12 */
        .align 4
L(duP3):
        srw rC, rWORD8, rSHR
        lwz rWORD3, 0(rSTR1)
        slw rF, rWORD8, rSHL
        or rWORD4, rC, rH
L(duP3e):
        lwz rWORD5, 4(rSTR1)
        lwz rWORD6, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        srw rE, rWORD6, rSHR
        slw rH, rWORD6, rSHL
        or rWORD6, rE, rF
        lwz rWORD7, 8(rSTR1)
        lwz rWORD8, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        bne cr1, L(duLcr1)
        srw rG, rWORD8, rSHR
        slw rB, rWORD8, rSHL
        or rWORD8, rG, rH
        blt cr7, L(duP3x)
        lwz rWORD1, 12(rSTR1)
        lwz rWORD2, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bne cr6, L(duLcr6)
        srw rA, rWORD2, rSHR
        slw rD, rWORD2, rSHL
        or rWORD2, rA, rB
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
        cmplw cr0, rWORD1, rWORD2
        b L(duLoop1)
        .align 4
L(duP3x):
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
        bne cr1, L(duLcr1)
        cmplw cr5, rWORD7, rWORD8
        bne cr6, L(duLcr6)
        slwi. rN, rN, 3
        bne cr5, L(duLcr5)
        cmplw cr7, rN, rSHR
        beq L(duZeroReturn)
        li rA, 0
        ble cr7, L(dutrim)
        lwz rWORD2, 4(rSTR2)
        srw rA, rWORD2, rSHR
        b L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
        .align 4
L(duP4):
        mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
        srw rA, rWORD8, rSHR
        lwz rWORD1, 0(rSTR1)
        slw rD, rWORD8, rSHL
        or rWORD2, rA, rH
L(duP4e):
        lwz rWORD3, 4(rSTR1)
        lwz rWORD4, 4(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        srw rC, rWORD4, rSHR
        slw rF, rWORD4, rSHL
        or rWORD4, rC, rD
        lwz rWORD5, 8(rSTR1)
        lwz rWORD6, 8(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        bne cr0, L(duLcr0)
        srw rE, rWORD6, rSHR
        slw rH, rWORD6, rSHL
        or rWORD6, rE, rF
        lwzu rWORD7, 12(rSTR1)
        lwzu rWORD8, 12(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        bne cr1, L(duLcr1)
        srw rG, rWORD8, rSHR
        slw rB, rWORD8, rSHL
        or rWORD8, rG, rH
        cmplw cr5, rWORD7, rWORD8
        bdz- L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
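/* As in the aligned loop, each pass covers 16 bytes, but every rSTR2
   word is first reassembled from two adjacent words with srw/slw/or
   to line it up with the corresponding rSTR1 word.  */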
        .align 4
L(duLoop):
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        bne cr6, L(duLcr6)
        srw rA, rWORD2, rSHR
        slw rD, rWORD2, rSHL
        or rWORD2, rA, rB
L(duLoop1):
        lwz rWORD3, 8(rSTR1)
        lwz rWORD4, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        bne cr5, L(duLcr5)
        srw rC, rWORD4, rSHR
        slw rF, rWORD4, rSHL
        or rWORD4, rC, rD
L(duLoop2):
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bne cr0, L(duLcr0)
        srw rE, rWORD6, rSHR
        slw rH, rWORD6, rSHL
        or rWORD6, rE, rF
L(duLoop3):
        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        cmplw cr0, rWORD1, rWORD2
        bne- cr1, L(duLcr1)
        srw rG, rWORD8, rSHR
        slw rB, rWORD8, rSHL
        or rWORD8, rG, rH
        bdnz+ L(duLoop)

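/* Loop exit: drain the word compares still pending from the last
   pass, in order, before handling the final 0-3 bytes.  */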
L(duL4):
        bne cr1, L(duLcr1)
        cmplw cr1, rWORD3, rWORD4
        bne cr6, L(duLcr6)
        cmplw cr6, rWORD5, rWORD6
        bne cr5, L(duLcr5)
        cmplw cr5, rWORD7, rWORD8
L(du44):
        bne cr0, L(duLcr0)
L(du34):
        bne cr1, L(duLcr1)
L(du24):
        bne cr6, L(duLcr6)
L(du14):
        slwi. rN, rN, 3
        bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We
   use shift right to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2, which may lie beyond the
   end of the string.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or
   equal to the shift count, we do not need to load rWORD2 (all
   significant bits are already in rB).  */
        cmplw cr7, rN, rSHR
        beq L(duZeroReturn)
        li rA, 0
        ble cr7, L(dutrim)
        lwz rWORD2, 4(rSTR2)
        srw rA, rWORD2, rSHR
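/* Trim both final words to the remaining length, then compare and set
   the return value while restoring the non-volatile registers.  */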
        .align 4
L(dutrim):
        lwz rWORD1, 4(rSTR1)
        lwz r31,48(1)
        subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
        or rWORD2, rA, rB
        lwz r30,44(1)
        lwz r29,40(r1)
        srw rWORD1, rWORD1, rN
        srw rWORD2, rWORD2, rN
        lwz r28,36(r1)
        lwz r27,32(r1)
        cmplw rWORD1,rWORD2
        li rRTN,0
        beq L(dureturn26)
        li rRTN,1
        bgt L(dureturn26)
        li rRTN,-1
        b L(dureturn26)
        .align 4
L(duLcr0):
        lwz r31,48(1)
        lwz r30,44(1)
        li rRTN, 1
        bgt cr0, L(dureturn29)
        lwz r29,40(r1)
        lwz r28,36(r1)
        li rRTN, -1
        b L(dureturn27)
        .align 4
L(duLcr1):
        lwz r31,48(1)
        lwz r30,44(1)
        li rRTN, 1
        bgt cr1, L(dureturn29)
        lwz r29,40(r1)
        lwz r28,36(r1)
        li rRTN, -1
        b L(dureturn27)
        .align 4
L(duLcr6):
        lwz r31,48(1)
        lwz r30,44(1)
        li rRTN, 1
        bgt cr6, L(dureturn29)
        lwz r29,40(r1)
        lwz r28,36(r1)
        li rRTN, -1
        b L(dureturn27)
        .align 4
L(duLcr5):
        lwz r31,48(1)
        lwz r30,44(1)
        li rRTN, 1
        bgt cr5, L(dureturn29)
        lwz r29,40(r1)
        lwz r28,36(r1)
        li rRTN, -1
        b L(dureturn27)
        .align 3
L(duZeroReturn):
        li rRTN,0
        .align 4
L(dureturn):
        lwz r31,48(1)
        lwz r30,44(1)
L(dureturn29):
        lwz r29,40(r1)
        lwz r28,36(r1)
L(dureturn27):
        lwz r27,32(r1)
L(dureturn26):
        lwz r26,28(r1)
L(dureturn25):
        lwz r25,24(r1)
        lwz r24,20(r1)
        lwz 1,0(1)
        blr
END (BP_SYM (memcmp))

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)