/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])

   Compare the first size bytes of s1 and s2.  Returns a negative,
   zero, or positive value as s1 is less than, equal to, or greater
   than s2, comparing bytes as unsigned values (the word compares
   below are all unsigned cmpld; the result is materialized as
   li rRTN, 1 / li rRTN, -1 or a byte subtract).  */

EALIGN (BP_SYM(memcmp), 4, 0)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
/* Note: The Bounded pointer support in this code is broken.  This code
   was inherited from PPC32 and that support was never completed.
   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	rTMP, rTMP, 61
	clrldi	rBITDIF, rSTR1, 61
	cmpldi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8,-8(r1)
	cfi_offset(rWORD8,-8)
	std	rWORD7,-16(r1)
	cfi_offset(rWORD7,-16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
   aligned and can perform the DWaligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we can force the string addresses to the next lower DW
   boundary and special case this first DW word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD2, r11
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD2, r11
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	srdi	rTMP, rN, 5	/* Divide by 32 */
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP1e):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)

	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr5, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne-	cr1, L(dLcr1)
	cmpld	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

/* Loop fell through: drain the compares still pending in cr0/cr1/cr5/cr6.  */
L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
L(d00):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr5, rWORD1, rWORD2
	bne	cr5, L(dLcr5x)
	li	rRTN, 0
	blr
/* A difference was found; translate the unsigned compare recorded in
   the given CR field into the signed +1/-1 result, restoring the
   saved non-volatile registers first.  */
	.align	4
L(dLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
	beq-	cr6, L(zeroLength)

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr0, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all of the pending bytes are
   tested).  In this case we must complete the pending operations
   before returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLengthReturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
   aligned and can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not already DW aligned yet.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
#define rSHL	r29	/* Unaligned shift left count.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rB	r27	/* Left rotation temp for rWORD2.  */
#define rD	r26	/* Left rotation temp for rWORD4.  */
#define rF	r25	/* Left rotation temp for rWORD6.  */
#define rH	r24	/* Left rotation temp for rWORD8.  */
#define rA	r0	/* Right rotation temp for rWORD2.  */
#define rC	r12	/* Right rotation temp for rWORD4.  */
#define rE	r0	/* Right rotation temp for rWORD6.  */
#define rG	r12	/* Right rotation temp for rWORD8.  */
L(unaligned):
	std	r29,-24(r1)
	cfi_offset(r29,-24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	r28,-32(r1)
	cfi_offset(r28,-32)
	beq	cr5, L(DWunaligned)
	std	r27,-40(r1)
	cfi_offset(r27,-40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	r27, rSTR2, rBITDIF
/* But do not attempt to address the DW before the DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, r27, 61
	clrrdi	rSTR1, rSTR1, 3
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, r27, rSTR2
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	subfic	rSHR, rSHL, 64
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	ld	rWORD8, 0(rSTR2)
	la	rSTR2, 8(rSTR2)
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	srd	rG, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rB, rWORD2, rSHL
	sld	rWORD7, rWORD1, r11
	sld	rWORD8, rWORD8, r11
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rH, rWORD2, rSHL
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD8, r11
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rF, rWORD2, rSHL
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD8, r11
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	sld	rD, rWORD2, rSHL
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD8, r11
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	r27,-40(r1)
	cfi_offset(r27,-40)
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
	srdi	rTMP, rN, 5	/* Divide by 32 */
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	sldi	rSHL, rSHL, 3
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rH, rWORD6, rSHL
	beq	L(duP4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	rG, rWORD8, rSHR
	ld	rWORD7, 0(rSTR1)
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP1x)
L(duP1e):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, rC, rD
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	bne	cr0, L(duLcr0)
	or	rWORD6, rE, rF
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	rE, rWORD8, rSHR
	ld	rWORD5, 0(rSTR1)
	or	rWORD6, rE, rH
	sld	rH, rWORD8, rSHL
L(duP2e):
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP2x)
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	rC, rWORD8, rSHR
	ld	rWORD3, 0(rSTR1)
	sld	rF, rWORD8, rSHL
	or	rWORD4, rC, rH
L(duP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP3x)
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr0, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(duLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	srd	rA, rWORD8, rSHR
	ld	rWORD1, 0(rSTR1)
	sld	rD, rWORD8, rSHL
	or	rWORD2, rA, rH
L(duP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
L(duLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
L(duLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
L(duLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	bdnz+	L(duLoop)

/* Loop fell through: drain the compares still pending in cr0/cr1/cr5/cr6.  */
L(duL4):
	bne	cr1, L(duLcr1)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr0, L(duLcr0)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.
   This allows the use of double word subtract to compute the final
   result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rB).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	.align	4
L(dutrim):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD8,-8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, rA, rB
	ld	rWORD7,-16(r1)
	ld	r29,-24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	li	rRTN, 0
	cmpld	cr0, rWORD1, rWORD2
	ld	r26,-48(r1)
	ld	r25,-56(r1)
	beq	cr0, L(dureturn24)
	li	rRTN, 1
	ld	r24,-64(r1)
	bgtlr	cr0
	li	rRTN, -1
	blr
/* A difference was found; translate the unsigned compare recorded in
   the given CR field into the signed +1/-1 result, restoring the
   saved non-volatile registers first.  */
	.align	4
L(duLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr0, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN,0
/* Cascading register-restore exits: fall through from the highest
   saved register down to r24, then return.  */
	.align	4
L(dureturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dureturn29):
	ld	r29,-24(r1)
	ld	r28,-32(r1)
L(dureturn27):
	ld	r27,-40(r1)
L(dureturn26):
	ld	r26,-48(r1)
L(dureturn25):
	ld	r25,-56(r1)
L(dureturn24):
	ld	r24,-64(r1)
	blr
L(duzeroLength):
	li	rRTN,0
	blr

END (BP_SYM (memcmp))
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)