]>
Commit | Line | Data |
---|---|---|
6c6ab1fc | 1 | /* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn. |
04277e02 | 2 | Copyright (C) 2017-2019 Free Software Foundation, Inc. |
6c6ab1fc RS |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | #include <sysdep.h> | |
20 | ||
21 | /* char *[r3] strrchr (char *s [r3], int c [r4]) */ | |
22 | /* TODO: change these to the actual instructions when the minimum required | |
23 | binutils allows it. */ | |
24 | #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16))) | |
25 | #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) | |
26 | #define VBPERMQ(t,a,b) .long (0x1000054c \ | |
27 | | ((t)<<(32-11)) \ | |
28 | | ((a)<<(32-16)) \ | |
29 | | ((b)<<(32-21)) ) | |
30 | #define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21))) | |
31 | #define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21))) | |
32 | #define VADDUQM(t,a,b) .long (0x10000100 \ | |
33 | | ((t)<<(32-11)) \ | |
34 | | ((a)<<(32-16)) \ | |
35 | | ((b)<<(32-21)) ) | |
36 | #ifdef __LITTLE_ENDIAN__ | |
37 | /* Find the match position from v6 and place result in r6. */ | |
38 | # define CALCULATE_MATCH() \ | |
39 | VBPERMQ(v6, v6, v10); \ | |
40 | vsldoi v6, v6, v6, 6; \ | |
41 | MFVRD(r7, v6); \ | |
42 | cntlzd r6, r7; \ | |
43 | subfic r6, r6, 15; | |
44 | /* | |
45 | * Find the first null position to mask bytes after null. | |
46 | * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw. | |
47 | * Result placed at v2. | |
48 | */ | |
49 | # define FIND_NULL_POS(reg) \ | |
50 | vspltisb v11, -1; \ | |
51 | VADDUQM(v11, reg, v11); \ | |
52 | vandc v11, v11, reg; \ | |
53 | VPOPCNTD(v2, v11); \ | |
54 | vspltb v11, v2, 15; \ | |
55 | vcmpequb. v11, v11, v9; \ | |
56 | blt cr6, 1f; \ | |
57 | vsldoi v9, v0, v9, 1; \ | |
58 | vslo v2, v2, v9; \ | |
59 | 1: \ | |
60 | vsumsws v2, v2, v0; | |
61 | #else | |
62 | # define CALCULATE_MATCH() \ | |
63 | VBPERMQ(v6, v6, v10); \ | |
64 | MFVRD(r7, v6); \ | |
65 | addi r6, r7, -1; \ | |
66 | andc r6, r6, r7; \ | |
67 | popcntd r6, r6; \ | |
68 | subfic r6, r6, 15; | |
69 | # define FIND_NULL_POS(reg) \ | |
70 | VCLZD(v2, reg); \ | |
71 | vspltb v11, v2, 7; \ | |
72 | vcmpequb. v11, v11, v9; \ | |
73 | blt cr6, 1f; \ | |
74 | vsldoi v9, v0, v9, 1; \ | |
75 | vsro v2, v2, v9; \ | |
76 | 1: \ | |
77 | vsumsws v2, v2, v0; | |
78 | #endif /* !__LITTLE_ENDIAN__ */ | |
12f50337 RS |
79 | |
80 | #ifndef STRRCHR | |
81 | # define STRRCHR strrchr | |
82 | #endif | |
6c6ab1fc | 83 | .machine power7 |
12f50337 | 84 | ENTRY_TOCLESS (STRRCHR) |
6c6ab1fc RS |
85 | CALL_MCOUNT 2 |
86 | dcbt 0,r3 | |
87 | clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ | |
88 | cmpdi cr7,r4,0 | |
89 | ld r12,0(r8) /* Load doubleword from memory. */ | |
90 | li r9,0 /* Used to store last occurence. */ | |
91 | li r0,0 /* Doubleword with null chars to use | |
92 | with cmpb. */ | |
93 | ||
94 | rlwinm r6,r3,3,26,28 /* Calculate padding. */ | |
95 | ||
96 | beq cr7,L(null_match) | |
97 | ||
98 | /* Replicate byte to doubleword. */ | |
99 | insrdi r4,r4,8,48 | |
100 | insrdi r4,r4,16,32 | |
101 | insrdi r4,r4,32,0 | |
102 | ||
103 | /* r4 is changed now. If it's passed more chars, then | |
104 | check for null again. */ | |
105 | cmpdi cr7,r4,0 | |
106 | beq cr7,L(null_match) | |
107 | /* Now r4 has a doubleword of c bytes and r0 has | |
108 | a doubleword of null bytes. */ | |
109 | ||
110 | cmpb r10,r12,r4 /* Compare each byte against c byte. */ | |
111 | cmpb r11,r12,r0 /* Compare each byte against null byte. */ | |
112 | ||
113 | /* Move the doublewords left and right to discard the bits that are | |
114 | not part of the string and bring them back as zeros. */ | |
115 | #ifdef __LITTLE_ENDIAN__ | |
116 | srd r10,r10,r6 | |
117 | srd r11,r11,r6 | |
118 | sld r10,r10,r6 | |
119 | sld r11,r11,r6 | |
120 | #else | |
121 | sld r10,r10,r6 | |
122 | sld r11,r11,r6 | |
123 | srd r10,r10,r6 | |
124 | srd r11,r11,r6 | |
125 | #endif | |
126 | or r5,r10,r11 /* OR the results to speed things up. */ | |
127 | cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes | |
128 | have been found. */ | |
129 | bne cr7,L(done) | |
130 | ||
131 | L(align): | |
132 | andi. r12, r8, 15 | |
133 | ||
134 | /* Are we now aligned to a doubleword boundary? If so, skip to | |
135 | the main loop. Otherwise, go through the alignment code. */ | |
136 | ||
137 | bne cr0, L(loop) | |
138 | ||
139 | /* Handle WORD2 of pair. */ | |
140 | ldu r12,8(r8) | |
141 | cmpb r10,r12,r4 | |
142 | cmpb r11,r12,r0 | |
143 | or r5,r10,r11 | |
144 | cmpdi cr7,r5,0 | |
145 | bne cr7,L(done) | |
146 | b L(loop) /* We branch here (rather than falling through) | |
147 | to skip the nops due to heavy alignment | |
148 | of the loop below. */ | |
149 | .p2align 5 | |
150 | L(loop): | |
151 | /* Load two doublewords, compare and merge in a | |
152 | single register for speed. This is an attempt | |
153 | to speed up the null-checking process for bigger strings. */ | |
154 | ld r12,8(r8) | |
155 | ldu r7,16(r8) | |
156 | cmpb r10,r12,r4 | |
157 | cmpb r11,r12,r0 | |
158 | cmpb r6,r7,r4 | |
159 | cmpb r7,r7,r0 | |
160 | or r12,r10,r11 | |
161 | or r5,r6,r7 | |
162 | or r5,r12,r5 | |
163 | cmpdi cr7,r5,0 | |
164 | beq cr7,L(vector) | |
165 | ||
166 | /* OK, one (or both) of the doublewords contains a c/null byte. Check | |
167 | the first doubleword and decrement the address in case the first | |
168 | doubleword really contains a c/null byte. */ | |
169 | cmpdi cr6,r12,0 | |
170 | addi r8,r8,-8 | |
171 | bne cr6,L(done) | |
172 | ||
173 | /* The c/null byte must be in the second doubleword. Adjust the | |
174 | address again and move the result of cmpb to r10 so we can calculate | |
175 | the pointer. */ | |
176 | ||
177 | mr r10,r6 | |
178 | mr r11,r7 | |
179 | addi r8,r8,8 | |
180 | ||
181 | /* r10/r11 have the output of the cmpb instructions, that is, | |
182 | 0xff in the same position as the c/null byte in the original | |
183 | doubleword from the string. Use that to calculate the pointer. */ | |
184 | ||
185 | L(done): | |
186 | /* If there are more than one 0xff in r11, find the first position of | |
187 | 0xff in r11 and fill r10 with 0 from that position. */ | |
188 | cmpdi cr7,r11,0 | |
189 | beq cr7,L(no_null) | |
190 | #ifdef __LITTLE_ENDIAN__ | |
191 | addi r3,r11,-1 | |
192 | andc r3,r3,r11 | |
193 | popcntd r0,r3 | |
194 | #else | |
195 | cntlzd r0,r11 | |
196 | #endif | |
197 | subfic r0,r0,63 | |
198 | li r6,-1 | |
199 | #ifdef __LITTLE_ENDIAN__ | |
200 | srd r0,r6,r0 | |
201 | #else | |
202 | sld r0,r6,r0 | |
203 | #endif | |
204 | and r10,r0,r10 | |
205 | L(no_null): | |
206 | #ifdef __LITTLE_ENDIAN__ | |
207 | cntlzd r0,r10 /* Count leading zeros before c matches. */ | |
208 | addi r3,r10,-1 | |
209 | andc r3,r3,r10 | |
210 | addi r10,r11,-1 | |
211 | andc r10,r10,r11 | |
212 | cmpld cr7,r3,r10 | |
213 | bgt cr7,L(no_match) | |
214 | #else | |
215 | addi r3,r10,-1 /* Count trailing zeros before c matches. */ | |
216 | andc r3,r3,r10 | |
217 | popcntd r0,r3 | |
218 | cmpld cr7,r11,r10 | |
219 | bgt cr7,L(no_match) | |
220 | #endif | |
221 | srdi r0,r0,3 /* Convert trailing zeros to bytes. */ | |
222 | subfic r0,r0,7 | |
223 | add r9,r8,r0 /* Return address of the matching c byte | |
224 | or null in case c was not found. */ | |
225 | li r0,0 | |
226 | cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */ | |
227 | beq cr7,L(align) | |
228 | ||
229 | .align 4 | |
230 | L(no_match): | |
231 | mr r3,r9 | |
232 | blr | |
233 | ||
234 | /* Check the first 32B in GPR's and move to vectorized loop. */ | |
235 | .p2align 5 | |
236 | L(vector): | |
237 | addi r3, r8, 8 | |
238 | /* Make sure 32B aligned. */ | |
239 | andi. r10, r3, 31 | |
240 | bne cr0, L(loop) | |
241 | vspltisb v0, 0 | |
242 | /* Precompute vbpermq constant. */ | |
243 | vspltisb v10, 3 | |
244 | lvsl v11, r0, r0 | |
245 | vslb v10, v11, v10 | |
246 | MTVRD(v1, r4) | |
247 | li r5, 16 | |
248 | vspltb v1, v1, 7 | |
249 | /* Compare 32 bytes in each loop. */ | |
250 | L(continue): | |
251 | lvx v4, 0, r3 | |
252 | lvx v5, r3, r5 | |
253 | vcmpequb v2, v0, v4 | |
254 | vcmpequb v3, v0, v5 | |
255 | vcmpequb v6, v1, v4 | |
256 | vcmpequb v7, v1, v5 | |
257 | vor v8, v2, v3 | |
258 | vor v9, v6, v7 | |
259 | vor v11, v8, v9 | |
260 | vcmpequb. v11, v0, v11 | |
261 | addi r3, r3, 32 | |
262 | blt cr6, L(continue) | |
263 | vcmpequb. v8, v0, v8 | |
264 | blt cr6, L(match) | |
265 | ||
266 | /* One (or both) of the quadwords contains c/null. */ | |
267 | vspltisb v8, 2 | |
268 | vspltisb v9, 5 | |
269 | /* Precompute values used for comparison. */ | |
270 | vsl v9, v8, v9 /* v9 = 0x4040404040404040. */ | |
271 | vaddubm v8, v9, v9 | |
272 | vsldoi v8, v0, v8, 1 /* v8 = 0x80. */ | |
273 | ||
274 | /* Check if null is in second qw. */ | |
275 | vcmpequb. v11, v0, v2 | |
276 | blt cr6, L(secondqw) | |
277 | ||
278 | /* Null found in first qw. */ | |
279 | addi r8, r3, -32 | |
280 | /* Calculate the null position. */ | |
281 | FIND_NULL_POS(v2) | |
282 | /* Check if null is in the first byte. */ | |
283 | vcmpequb. v11, v0, v2 | |
284 | blt cr6, L(no_match) | |
285 | vsububm v2, v8, v2 | |
286 | /* Mask unwanted bytes after null. */ | |
287 | #ifdef __LITTLE_ENDIAN__ | |
288 | vslo v6, v6, v2 | |
289 | vsro v6, v6, v2 | |
290 | #else | |
291 | vsro v6, v6, v2 | |
292 | vslo v6, v6, v2 | |
293 | #endif | |
294 | vcmpequb. v11, v0, v6 | |
295 | blt cr6, L(no_match) | |
296 | /* Found a match before null. */ | |
297 | CALCULATE_MATCH() | |
298 | add r3, r8, r6 | |
299 | blr | |
300 | ||
301 | L(secondqw): | |
302 | addi r8, r3, -16 | |
303 | FIND_NULL_POS(v3) | |
304 | vcmpequb. v11, v0, v2 | |
305 | blt cr6, L(no_match1) | |
306 | vsububm v2, v8, v2 | |
307 | /* Mask unwanted bytes after null. */ | |
308 | #ifdef __LITTLE_ENDIAN__ | |
309 | vslo v7, v7, v2 | |
310 | vsro v7, v7, v2 | |
311 | #else | |
312 | vsro v7, v7, v2 | |
313 | vslo v7, v7, v2 | |
314 | #endif | |
315 | vcmpequb. v11, v0, v7 | |
316 | blt cr6, L(no_match1) | |
317 | addi r8, r8, 16 | |
318 | vor v6, v0, v7 | |
319 | L(no_match1): | |
320 | addi r8, r8, -16 | |
321 | vcmpequb. v11, v0, v6 | |
322 | blt cr6, L(no_match) | |
323 | /* Found a match before null. */ | |
324 | CALCULATE_MATCH() | |
325 | add r3, r8, r6 | |
326 | blr | |
327 | ||
328 | L(match): | |
329 | /* One (or both) of the quadwords contains a match. */ | |
330 | mr r8, r3 | |
331 | vcmpequb. v8, v0, v7 | |
332 | blt cr6, L(firstqw) | |
333 | /* Match found in second qw. */ | |
334 | addi r8, r8, 16 | |
335 | vor v6, v0, v7 | |
336 | L(firstqw): | |
337 | addi r8, r8, -32 | |
338 | CALCULATE_MATCH() | |
339 | add r9, r8, r6 /* Compute final length. */ | |
340 | b L(continue) | |
341 | /* We are here because strrchr was called with a null byte. */ | |
342 | .align 4 | |
343 | L(null_match): | |
344 | /* r0 has a doubleword of null bytes. */ | |
345 | ||
346 | cmpb r5,r12,r0 /* Compare each byte against null bytes. */ | |
347 | ||
348 | /* Move the doublewords left and right to discard the bits that are | |
349 | not part of the string and bring them back as zeros. */ | |
350 | #ifdef __LITTLE_ENDIAN__ | |
351 | srd r5,r5,r6 | |
352 | sld r5,r5,r6 | |
353 | #else | |
354 | sld r5,r5,r6 | |
355 | srd r5,r5,r6 | |
356 | #endif | |
357 | cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes | |
358 | have been found. */ | |
359 | bne cr7,L(done_null) | |
360 | ||
361 | andi. r12, r8, 15 | |
362 | ||
363 | /* Are we now aligned to a quadword boundary? If so, skip to | |
364 | the main loop. Otherwise, go through the alignment code. */ | |
365 | ||
366 | bne cr0, L(loop_null) | |
367 | ||
368 | /* Handle WORD2 of pair. */ | |
369 | ldu r12,8(r8) | |
370 | cmpb r5,r12,r0 | |
371 | cmpdi cr7,r5,0 | |
372 | bne cr7,L(done_null) | |
373 | b L(loop_null) /* We branch here (rather than falling through) | |
374 | to skip the nops due to heavy alignment | |
375 | of the loop below. */ | |
376 | ||
377 | /* Main loop to look for the end of the string. Since it's a | |
378 | small loop (< 8 instructions), align it to 32-bytes. */ | |
379 | .p2align 5 | |
380 | L(loop_null): | |
381 | /* Load two doublewords, compare and merge in a | |
382 | single register for speed. This is an attempt | |
383 | to speed up the null-checking process for bigger strings. */ | |
384 | ld r12,8(r8) | |
385 | ldu r11,16(r8) | |
386 | cmpb r5,r12,r0 | |
387 | cmpb r10,r11,r0 | |
388 | or r6,r5,r10 | |
389 | cmpdi cr7,r6,0 | |
390 | beq cr7,L(vector1) | |
391 | ||
392 | /* OK, one (or both) of the doublewords contains a null byte. Check | |
393 | the first doubleword and decrement the address in case the first | |
394 | doubleword really contains a null byte. */ | |
395 | ||
396 | cmpdi cr6,r5,0 | |
397 | addi r8,r8,-8 | |
398 | bne cr6,L(done_null) | |
399 | ||
400 | /* The null byte must be in the second doubleword. Adjust the address | |
401 | again and move the result of cmpb to r10 so we can calculate the | |
402 | pointer. */ | |
403 | ||
404 | mr r5,r10 | |
405 | addi r8,r8,8 | |
406 | ||
407 | /* r5 has the output of the cmpb instruction, that is, it contains | |
408 | 0xff in the same position as the null byte in the original | |
409 | doubleword from the string. Use that to calculate the pointer. */ | |
410 | L(done_null): | |
411 | #ifdef __LITTLE_ENDIAN__ | |
412 | addi r0,r5,-1 | |
413 | andc r0,r0,r5 | |
414 | popcntd r0,r0 | |
415 | #else | |
416 | cntlzd r0,r5 /* Count leading zeros before the match. */ | |
417 | #endif | |
418 | srdi r0,r0,3 /* Convert trailing zeros to bytes. */ | |
419 | add r3,r8,r0 /* Return address of the matching null byte. */ | |
420 | blr | |
421 | /* Check the first 32B in GPR's and move to vectorized loop. */ | |
422 | .p2align 5 | |
423 | L(vector1): | |
424 | addi r3, r8, 8 | |
425 | /* Make sure 32B aligned. */ | |
426 | andi. r10, r3, 31 | |
427 | bne cr0, L(loop_null) | |
428 | vspltisb v0, 0 | |
429 | /* Precompute vbpermq constant. */ | |
430 | vspltisb v10, 3 | |
431 | lvsl v11, r0, r0 | |
432 | vslb v10, v11, v10 | |
433 | li r5, 16 | |
434 | /* Compare 32 bytes in each loop. */ | |
435 | L(continue1): | |
436 | lvx v4, 0, r3 | |
437 | lvx v5, r3, r5 | |
438 | vcmpequb v2, v0, v4 | |
439 | vcmpequb v3, v0, v5 | |
440 | vor v8, v2, v3 | |
441 | vcmpequb. v11, v0, v8 | |
442 | addi r3, r3, 32 | |
443 | blt cr6, L(continue1) | |
444 | addi r3, r3, -32 | |
445 | VBPERMQ(v2, v2, v10) | |
446 | VBPERMQ(v3, v3, v10) | |
447 | /* Shift each component into its correct position for merging. */ | |
448 | #ifdef __LITTLE_ENDIAN__ | |
449 | vsldoi v3, v3, v3, 2 | |
450 | #else | |
451 | vsldoi v2, v2, v2, 6 | |
452 | vsldoi v3, v3, v3, 4 | |
453 | #endif | |
454 | /* Merge the results and move to a GPR. */ | |
455 | vor v4, v3, v2 | |
456 | MFVRD(r5, v4) | |
457 | #ifdef __LITTLE_ENDIAN__ | |
458 | addi r6, r5, -1 | |
459 | andc r6, r6, r5 | |
460 | popcntd r6, r6 | |
461 | #else | |
462 | cntlzd r6, r5 /* Count leading zeros before the match. */ | |
463 | #endif | |
464 | add r3, r3, r6 /* Compute final length. */ | |
465 | blr | |
12f50337 | 466 | END_GEN_TB (STRRCHR, TB_TOCLESS) |
6c6ab1fc RS |
467 | weak_alias (strrchr, rindex) |
468 | libc_hidden_builtin_def (strrchr) |