]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power8/strrchr.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power8 / strrchr.S
1 /* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
2 Copyright (C) 2017-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* char *[r3] strrchr (char *s [r3], int c [r4]) */
22 /* TODO: change these to the actual instructions when the minimum required
23 binutils allows it. */
24 #define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
25 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
26 #define VBPERMQ(t,a,b) .long (0x1000054c \
27 | ((t)<<(32-11)) \
28 | ((a)<<(32-16)) \
29 | ((b)<<(32-21)) )
30 #define VCLZD(r,v) .long (0x100007c2 | ((r)<<(32-11)) | ((v)<<(32-21)))
31 #define VPOPCNTD(r,v) .long (0x100007c3 | ((r)<<(32-11)) | ((v)<<(32-21)))
32 #define VADDUQM(t,a,b) .long (0x10000100 \
33 | ((t)<<(32-11)) \
34 | ((a)<<(32-16)) \
35 | ((b)<<(32-21)) )
36 #ifdef __LITTLE_ENDIAN__
37 /* Find the match position from v6 and place result in r6. */
38 # define CALCULATE_MATCH() \
39 VBPERMQ(v6, v6, v10); \
40 vsldoi v6, v6, v6, 6; \
41 MFVRD(r7, v6); \
42 cntlzd r6, r7; \
43 subfic r6, r6, 15;
44 /*
45 * Find the first null position to mask bytes after null.
46 * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
47 * Result placed at v2.
48 */
49 # define FIND_NULL_POS(reg) \
50 vspltisb v11, -1; \
51 VADDUQM(v11, reg, v11); \
52 vandc v11, v11, reg; \
53 VPOPCNTD(v2, v11); \
54 vspltb v11, v2, 15; \
55 vcmpequb. v11, v11, v9; \
56 blt cr6, 1f; \
57 vsldoi v9, v0, v9, 1; \
58 vslo v2, v2, v9; \
59 1: \
60 vsumsws v2, v2, v0;
61 #else
62 # define CALCULATE_MATCH() \
63 VBPERMQ(v6, v6, v10); \
64 MFVRD(r7, v6); \
65 addi r6, r7, -1; \
66 andc r6, r6, r7; \
67 popcntd r6, r6; \
68 subfic r6, r6, 15;
69 # define FIND_NULL_POS(reg) \
70 VCLZD(v2, reg); \
71 vspltb v11, v2, 7; \
72 vcmpequb. v11, v11, v9; \
73 blt cr6, 1f; \
74 vsldoi v9, v0, v9, 1; \
75 vsro v2, v2, v9; \
76 1: \
77 vsumsws v2, v2, v0;
78 #endif /* !__LITTLE_ENDIAN__ */
79
80 #ifndef STRRCHR
81 # define STRRCHR strrchr
82 #endif
83 .machine power7
84 ENTRY_TOCLESS (STRRCHR)
85 CALL_MCOUNT 2
86 dcbt 0,r3
87 clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
88 cmpdi cr7,r4,0
89 ld r12,0(r8) /* Load doubleword from memory. */
90 li r9,0 /* Used to store last occurence. */
91 li r0,0 /* Doubleword with null chars to use
92 with cmpb. */
93
94 rlwinm r6,r3,3,26,28 /* Calculate padding. */
95
96 beq cr7,L(null_match)
97
98 /* Replicate byte to doubleword. */
99 insrdi r4,r4,8,48
100 insrdi r4,r4,16,32
101 insrdi r4,r4,32,0
102
103 /* r4 is changed now. If it's passed more chars, then
104 check for null again. */
105 cmpdi cr7,r4,0
106 beq cr7,L(null_match)
107 /* Now r4 has a doubleword of c bytes and r0 has
108 a doubleword of null bytes. */
109
110 cmpb r10,r12,r4 /* Compare each byte against c byte. */
111 cmpb r11,r12,r0 /* Compare each byte against null byte. */
112
113 /* Move the doublewords left and right to discard the bits that are
114 not part of the string and bring them back as zeros. */
115 #ifdef __LITTLE_ENDIAN__
116 srd r10,r10,r6
117 srd r11,r11,r6
118 sld r10,r10,r6
119 sld r11,r11,r6
120 #else
121 sld r10,r10,r6
122 sld r11,r11,r6
123 srd r10,r10,r6
124 srd r11,r11,r6
125 #endif
126 or r5,r10,r11 /* OR the results to speed things up. */
127 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
128 have been found. */
129 bne cr7,L(done)
130
131 L(align):
132 andi. r12, r8, 15
133
134 /* Are we now aligned to a doubleword boundary? If so, skip to
135 the main loop. Otherwise, go through the alignment code. */
136
137 bne cr0, L(loop)
138
139 /* Handle WORD2 of pair. */
140 ldu r12,8(r8)
141 cmpb r10,r12,r4
142 cmpb r11,r12,r0
143 or r5,r10,r11
144 cmpdi cr7,r5,0
145 bne cr7,L(done)
146 b L(loop) /* We branch here (rather than falling through)
147 to skip the nops due to heavy alignment
148 of the loop below. */
149 .p2align 5
150 L(loop):
151 /* Load two doublewords, compare and merge in a
152 single register for speed. This is an attempt
153 to speed up the null-checking process for bigger strings. */
154 ld r12,8(r8)
155 ldu r7,16(r8)
156 cmpb r10,r12,r4
157 cmpb r11,r12,r0
158 cmpb r6,r7,r4
159 cmpb r7,r7,r0
160 or r12,r10,r11
161 or r5,r6,r7
162 or r5,r12,r5
163 cmpdi cr7,r5,0
164 beq cr7,L(vector)
165
166 /* OK, one (or both) of the doublewords contains a c/null byte. Check
167 the first doubleword and decrement the address in case the first
168 doubleword really contains a c/null byte. */
169 cmpdi cr6,r12,0
170 addi r8,r8,-8
171 bne cr6,L(done)
172
173 /* The c/null byte must be in the second doubleword. Adjust the
174 address again and move the result of cmpb to r10 so we can calculate
175 the pointer. */
176
177 mr r10,r6
178 mr r11,r7
179 addi r8,r8,8
180
181 /* r10/r11 have the output of the cmpb instructions, that is,
182 0xff in the same position as the c/null byte in the original
183 doubleword from the string. Use that to calculate the pointer. */
184
185 L(done):
186 /* If there are more than one 0xff in r11, find the first position of
187 0xff in r11 and fill r10 with 0 from that position. */
188 cmpdi cr7,r11,0
189 beq cr7,L(no_null)
190 #ifdef __LITTLE_ENDIAN__
191 addi r3,r11,-1
192 andc r3,r3,r11
193 popcntd r0,r3
194 #else
195 cntlzd r0,r11
196 #endif
197 subfic r0,r0,63
198 li r6,-1
199 #ifdef __LITTLE_ENDIAN__
200 srd r0,r6,r0
201 #else
202 sld r0,r6,r0
203 #endif
204 and r10,r0,r10
205 L(no_null):
206 #ifdef __LITTLE_ENDIAN__
207 cntlzd r0,r10 /* Count leading zeros before c matches. */
208 addi r3,r10,-1
209 andc r3,r3,r10
210 addi r10,r11,-1
211 andc r10,r10,r11
212 cmpld cr7,r3,r10
213 bgt cr7,L(no_match)
214 #else
215 addi r3,r10,-1 /* Count trailing zeros before c matches. */
216 andc r3,r3,r10
217 popcntd r0,r3
218 cmpld cr7,r11,r10
219 bgt cr7,L(no_match)
220 #endif
221 srdi r0,r0,3 /* Convert trailing zeros to bytes. */
222 subfic r0,r0,7
223 add r9,r8,r0 /* Return address of the matching c byte
224 or null in case c was not found. */
225 li r0,0
226 cmpdi cr7,r11,0 /* If r11 == 0, no null's have been found. */
227 beq cr7,L(align)
228
229 .align 4
230 L(no_match):
231 mr r3,r9
232 blr
233
234 /* Check the first 32B in GPR's and move to vectorized loop. */
235 .p2align 5
236 L(vector):
237 addi r3, r8, 8
238 /* Make sure 32B aligned. */
239 andi. r10, r3, 31
240 bne cr0, L(loop)
241 vspltisb v0, 0
242 /* Precompute vbpermq constant. */
243 vspltisb v10, 3
244 lvsl v11, r0, r0
245 vslb v10, v11, v10
246 MTVRD(v1, r4)
247 li r5, 16
248 vspltb v1, v1, 7
249 /* Compare 32 bytes in each loop. */
250 L(continue):
251 lvx v4, 0, r3
252 lvx v5, r3, r5
253 vcmpequb v2, v0, v4
254 vcmpequb v3, v0, v5
255 vcmpequb v6, v1, v4
256 vcmpequb v7, v1, v5
257 vor v8, v2, v3
258 vor v9, v6, v7
259 vor v11, v8, v9
260 vcmpequb. v11, v0, v11
261 addi r3, r3, 32
262 blt cr6, L(continue)
263 vcmpequb. v8, v0, v8
264 blt cr6, L(match)
265
266 /* One (or both) of the quadwords contains c/null. */
267 vspltisb v8, 2
268 vspltisb v9, 5
269 /* Precompute values used for comparison. */
270 vsl v9, v8, v9 /* v9 = 0x4040404040404040. */
271 vaddubm v8, v9, v9
272 vsldoi v8, v0, v8, 1 /* v8 = 0x80. */
273
274 /* Check if null is in second qw. */
275 vcmpequb. v11, v0, v2
276 blt cr6, L(secondqw)
277
278 /* Null found in first qw. */
279 addi r8, r3, -32
280 /* Calculate the null position. */
281 FIND_NULL_POS(v2)
282 /* Check if null is in the first byte. */
283 vcmpequb. v11, v0, v2
284 blt cr6, L(no_match)
285 vsububm v2, v8, v2
286 /* Mask unwanted bytes after null. */
287 #ifdef __LITTLE_ENDIAN__
288 vslo v6, v6, v2
289 vsro v6, v6, v2
290 #else
291 vsro v6, v6, v2
292 vslo v6, v6, v2
293 #endif
294 vcmpequb. v11, v0, v6
295 blt cr6, L(no_match)
296 /* Found a match before null. */
297 CALCULATE_MATCH()
298 add r3, r8, r6
299 blr
300
301 L(secondqw):
302 addi r8, r3, -16
303 FIND_NULL_POS(v3)
304 vcmpequb. v11, v0, v2
305 blt cr6, L(no_match1)
306 vsububm v2, v8, v2
307 /* Mask unwanted bytes after null. */
308 #ifdef __LITTLE_ENDIAN__
309 vslo v7, v7, v2
310 vsro v7, v7, v2
311 #else
312 vsro v7, v7, v2
313 vslo v7, v7, v2
314 #endif
315 vcmpequb. v11, v0, v7
316 blt cr6, L(no_match1)
317 addi r8, r8, 16
318 vor v6, v0, v7
319 L(no_match1):
320 addi r8, r8, -16
321 vcmpequb. v11, v0, v6
322 blt cr6, L(no_match)
323 /* Found a match before null. */
324 CALCULATE_MATCH()
325 add r3, r8, r6
326 blr
327
328 L(match):
329 /* One (or both) of the quadwords contains a match. */
330 mr r8, r3
331 vcmpequb. v8, v0, v7
332 blt cr6, L(firstqw)
333 /* Match found in second qw. */
334 addi r8, r8, 16
335 vor v6, v0, v7
336 L(firstqw):
337 addi r8, r8, -32
338 CALCULATE_MATCH()
339 add r9, r8, r6 /* Compute final length. */
340 b L(continue)
341 /* We are here because strrchr was called with a null byte. */
342 .align 4
343 L(null_match):
344 /* r0 has a doubleword of null bytes. */
345
346 cmpb r5,r12,r0 /* Compare each byte against null bytes. */
347
348 /* Move the doublewords left and right to discard the bits that are
349 not part of the string and bring them back as zeros. */
350 #ifdef __LITTLE_ENDIAN__
351 srd r5,r5,r6
352 sld r5,r5,r6
353 #else
354 sld r5,r5,r6
355 srd r5,r5,r6
356 #endif
357 cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
358 have been found. */
359 bne cr7,L(done_null)
360
361 andi. r12, r8, 15
362
363 /* Are we now aligned to a quadword boundary? If so, skip to
364 the main loop. Otherwise, go through the alignment code. */
365
366 bne cr0, L(loop_null)
367
368 /* Handle WORD2 of pair. */
369 ldu r12,8(r8)
370 cmpb r5,r12,r0
371 cmpdi cr7,r5,0
372 bne cr7,L(done_null)
373 b L(loop_null) /* We branch here (rather than falling through)
374 to skip the nops due to heavy alignment
375 of the loop below. */
376
377 /* Main loop to look for the end of the string. Since it's a
378 small loop (< 8 instructions), align it to 32-bytes. */
379 .p2align 5
380 L(loop_null):
381 /* Load two doublewords, compare and merge in a
382 single register for speed. This is an attempt
383 to speed up the null-checking process for bigger strings. */
384 ld r12,8(r8)
385 ldu r11,16(r8)
386 cmpb r5,r12,r0
387 cmpb r10,r11,r0
388 or r6,r5,r10
389 cmpdi cr7,r6,0
390 beq cr7,L(vector1)
391
392 /* OK, one (or both) of the doublewords contains a null byte. Check
393 the first doubleword and decrement the address in case the first
394 doubleword really contains a null byte. */
395
396 cmpdi cr6,r5,0
397 addi r8,r8,-8
398 bne cr6,L(done_null)
399
400 /* The null byte must be in the second doubleword. Adjust the address
401 again and move the result of cmpb to r10 so we can calculate the
402 pointer. */
403
404 mr r5,r10
405 addi r8,r8,8
406
407 /* r5 has the output of the cmpb instruction, that is, it contains
408 0xff in the same position as the null byte in the original
409 doubleword from the string. Use that to calculate the pointer. */
410 L(done_null):
411 #ifdef __LITTLE_ENDIAN__
412 addi r0,r5,-1
413 andc r0,r0,r5
414 popcntd r0,r0
415 #else
416 cntlzd r0,r5 /* Count leading zeros before the match. */
417 #endif
418 srdi r0,r0,3 /* Convert trailing zeros to bytes. */
419 add r3,r8,r0 /* Return address of the matching null byte. */
420 blr
421 /* Check the first 32B in GPR's and move to vectorized loop. */
422 .p2align 5
423 L(vector1):
424 addi r3, r8, 8
425 /* Make sure 32B aligned. */
426 andi. r10, r3, 31
427 bne cr0, L(loop_null)
428 vspltisb v0, 0
429 /* Precompute vbpermq constant. */
430 vspltisb v10, 3
431 lvsl v11, r0, r0
432 vslb v10, v11, v10
433 li r5, 16
434 /* Compare 32 bytes in each loop. */
435 L(continue1):
436 lvx v4, 0, r3
437 lvx v5, r3, r5
438 vcmpequb v2, v0, v4
439 vcmpequb v3, v0, v5
440 vor v8, v2, v3
441 vcmpequb. v11, v0, v8
442 addi r3, r3, 32
443 blt cr6, L(continue1)
444 addi r3, r3, -32
445 VBPERMQ(v2, v2, v10)
446 VBPERMQ(v3, v3, v10)
447 /* Shift each component into its correct position for merging. */
448 #ifdef __LITTLE_ENDIAN__
449 vsldoi v3, v3, v3, 2
450 #else
451 vsldoi v2, v2, v2, 6
452 vsldoi v3, v3, v3, 4
453 #endif
454 /* Merge the results and move to a GPR. */
455 vor v4, v3, v2
456 MFVRD(r5, v4)
457 #ifdef __LITTLE_ENDIAN__
458 addi r6, r5, -1
459 andc r6, r6, r5
460 popcntd r6, r6
461 #else
462 cntlzd r6, r5 /* Count leading zeros before the match. */
463 #endif
464 add r3, r3, r6 /* Compute final length. */
465 blr
466 END_GEN_TB (STRRCHR, TB_TOCLESS)
467 weak_alias (strrchr, rindex)
468 libc_hidden_builtin_def (strrchr)