/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* int [r3] memrchr (char *s [r3], int byte [r4], int size [r5])  */
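/* For reference, the semantics implemented here match the following plain C
   sketch.  This is an illustration only; the name memrchr_ref and its body
   are not part of this file:

     #include <stddef.h>

     // Return a pointer to the last occurrence of byte within the first
     // size bytes of s, or NULL if it does not occur.
     void *
     memrchr_ref (const void *s, int byte, size_t size)
     {
       const unsigned char *p = (const unsigned char *) s + size;
       while (size--)
         if (*--p == (unsigned char) byte)
           return (void *) p;
       return NULL;
     }

   The code below computes the same result, but scans backwards a doubleword
   (GPR path) or 16 to 64 bytes (vector path) at a time.  */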
#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
ENTRY_TOCLESS (MEMRCHR)
	add	r7, r3, r5	/* Calculate the last acceptable address.  */
	dcbt	r9, r6, 8	/* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
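	/* BYTE is copied into every byte lane of r4 (typically via successive
	   insert-doubleword steps) so that a single cmpb below can test all
	   eight bytes of a loaded doubleword against it at once.  */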
	rlwinm	r0, r0, 3, 26, 28	/* Calculate padding.  */
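	/* The rotate-and-mask computes (r0 & 7) << 3, i.e. the padding as a
	   bit count rather than a byte count; it is later used to mask off
	   bytes that fall outside the requested range.  */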
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	cmpldi	cr7, r3, 0	/* If r3 == 0, no BYTEs have been found.  */
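	/* cmpb writes 0xff into each byte of r3 where the corresponding bytes
	   of r12 and r4 are equal and 0x00 where they differ, so a non-zero
	   r3 marks the positions of BYTE within this doubleword.  */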
	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__

	/* At this point, r8 is 16B aligned.  */

	/* Precompute vbpermq constant.  */
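	/* The constant built here selects bit 0 (the most significant bit) of
	   each byte, so that vbpermq can later compress a vcmpequb result into
	   a 16-bit match mask (see the "Permute the first bit of each byte"
	   steps below).  */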
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	beq	cr0, L(preloop_64B)
	/* In order to begin the 64B loop, it needs to be 64
	   bytes aligned.  So read until it is 64B aligned.  */
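	/* r8 is 16B aligned at this point, so at most three backward steps of
	   16 bytes are needed before a 64B boundary is reached; each step is
	   checked for BYTE before it is skipped.  */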
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
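	/* The record-form compare against the all-zero vector v0 sets the CR6
	   "all equal" bit only when v6 holds no match bytes (i.e. the
	   preceding byte compare found nothing); bnl cr6 branches when that
	   bit is clear, so it is taken exactly when this quadword contains
	   BYTE.  */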
	beq	cr0, L(preloop_64B)
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)

	beq	cr0, L(preloop_64B)
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 < 64.  */
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
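	/* r11 and the other offset registers set up here serve as index
	   operands for the lvx loads that fetch the four quadwords of each
	   64-byte block in the loop below.  */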
	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
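	/* Only the branch decision uses the merged vector; the individual
	   per-quadword compare results are kept so that the match-found code
	   below can locate the exact quadword and byte of the match.  */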
	/* Handle remainder of 64B loop or r5 <= 64.  */
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)

	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)

	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)

	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	/* Found a match in 64B loop.  */
	/* Permute the first bit of each byte into bits 48-63.  */
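	/* vbpermq gathers the selected bit of every byte into bits 48-63 of
	   its result, producing one 16-bit match mask per compared quadword.  */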
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__

	/* Merge the results and move to a GPR.  */
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5		/* Count leading zeros before the match.  */

	sub	r3, r8, r6	/* Compute final address.  */
	/* Found a match in last 16 bytes.  */
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
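	/* The quadword that holds the match may extend below the starting
	   address of the buffer; any bytes before that address must not be
	   reported, which is what the masking below takes care of.  */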
	/* Now discard bytes before starting address.  */

	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__

	/* Permute the first bit of each byte into bits 48-63.  */
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r7		/* Count leading zeros before the match.  */

	sub	r3, r8, r6	/* Compute final address.  */
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   word from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the
	   range.  */
	cntlzd	r9, r3		/* Count leading zeros before the match.  */
	cmpld	r8, r0		/* Are we on the last word?  */
	srdi	r6, r9, 3	/* Convert leading zeros to bytes.  */
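	/* Every non-matching byte above the match contributes eight leading
	   zero bits to the cmpb result, so dividing the count by eight gives
	   how many bytes lie between the end of the doubleword and the
	   match.  */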
	/* Deals with size <= 32.  */
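	/* Small sizes never reach the vector code above; they are handled with
	   plain doubleword loads and cmpb checks, still working backwards from
	   the end of the range.  */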
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8	/* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	/* Are we done already?  */

#ifdef __LITTLE_ENDIAN__
END (MEMRCHR)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)