/* Optimized memchr implementation for POWER8.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5])  */

/* TODO: change these to the actual instructions when the minimum required
   binutils allows it.  */
#define MTVRD(v, r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define MFVRD(r, v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
#define VBPERMQ(t, a, b) .long (0x1000054c \
			| ((t)<<(32-11)) \
			| ((a)<<(32-16)) \
			| ((b)<<(32-21)) )

#ifndef MEMCHR
# define MEMCHR __memchr
#endif
/* TODO: change this to .machine power8 when the minimum required binutils
   allows it.  */
	.machine power7
ENTRY_TOCLESS (MEMCHR)
	CALL_MCOUNT 3
	dcbt	0, r3
	clrrdi	r8, r3, 3
	insrdi	r4, r4, 8, 48	/* Replicate the byte c into the low halfword.  */

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r5
	   r7 |= -(r7 < r3)  */
	add	r7, r3, r5
	subfc	r6, r3, r7
	subfe	r9, r9, r9
	extsw	r6, r9
	or	r7, r7, r6
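	/* The five instructions above form a branchless saturating add;
	   in C terms, roughly:
	     end = s + n;
	     if (end < s)        -- the addition wrapped around
	       end = (void *) -1;
	   so r7 now holds a one-past-the-end pointer that is clamped to
	   the top of the address space on overflow.  */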

	insrdi	r4, r4, 16, 32	/* Replicate the halfword into the low word.  */
	cmpldi	r5, 32
	li	r9, -1
	rlwinm	r6, r3, 3, 26, 28 /* Calculate padding.  */
	insrdi	r4, r4, 32, 0	/* Replicate the word: c now fills all 8 bytes.  */
	mr	r10, r7
	addi	r7, r7, -1
#ifdef __LITTLE_ENDIAN__
	sld	r9, r9, r6
#else
	srd	r9, r9, r6
#endif
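	/* r9 is now a mask whose bytes are zero at the positions that lie
	   before s within the first aligned doubleword; it is ANDed with
	   the cmpb result below so that matches in the padding bytes
	   before the start of the buffer are ignored.  */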
	ble	L(small_range)
	andi.	r11, r3, 63
	beq	cr0, L(align_qw)
	clrldi	r11, r3, 61
	ld	r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb	r3, r12, r4	/* Check for BYTEs in DWORD1.  */
	and	r3, r3, r9
	clrldi	r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3	/* Address of last doubleword.  */
	cmpldi	cr7, r3, 0	/* Does r3 indicate we got a hit?  */
	bne	cr7, L(done)
	addi	r8, r8, 8
	addi	r5, r5, -8
	add	r5, r5, r11

	/* Are we now aligned to a quadword boundary?  */
	andi.	r11, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD to make it QW aligned.  */
	ld	r12, 0(r8)
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r5, r5, -8
	addi	r8, r8, 8
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	vspltisb v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
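	/* lvsl with a zero address yields the vector {0, 1, ..., 15};
	   shifting each byte left by 3 gives {0, 8, ..., 120}, the bit
	   index of the first bit of every byte.  Used as the selector for
	   vbpermq, v10 therefore gathers the first bit of each byte of a
	   compare result into a 16-bit mask (one bit per byte).  */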
	MTVRD(v1, r4)		/* Move the byte pattern from r4 into v1.  */
	vspltb	v1, v1, 7	/* Splat the search byte across all of v1.  */
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary necessarily slows down
	   performance for strings of around 64 bytes in length due to the
	   extra comparisons required to check alignment for the vectorized
	   loop.  This is a tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* To enter the 64B loop the address must be 64-byte aligned, so
	   check one quadword at a time until it is.  */
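	/* Quadword check idiom used below: vcmpequb sets each byte of v6
	   to 0xff where the data matches the splatted search byte in v1;
	   the recording form vcmpequb. then compares v6 against the zero
	   vector v0 and sets CR6, so "bnl cr6" branches when at least one
	   byte of the quadword matched.  */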
	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	addi	r5, r5, -16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
	ble	L(tail64)
	sub	r6, r10, r8
	srdi	r9, r6, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align 4
L(loop):
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb v6, v1, v2
	vcmpequb v7, v1, v3
	vcmpequb v8, v1, v4
	vcmpequb v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb. v11, v0, v11
	bnl	cr6, L(found)
	addi	r8, r8, 64	/* Adjust address for the next iteration.  */
	bdnz	L(loop)
	clrldi	r5, r6, 58	/* Bytes left over after the 64B loop (r6 mod 64).  */

	/* Handle the remainder of the 64B loop, or r5 <= 64.  */
	.align 4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r8, r8, 16
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	lvx	v4, 0, r8
	vcmpequb v6, v1, v4
	vcmpequb. v11, v0, v6
	bnl	cr6, L(found_16B)
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align 4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v6, v6, v10)
	VBPERMQ(v7, v7, v10)
	VBPERMQ(v8, v8, v10)
	VBPERMQ(v9, v9, v10)
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	MFVRD(r5, v4)
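	/* After the shifts above, each quadword's 16-bit match mask
	   occupies its own halfword, so the ORs and MFVRD leave r5 holding
	   a 64-bit mask with one bit per byte of the 64-byte block.  The
	   position of the first set bit (trailing zeros on little endian,
	   leading zeros on big endian) is the byte offset of the first
	   match.  */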
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6		/* Count trailing zeros.  */
#else
	cntlzd	r6, r5		/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the matching byte.  */
	blr

	/* Found a match in the last 16 bytes.  */
	.align 4
L(found_16B):
	/* Permute the first bit of each byte into bits 48-63.  */
	VBPERMQ(v6, v6, v10)
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	MFVRD(r7, v6)
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6		/* Count trailing zeros.  */
#else
	vsldoi	v6, v6, v6, 6
	MFVRD(r7, v6)
	cntlzd	r6, r7		/* Count leading zeros before the match.  */
#endif
	add	r3, r8, r6	/* Compute the address of the matching byte.  */
	cmpld	r6, r5		/* Is the match within the remaining length?  */
	bltlr			/* Yes: return the address in r3.  */
	li	r3, 0		/* No: the match is past the end, return NULL.  */
	blr

	.align 4
	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure BYTE is *before* the end of the range.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r0, r3, -1
	andc	r0, r0, r3
	popcntd	r0, r0		/* Count trailing zeros.  */
#else
	cntlzd	r0, r3		/* Count leading zeros before the match.  */
#endif
	cmpld	r8, r7		/* Are we on the last dword?  */
	srdi	r0, r0, 3	/* Convert leading/trailing zeros to bytes.  */
	add	r3, r8, r0
	cmpld	cr7, r0, r6	/* If on the last dword, check byte offset.  */
	bnelr			/* Not the last dword: the match is in range.  */
	blelr	cr7		/* Last dword and offset <= byte count: in range.  */
	li	r3, 0		/* Otherwise the match is past the end: return NULL.  */
	blr

	.align 4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align 4
L(small_range):
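	/* The first (possibly unaligned) doubleword plus the four ldu
	   loads below cover 32 bytes at any alignment, so no loop is
	   needed for this range.  */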
	cmpldi	r5, 0
	beq	L(null)
	ld	r12, 0(r8)	/* Load doubleword from memory.  */
	cmpb	r3, r12, r4	/* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	clrldi	r6, r7, 61	/* Byte count - 1 in last dword.  */
	clrrdi	r7, r7, 3	/* Address of last doubleword.  */
	cmpld	r8, r7		/* Are we done already?  */
	bne	cr7, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)	/* Found something.  */
	beqlr			/* Hit end of string (length).  */

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	cmpld	r8, r7
	bne	cr6, L(done)
	beqlr

	ldu	r12, 8(r8)
	cmpb	r3, r12, r4
	cmpldi	cr6, r3, 0
	bne	cr6, L(done)
	blr

END (MEMCHR)
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)