]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/le/power9/strncpy.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / le / power9 / strncpy.S
1 /* Optimized strncpy implementation for POWER9 LE.
2 Copyright (C) 2020-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
/* Choose the symbol name used by ENTRY/END below: the stpncpy flavour when
   USE_AS_STPNCPY is defined, the strncpy flavour otherwise.  An already
   defined STPNCPY / STRNCPY macro overrides the default name.  */
21 #ifdef USE_AS_STPNCPY
22 # ifndef STPNCPY
23 # define FUNC_NAME __stpncpy
24 # else
25 # define FUNC_NAME STPNCPY
26 # endif
27 #else
28 # ifndef STRNCPY
29 # define FUNC_NAME strncpy
30 # else
31 # define FUNC_NAME STRNCPY
32 # endif
33 #endif /* !USE_AS_STPNCPY */
34
/* MEMSET names the routine called for long zero padding.  When this file
   picks the name itself in a SHARED build it also defines MEMSET_is_local,
   which later selects ENTRY_TOCLESS and drops the nop after `bl MEMSET'.  */
35 #ifndef MEMSET
36 /* For builds without IFUNC support, local calls should be made to internal
37 GLIBC symbol (created by libc_hidden_builtin_def). */
38 # ifdef SHARED
39 # define MEMSET_is_local
40 # define MEMSET __GI_memset
41 # else
42 # define MEMSET memset
43 # endif
44 #endif
45
/* Size of the stack frame that is created only on the memset padding path.
   r30 is stored at -8(r1) before the frame is pushed, so it ends up at
   FRAMESIZE-8(r1), i.e. in the extra 8 bytes on top of FRAME_MIN_SIZE.  */
46 #define FRAMESIZE (FRAME_MIN_SIZE+8)
47
48 /* Implements the function
49
50 char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
51
52 or
53
54 char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
55
56 if USE_AS_STPNCPY is defined.
57
58 The implementation can load bytes past a null terminator, but only
59 up to the next 16-byte aligned address, so it never crosses a page. */
60
/* Function entry.  r3 = dest, r4 = src, r5 = n.  Handles n == 0, copies the
   first byte with scalar code, and finishes right here when that byte is the
   terminator or when n == 1, before any vector work is started.  */
61 .machine power9
62 #ifdef MEMSET_is_local
63 ENTRY_TOCLESS (FUNC_NAME, 4)
64 #else
65 ENTRY (FUNC_NAME, 4)
66 #endif
67 CALL_MCOUNT 2
68
69 /* n == 0: nothing to copy, return with r3 (dest) untouched. */
70 cmpdi r5, 0
71 beqlr
72
73 lbz r0,0(r4) /* r0 = src[0] */
74 stb r0,0(r3) /* dest[0] = src[0] */
75 addi r11,r3,1 /* r11 = destination cursor (next byte to write) */
76 addi r5,r5,-1 /* r5 = bytes of n still to be written */
77 vspltisb v18,0 /* Zeroes in v18 */
78 cmpdi r0,0
79 beq L(zero_padding) /* src[0] was the terminator: only padding is left */
80
81 /* n == 1 and the byte copied was not null: the copy is already complete. */
82 cmpdi r5,0
83 #ifdef USE_AS_STPNCPY
/* NOTE(review): cmpdi is a signed compare, so bgt also falls through to the
   early return when r5 has its top bit set -- confirm such n is out of
   scope for callers.  */
84 bgt L(cont)
85 /* No terminator was stored: return dest + 1, just past the byte written. */
86 addi r3,r3,1
87 blr
88 L(cont):
89 #else
90 beqlr
91 #endif
92
/* Handle the source bytes up to the next 16-byte boundary (r9 = 0..15 of
   them) so that the later 16-byte loads start on a 16-byte boundary.
   r11 = destination cursor, r5 = bytes of n still to be written.  */
93 addi r4,r4,1 /* Step src past the byte that was already copied */
94 neg r7,r4
95 rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
96
97 /* Get source 16B aligned */
/* NOTE(review): lvx/lvsr/vperm appear to fetch the aligned 16-byte block
   holding r4 and shift it, filling with the zeroes in v18, so the bytes
   from r4 onward are the ones stxvl writes first -- exact lane layout not
   verified here.  */
98 lvx v0,0,r4
99 lvsr v1,0,r4
100 vperm v0,v18,v0,v1
101
102 vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
103 vctzlsbb r7,v6 /* Number of trailing zeroes */
104 addi r8,r7,1 /* Add null terminator */
105
106 /* r8 = bytes including null
107 r9 = bytes to get source 16B aligned
108 if r8 > r9
109 no null, copy r9 bytes
110 else
111 there is a null, copy r8 bytes and return. */
112 cmpld r8,r9
113 bgt L(no_null)
114
115 cmpld cr6,r8,r5 /* r8 <= n? */
116 ble cr6,L(null) /* Terminator fits inside the remaining n */
117
/* The terminator lies beyond n: store only the r5 remaining bytes and
   return without writing a null.  */
118 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
119 stxvl 32+v0,r11,r10 /* Partial store */
120
121 #ifdef USE_AS_STPNCPY
122 /* Return r11 + r5, the address just past the last byte stored. */
123 add r3,r11,r5
124 #endif
125 blr
126
/* The terminator was found before the alignment boundary and within n:
   store the r8 bytes up to and including it, then zero-fill what is left
   of n.  */
127 L(null):
128 sldi r10,r8,56 /* stxvl wants size in top 8 bits */
129 stxvl 32+v0,r11,r10 /* Partial store */
130
131 #ifdef USE_AS_STPNCPY
132 /* Return the address of the terminator just stored (r7 = r8 - 1). */
133 add r3,r11,r7
134 #endif
135 add r11,r11,r8 /* Destination cursor moves past the terminator */
136 sub r5,r5,r8 /* Bytes of n that still need zero padding */
137 b L(zero_padding)
138
/* No terminator before the alignment boundary.  Either n ends inside these
   r9 bytes, or they are stored and execution falls through into L(loop)
   with src advanced to the boundary.  */
139 L(no_null):
140 cmpld r9,r5 /* Check if length was reached. */
141 bge L(n_tail1) /* n ends here: L(n_tail1) stores r5 bytes of v0 and returns */
142
143 sldi r10,r9,56 /* stxvl wants size in top 8 bits */
144 stxvl 32+v0,r11,r10 /* Partial store */
145
146 add r4,r4,r9 /* Advance src by the bytes just stored */
147 add r11,r11,r9 /* Advance the destination cursor likewise */
148 sub r5,r5,r9 /* Those bytes are no longer owed to n */
149
/* Main loop: while more than 64 bytes of n remain, examine 64 source bytes
   (four 16-byte vectors) per iteration.  The first vector holding a null
   byte leaves the loop through L(prep_tailK); if none does, all four are
   stored and the cursors advance.  Nothing is stored before all four
   checks pass, so the tail code stores the preceding full vectors itself.
   r4 = source cursor, r11 = destination cursor, r5 = bytes of n left.  */
150 L(loop):
151 cmpldi cr6,r5,64 /* Check if length was reached. */
152 ble cr6,L(final_loop) /* 64 bytes or fewer left: bounded tail handling */
153
154 lxv 32+v0,0(r4)
155 vcmpequb. v6,v0,v18 /* Any zero bytes? */
156 bne cr6,L(prep_tail1)
157
158 lxv 32+v1,16(r4)
159 vcmpequb. v6,v1,v18 /* Any zero bytes? */
160 bne cr6,L(prep_tail2)
161
162 lxv 32+v2,32(r4)
163 vcmpequb. v6,v2,v18 /* Any zero bytes? */
164 bne cr6,L(prep_tail3)
165
166 lxv 32+v3,48(r4)
167 vcmpequb. v6,v3,v18 /* Any zero bytes? */
168 bne cr6,L(prep_tail4)
169
170 stxv 32+v0,0(r11) /* No terminator in these 64 bytes: copy them all */
171 stxv 32+v1,16(r11)
172 stxv 32+v2,32(r11)
173 stxv 32+v3,48(r11)
174
175 addi r4,r4,64
176 addi r11,r11,64
177 addi r5,r5,-64
178
179 b L(loop)
180
/* At most 64 bytes of n remain.  Up to four vectors are examined.  For each
   of the first three, cr5 records whether r5 <= 16 (n ends within this
   vector) and cr6 records whether the vector holds a null byte:
     r5 <= 16         -> L(prep_n_tailK) decides between null and n;
     null, r5 > 16    -> L(count_tailK) copies through the terminator;
     neither          -> r5 -= 16 and the next vector is examined.
   By the fourth vector r5 is at most 16, so no cr5 test is needed.  */
181 L(final_loop):
182 cmpldi cr5,r5,16
183 lxv 32+v0,0(r4)
184 vcmpequb. v6,v0,v18 /* Any zero bytes? */
185 ble cr5,L(prep_n_tail1)
186 bne cr6,L(count_tail1)
187 addi r5,r5,-16 /* v0 is a full vector; it is stored by the tail code */
188
189 cmpldi cr5,r5,16
190 lxv 32+v1,16(r4)
191 vcmpequb. v6,v1,v18 /* Any zero bytes? */
192 ble cr5,L(prep_n_tail2)
193 bne cr6,L(count_tail2)
194 addi r5,r5,-16
195
196 cmpldi cr5,r5,16
197 lxv 32+v2,32(r4)
198 vcmpequb. v6,v2,v18 /* Any zero bytes? */
199 ble cr5,L(prep_n_tail3)
200 bne cr6,L(count_tail3)
201 addi r5,r5,-16
202
203 lxv 32+v3,48(r4)
204 vcmpequb. v6,v3,v18 /* Any zero bytes? */
205 beq cr6,L(n_tail4) /* No null: n ends the copy */
206
207 vctzlsbb r8,v6 /* Number of trailing zeroes */
208 cmpld r8,r5 /* r8 < remaining n? */
209 blt L(tail4) /* Terminator fits; otherwise fall into L(n_tail4) */
210
/* n runs out inside the fourth vector without a terminator being copied:
   store three full vectors plus the first r5 bytes of v3, then return.
   No zero padding is needed.  */
211 L(n_tail4):
212 stxv 32+v0,0(r11)
213 stxv 32+v1,16(r11)
214 stxv 32+v2,32(r11)
215 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
216 addi r11,r11,48 /* Skip the three vectors just stored */
217 stxvl 32+v3,r11,r10 /* Partial store */
218 #ifdef USE_AS_STPNCPY
219 /* Return r11 + r5, the address just past the last byte stored. */
220 add r3,r11,r5
221 #endif
222 blr
223
/* Reached with at most 16 bytes of n left and v0 as the current vector.
   With no null in v0, or with the first null at index r8 >= r5, n ends the
   copy (L(n_tail1)); otherwise the terminator is copied via L(tail1).  */
224 L(prep_n_tail1):
225 beq cr6,L(n_tail1) /* No zero byte in v0 */
226 vctzlsbb r8,v6 /* Number of trailing zeroes */
227 cmpld r8,r5 /* r8 < remaining n? */
228 blt L(tail1)
229
/* Store the first r5 bytes of v0 and return; no terminator is written.  */
230 L(n_tail1):
231 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
232 stxvl 32+v0,r11,r10 /* Partial store */
233 #ifdef USE_AS_STPNCPY
234 /* Return r11 + r5, the address just past the last byte stored. */
235 add r3,r11,r5
236 #endif
237 blr
238
/* Reached with at most 16 bytes of n left and v1 as the current vector (v0
   is a full vector not yet stored).  With no null in v1, or with the first
   null at index r8 >= r5, n ends the copy (L(n_tail2)); otherwise the
   terminator is copied via L(tail2).  */
239 L(prep_n_tail2):
240 beq cr6,L(n_tail2) /* No zero byte in v1 */
241 vctzlsbb r8,v6 /* Number of trailing zeroes */
242 cmpld r8,r5 /* r8 < remaining n? */
243 blt L(tail2)
244
/* Store v0 in full plus the first r5 bytes of v1 and return; no terminator
   is written.  */
245 L(n_tail2):
246 stxv 32+v0,0(r11)
247 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
248 addi r11,r11,16 /* Skip the vector just stored */
249 stxvl 32+v1,r11,r10 /* Partial store */
250 #ifdef USE_AS_STPNCPY
251 /* Return r11 + r5, the address just past the last byte stored. */
252 add r3,r11,r5
253 #endif
254 blr
255
/* Reached with at most 16 bytes of n left and v2 as the current vector (v0
   and v1 are full vectors not yet stored).  With no null in v2, or with the
   first null at index r8 >= r5, n ends the copy (L(n_tail3)); otherwise the
   terminator is copied via L(tail3).  */
256 L(prep_n_tail3):
257 beq cr6,L(n_tail3) /* No zero byte in v2 */
258 vctzlsbb r8,v6 /* Number of trailing zeroes */
259 cmpld r8,r5 /* r8 < remaining n? */
260 blt L(tail3)
261
/* Store v0 and v1 in full plus the first r5 bytes of v2 and return; no
   terminator is written.  */
262 L(n_tail3):
263 stxv 32+v0,0(r11)
264 stxv 32+v1,16(r11)
265 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
266 addi r11,r11,32 /* Skip the two vectors just stored */
267 stxvl 32+v2,r11,r10 /* Partial store */
268 #ifdef USE_AS_STPNCPY
269 /* Return r11 + r5, the address just past the last byte stored. */
270 add r3,r11,r5
271 #endif
272 blr
273
/* The terminator is in v0 and fits inside n.  L(prep_tail1) (from L(loop))
   and L(count_tail1) (from L(final_loop)) still have to compute r8;
   L(tail1) is entered from L(prep_n_tail1) with r8 already set.  Stores the
   string bytes and the terminator, then zero-fills the rest of n.  */
274 L(prep_tail1):
275 L(count_tail1):
276 vctzlsbb r8,v6 /* Number of trailing zeroes */
277 L(tail1):
278 addi r9,r8,1 /* Add null terminator */
279 sldi r10,r9,56 /* stxvl wants size in top 8 bits */
280 stxvl 32+v0,r11,r10 /* Partial store */
281 #ifdef USE_AS_STPNCPY
282 /* Return r11 + r8, the address of the terminator just stored. */
283 add r3,r11,r8
284 #endif
285 add r11,r11,r9 /* Destination cursor moves past the terminator */
286 sub r5,r5,r9 /* Bytes of n that still need zero padding */
287 b L(zero_padding)
288
/* The terminator is in v1 and fits inside n; v0 is a full vector not yet
   stored.  L(prep_tail2) comes from L(loop), where r5 has not yet been
   reduced for v0.  L(count_tail2) comes from L(final_loop), where it has.
   L(tail2) is entered from L(prep_n_tail2) with r8 already set.  */
289 L(prep_tail2):
290 addi r5,r5,-16 /* Account for the full vector v0 */
291 L(count_tail2):
292 vctzlsbb r8,v6 /* Number of trailing zeroes */
293 L(tail2):
294 addi r9,r8,1 /* Add null terminator */
295 stxv 32+v0,0(r11)
296 sldi r10,r9,56 /* stxvl wants size in top 8 bits */
297 addi r11,r11,16 /* Skip the vector just stored */
298 stxvl 32+v1,r11,r10 /* Partial store */
299 #ifdef USE_AS_STPNCPY
300 /* Return r11 + r8, the address of the terminator just stored. */
301 add r3,r11,r8
302 #endif
303 add r11,r11,r9 /* Destination cursor moves past the terminator */
304 sub r5,r5,r9 /* Bytes of n that still need zero padding */
305 b L(zero_padding)
306
/* The terminator is in v2 and fits inside n; v0 and v1 are full vectors not
   yet stored.  L(prep_tail3) comes from L(loop), where r5 has not yet been
   reduced for them.  L(count_tail3) comes from L(final_loop), where it has.
   L(tail3) is entered from L(prep_n_tail3) with r8 already set.  */
307 L(prep_tail3):
308 addi r5,r5,-32 /* Account for the full vectors v0 and v1 */
309 L(count_tail3):
310 vctzlsbb r8,v6 /* Number of trailing zeroes */
311 L(tail3):
312 addi r9,r8,1 /* Add null terminator */
313 stxv 32+v0,0(r11)
314 stxv 32+v1,16(r11)
315 sldi r10,r9,56 /* stxvl wants size in top 8 bits */
316 addi r11,r11,32 /* Skip the two vectors just stored */
317 stxvl 32+v2,r11,r10 /* Partial store */
318 #ifdef USE_AS_STPNCPY
319 /* Return r11 + r8, the address of the terminator just stored. */
320 add r3,r11,r8
321 #endif
322 add r11,r11,r9 /* Destination cursor moves past the terminator */
323 sub r5,r5,r9 /* Bytes of n that still need zero padding */
324 b L(zero_padding)
325
/* The terminator is in v3 and fits inside n; v0, v1 and v2 are full vectors
   not yet stored.  L(prep_tail4) comes from L(loop), where r5 has not yet
   been reduced for them.  L(tail4) is entered from L(final_loop) with r5
   and r8 already set.  Execution continues into the zero-padding code that
   follows this block.  */
326 L(prep_tail4):
327 addi r5,r5,-48 /* Account for the full vectors v0, v1 and v2 */
328 vctzlsbb r8,v6 /* Number of trailing zeroes */
329 L(tail4):
330 addi r9,r8,1 /* Add null terminator */
331 stxv 32+v0,0(r11)
332 stxv 32+v1,16(r11)
333 stxv 32+v2,32(r11)
334 sldi r10,r9,56 /* stxvl wants size in top 8 bits */
335 addi r11,r11,48 /* Skip the three vectors just stored */
336 stxvl 32+v3,r11,r10 /* Partial store */
337 #ifdef USE_AS_STPNCPY
338 /* Return r11 + r8, the address of the terminator just stored. */
339 add r3,r11,r8
340 #endif
341 add r11,r11,r9 /* Destination cursor moves past the terminator */
342 sub r5,r5,r9 /* Bytes of n that still need zero padding */
343
344 /* This code pads the remainder of dest with NULL bytes. For large numbers
345 memset gives a better performance, 255 was chosen through experimentation.
346 On entry: r11 = first byte to clear, r5 = bytes left to write, v18 = zeroes.
   r3 already holds the value to return and is not touched here.
   stxv and stxvl take a VSX register number, so the zero vector has to be
   written as 32+v18, the same way every other vector load/store in this
   function names its register.  A bare v18 selects a different register
   that this function never initialises.  */
347 L(zero_padding):
348 cmpldi r5,255
349 bge L(zero_padding_memset) /* 255 bytes or more: let memset do it */
350
351 L(zero_padding_loop):
352 cmpldi cr6,r5,16 /* Check if length was reached. */
353 ble cr6,L(zero_padding_end) /* At most 16 bytes left: one partial store */
354
355 stxv 32+v18,0(r11) /* Store 16 zero bytes */
356 addi r11,r11,16
357 addi r5,r5,-16
358
359 b L(zero_padding_loop)
360
361 L(zero_padding_end):
362 sldi r10,r5,56 /* stxvl wants size in top 8 bits */
363 stxvl 32+v18,r11,r10 /* Store the last r5 (0..16) zero bytes */
364 blr
365
/* Zero padding of 255 bytes or more is delegated to MEMSET.  This is the
   only path that makes a call, so only here are r30 and the link register
   saved and a stack frame created.  On entry r11 = first byte to clear,
   r5 = byte count, r3 = value this function must return.  */
366 .align 4
367 L(zero_padding_memset):
368 std r30,-8(r1) /* Save r30 on the stack. */
369 cfi_offset(r30, -8)
370 mr r30,r3 /* Keep this function's return value across the call. */
371 /* Prepare the call to memset. */
372 mr r3,r11 /* Pointer to the area to be zero-filled. */
373 li r4,0 /* Byte to be written (zero). */
/* r5 already holds the number of bytes to clear.  */
374
375 /* We delayed the creation of the stack frame, as well as the saving of
376 the link register, because only at this point, we are sure that
377 doing so is actually needed. */
378
379 /* Save the link register. */
380 mflr r0
381 std r0,16(r1)
382
383 /* Create the stack frame. */
384 stdu r1,-FRAMESIZE(r1)
385 cfi_adjust_cfa_offset(FRAMESIZE)
386 cfi_offset(lr, 16)
387
388 bl MEMSET
389 #ifndef MEMSET_is_local
/* NOTE(review): presumably the slot the linker may patch with a TOC restore
   after a non-local call -- confirm against the ELFv2 ABI.  */
390 nop
391 #endif
392
393 ld r0,FRAMESIZE+16(r1) /* Reload the saved link register value. */
394
395 mr r3,r30 /* Restore the return value that was saved in r30
396 before the call, for both strncpy and stpncpy;
397 the value MEMSET left in r3 is discarded. */
398 ld r30,FRAMESIZE-8(r1) /* Restore r30. */
399 /* Restore the stack frame. */
400 addi r1,r1,FRAMESIZE
401 cfi_adjust_cfa_offset(-FRAMESIZE)
402 /* Restore the link register. */
403 mtlr r0
404 cfi_restore(lr)
405 blr
406
407 END (FUNC_NAME)
408 #ifndef USE_AS_STPNCPY
409 libc_hidden_builtin_def (strncpy)
410 #endif