/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>
#if ISA_SHOULD_BUILD (4)


/* Use evex-masked stores for small sizes. Turned off at the
   moment. */
# define USE_EVEX_MASKED_STORE 0
/* Use movsb in page cross case to save code size. */
# define USE_MOVSB_IN_PAGE_CROSS 1

# include <sysdep.h>

# ifndef VEC_SIZE
# include "x86-evex256-vecs.h"
# endif

# ifndef STRCPY
# define STRCPY __strcpy_evex
# endif


# ifdef USE_AS_WCSCPY
# define VMOVU_MASK vmovdqu32
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4

# define REP_MOVS rep movsd

# define USE_WIDE_CHAR
# else
# define VMOVU_MASK vmovdqu8
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1

# define REP_MOVS rep movsb
# endif

# include "reg-macros.h"


# ifdef USE_AS_STPCPY
# define END_REG rax
# else
# define END_REG rdi, %rdx, CHAR_SIZE
# endif

# ifdef USE_AS_STRCAT
# define PAGE_ALIGN_REG edx
# define PAGE_ALIGN_REG_64 rdx
# else
# define PAGE_ALIGN_REG eax
# define PAGE_ALIGN_REG_64 rax
# endif

# define VZERO VMM(7)
# define VZERO_128 VMM_128(7)


# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)


	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
# ifdef USE_AS_STRCAT
	movq %rdi, %rax
# include "strcat-strlen-evex.h.S"
# endif

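	/* If an unaligned VEC-sized load at rsi would cross into the
	   next page (which may be unmapped), take the page-cross path
	   below instead.  */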
	movl %esi, %PAGE_ALIGN_REG
	andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja L(page_cross)
L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq %rdi, %rax
# endif


	/* Two short-string implementations: one with a traditional
	   branching approach and one with masked instructions (which
	   can have dramatically bad performance if dst splits a page
	   and is not in the TLB). */
# if USE_EVEX_MASKED_STORE
	VPTEST %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
# ifdef USE_AS_WCSCPY
	subl $((1 << CHAR_PER_VEC)- 1), %VRCX
# else
	inc %VRCX
# endif
	jz L(more_1x_vec)
	KMOV %VRCX, %k1
	KXOR %k0, %k1, %k1

	VMOVU_MASK %VMM(0), (%rdi){%k1}

# ifdef USE_AS_STPCPY
	bsf %VRCX, %VRCX
	leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

# else
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jz L(more_1x_vec)

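	/* rcx is nonzero here, so bsf is well defined: rdx gets the
	   char index of the null terminator.  Zeroing rdx first also
	   breaks any dependency on its previous value.  */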
	xorl %edx, %edx
	bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# endif

	/* Use the mask bits in rcx to detect which copy we need. If
	   the low half of the mask is zero then there must be a bit
	   set in the upper half, i.e. if rcx != 0 and ecx == 0 the
	   match must be in the upper 32 bits, so we use
	   L(copy_32_63). */
# if VEC_SIZE == 64
# ifdef USE_AS_WCSCPY
	testb %cl, %cl
# else
	testl %ecx, %ecx
# endif
	jz L(copy_32_63)
# endif

# ifdef USE_AS_WCSCPY
	testb $0xf, %cl
# else
	testw %cx, %cx
# endif
	jz L(copy_16_31)


# ifdef USE_AS_WCSCPY
	testb $0x3, %cl
# else
	testb %cl, %cl
# endif
	jz L(copy_8_15)


# ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)
	/* No need to copy, we know it's zero. */
	movl $0, (%END_REG)

	ret
# else

	testb $0x7, %cl
	jz L(copy_4_7)


	test %edx, %edx
	jz L(set_null_term)

	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
	 */
	vmovd %VMM_128(0), %esi
	movw %si, (%rdi)

	.p2align 4,, 1
L(set_null_term):
	/* No need to copy, we know it's zero. */
	movb $0, (%END_REG)
	ret
# endif

# if VEC_SIZE == 64
	.p2align 4,, 6
L(copy_32_63):
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
	ret
# endif


	.p2align 4,, 6
L(copy_16_31):
	/* Use xmm1 explicitly here as it won't require a `vzeroupper`
	   and will save code size. */
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 8
L(copy_8_15):
# ifdef USE_AS_WCSCPY
	movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
# else
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx
# endif
	vmovq %VMM_128(0), (%rdi)
	movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret
# endif


# ifndef USE_AS_WCSCPY
	.p2align 4,, 12
L(copy_4_7):
	movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd %VMM_128(0), (%rdi)
	movl %ecx, -(4 - CHAR_SIZE)(%END_REG)
	ret
# endif


	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU %VMM(0), (%rdi)
# endif
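	/* Align rsi down to a VEC_SIZE boundary and move rdi back by
	   the same amount, so the rdi - rsi offset is preserved and
	   matching offsets can be used for loads and stores.  */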
	subq %rsi, %rdi
	andq $-(VEC_SIZE), %rsi
	addq %rsi, %rdi
	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)

	/* Ideally we store after moves to minimize impact of potential
	   false-dependencies. */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU %VMM(0), (%rax)
# endif

	VPTESTN %VMM(1), %VMM(1), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x1)

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(1), VEC_SIZE(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)

	VPTESTN %VMM(3), %VMM(3), %k0
	KMOV %k0, %VRDX
	test %VRDX, %VRDX
	jnz L(ret_vec_x3)

	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x4)

	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)


	/* Align for 4x loop. */
	subq %rsi, %rdi

	/* + VEC_SIZE * 5 because we never added the original VEC_SIZE
	   we covered before aligning. */
	subq $-(VEC_SIZE * 5), %rsi
	andq $-(VEC_SIZE * 4), %rsi


	/* Load first half of the loop before entry. */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

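	/* A zero char in either input makes the unsigned minimum zero,
	   so two VPTESTN checks cover all four vectors.  KORTEST
	   leaves ZF set only if neither mask found a null.  */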
	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jnz L(loop_4x_done)

	.p2align 4,, 11
L(loop_4x_vec):

	VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi)

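	/* Advance the source pointer by 4 * VEC_SIZE; rdi holds
	   dst - src during the loop, so the (%rdi, %rsi) store
	   addresses above advance with it on the next iteration.  */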
	subq $(VEC_SIZE * -4), %rsi

	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)


	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	KORTEST %k2, %k4
	jz L(loop_4x_vec)

L(loop_4x_done):
	VPTESTN %VMM(0), %VMM(0), %k0
	KMOV %k0, %VRCX
	/* Restore rdi (%rdi). */
	addq %rsi, %rdi
	test %VRCX, %VRCX
	jnz L(ret_vec_x0_end)
	VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)

	KMOV %k2, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x1)
	VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)

	VPTESTN %VMM(2), %VMM(2), %k0
	KMOV %k0, %VRCX
	test %VRCX, %VRCX
	jnz L(ret_vec_x2)
	VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
	/* Place L(ret_vec_x4) here to save code size. We get a
	   meaningful benefit doing this for stpcpy. */
	KMOV %k4, %VRDX
L(ret_vec_x3):
	bsf %VRDX, %VRDX
	VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
L(return_end):
	ret

	.p2align 4,, 6
L(ret_vec_x0_end):
	bsf %VRCX, %VRCX
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rax
# endif
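	/* rcx holds the char index of the null; incrementing it makes
	   the trailing (possibly overlapping) VEC copy below cover up
	   to and including the null terminator.  */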
	inc %VRCX
	VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	bsf %VRCX, %VRCX
	VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	.p2align 4,, 4
L(ret_vec_x2):
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret

	/* ret_vec_x3 reuses return code after the loop. */
	.p2align 4,, 6
L(ret_vec_x4):
	bsf %VRCX, %VRCX
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
# endif
	ret


	.p2align 4,, 4
L(page_cross):
# ifndef USE_AS_STRCAT
	vpxorq %VZERO_128, %VZERO_128, %VZERO_128
# endif
	movq %rsi, %rcx
	andq $(VEC_SIZE * -1), %rcx

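	/* Check the whole aligned VEC containing the start of the
	   string for null chars, then shift the match mask right by
	   rsi's misalignment (in chars) so bit 0 corresponds to the
	   first char of the string.  */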
	VPCMPEQ (%rcx), %VZERO, %k0
	KMOV %k0, %VRCX
# ifdef USE_AS_WCSCPY
	andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG
	shrl $2, %PAGE_ALIGN_REG
# endif
	shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX

# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines. */

	/* This adds one to the later bsf result, which gives the
	   correct copy bounds. NB: this can never zero out a non-zero
	   RCX: to be in the page-cross case rsi cannot be aligned, and
	   rcx has already been right-shifted by the misalignment. */
	shl %VRCX
	jz L(page_cross_continue)
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq %rdi, %rax
# endif
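	/* Because of the shift above, bsf yields the char count up to
	   and including the null terminator, which rep movs consumes
	   from rcx.  rdi is left one char past the null, hence the
	   -CHAR_SIZE adjustment for stpcpy.  */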
	bsf %VRCX, %VRCX
	REP_MOVS

# ifdef USE_AS_STPCPY
	leaq -CHAR_SIZE(%rdi), %rax
# endif
	ret


# else
	/* Check if we found a zero char before the end of the page. */
	test %VRCX, %VRCX
	jz L(page_cross_continue)

	/* Traditional copy case, essentially the same as the non-page-
	   cross case, but since we can't reuse VMM(0) we need twice as
	   many loads from rsi. */

# ifndef USE_AS_STRCAT
	xorl %edx, %edx
# endif
	/* Dependency on rdi must already have been satisfied. */
	bsf %VRCX, %VRDX
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
# elif !defined USE_AS_STRCAT
	movq %rdi, %rax
# endif

# if VEC_SIZE == 64
# ifdef USE_AS_WCSCPY
	testb %cl, %cl
# else
	test %ecx, %ecx
# endif
	jz L(page_cross_copy_32_63)
# endif

# ifdef USE_AS_WCSCPY
	testb $0xf, %cl
# else
	testw %cx, %cx
# endif
	jz L(page_cross_copy_16_31)

# ifdef USE_AS_WCSCPY
	testb $0x3, %cl
# else
	testb %cl, %cl
# endif
	jz L(page_cross_copy_8_15)

# ifdef USE_AS_WCSCPY
	movl (%rsi), %esi
	movl %esi, (%rdi)
	movl $0, (%END_REG)
	ret
# else

	testb $0x7, %cl
	jz L(page_cross_copy_4_7)

	test %edx, %edx
	jz L(page_cross_set_null_term)
	movzwl (%rsi), %ecx
	movw %cx, (%rdi)
L(page_cross_set_null_term):
	movb $0, (%END_REG)
	ret


	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl (%rsi), %ecx
	movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi
	movl %ecx, (%rdi)
	movl %esi, -(4 - CHAR_SIZE)(%END_REG)
	ret
# endif

# if VEC_SIZE == 64
	.p2align 4,, 4
L(page_cross_copy_32_63):
	VMOVU (%rsi), %VMM_256(0)
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG)
	ret
# endif

	.p2align 4,, 4
L(page_cross_copy_16_31):
	vmovdqu (%rsi), %xmm0
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq (%rsi), %rcx
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	movq %rcx, (%rdi)
	movq %rsi, -(8 - CHAR_SIZE)(%END_REG)
	ret
# endif
END(STRCPY)
#endif