/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions.
   Copyright (C) 2022-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0
# include "x86-evex256-vecs.h"

# define STRNCPY __strncpy_evex
# ifdef USE_AS_WCSCPY
#  define VMOVU_MASK vmovdqu32
#  define VPCMPEQ vpcmpeqd
#  define VPMIN vpminud
#  define VPTESTN vptestnmd
#  define VPTEST vptestmd

#  define REP_MOVS rep movsd
#  define REP_STOS rep stosl

#  define USE_WIDE_CHAR
# else
#  define VMOVU_MASK vmovdqu8
#  define VPCMPEQ vpcmpeqb
#  define VPMIN vpminub
#  define VPTESTN vptestnmb
#  define VPTEST vptestmb

#  define REP_MOVS rep movsb
#  define REP_STOS rep stosb
# endif
# include "strncpy-or-cat-overflow-def.h"

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
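/* For example: byte chars (CHAR_SIZE == 1) with VEC_SIZE == 32 give
   CHAR_PER_VEC == 32, while wide chars (CHAR_SIZE == 4) with
   VEC_SIZE == 64 give CHAR_PER_VEC == 16.  */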
# include "reg-macros.h"

# define VZERO VMM(7)
# define VZERO_256 VMM_256(7)
# define VZERO_128 VMM_128(7)

# if VEC_SIZE == 64
#  define VZERO_HALF VZERO_256
# else
#  define VZERO_HALF VZERO_128
# endif
	.section SECTION(.text), "ax", @progbits

	/* Clear the upper 32 bits.  */
	/* Filter zero length strings and very long strings.  Zero
	   length strings just return, very long strings are handled by
	   just running rep stos{b|l} to zero-fill the destination
	   (which will almost certainly segfault); if that somehow
	   succeeds then we finish by calling OVERFLOW_STRCPY (strcpy,
	   stpcpy, wcscpy, wcpcpy).  */

	/* 56 is the end of the max supported address space.  */
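
	/* Illustrative C-level sketch of this filtering (not a literal
	   translation; the precise cutoff is implemented by the checks
	   around here, e.g. the shift-by-56 noted above):

		if (len == 0)
		  return dst;
		if (len >> 56)
		  goto best_effort_strncpy;
	 */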
	/* If the flag needs to become `jb`, replace `dec` with `sub`.  */
	vpxorq %VZERO_128, %VZERO_128, %VZERO_128

	andl $(PAGE_SIZE - 1), %eax
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
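
	/* The usual page-cross filter: an unaligned VEC_SIZE load from
	   src stays within one page exactly when
	   (src % PAGE_SIZE) <= PAGE_SIZE - VEC_SIZE.  Sketch:

		if (((uintptr_t) src & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
		  goto page_cross;
	 */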
L(page_cross_continue):
	VMOVU (%rsi), %VMM(0)
	VPTESTN %VMM(0), %VMM(0), %k0

	/* If not STPCPY, just save the return value ahead of time.  */
# ifndef USE_AS_STPCPY
	movq %rdi, %rax
# endif

	cmpq $(CHAR_PER_VEC), %rdx
	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
	   <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
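
	/* Sketch of the masked-store path in intrinsics-style pseudo-C
	   (illustrative only; byte chars assumed, zmask is the
	   null-match bitmask from VPTESTN and v0 the VEC already loaded
	   from src):

		mask_t copy  = zmask - 1;             // lanes before the first null
		VEC tmp      = maskz_mov (copy, v0);  // src chars, then zeros
		mask_t store = bzhi (-1, len);        // exactly len lanes
		mask_store (dst, store, tmp);         // one store, never past len
	 */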
# if USE_EVEX_MASKED_STORE
	/* `jae` because length rdx is now length - 1.  */

	/* If there were multiple zero-CHAR matches in the first VEC,
	   VRCX will be overset but that's fine since any oversets were
	   at zero positions anyway.  */
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rax, CHAR_SIZE), %rax

	/* Zero out all non-zero CHARs after the first zero match.  */

	/* Use VZERO as the destination so this can be reused for
	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
	   will have zeroed out VZERO).  */
	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}

	/* Get the mask for what we need to set.  */
	bzhi %VRDX, %VRCX, %VRCX

	VMOVU_MASK %VZERO, (%rdi){%k1}
	jne L(best_effort_strncpy)

	/* `jb` because length rdx is now length - 1.  */

	/* This may overset but that's fine because we still need to
	   zero fill.  */
	VMOVU %VMM(0), (%rdi)
	/* Length must be >= CHAR_PER_VEC so a match here means we must
	   zero fill.  */

	/* We are going to align rsi here so we will need to be able to
	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx

	andq $-(VEC_SIZE), %rsi
# ifdef USE_AS_WCSCPY

	VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0

	/* -1 because of the `dec %rdx` earlier.  */
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	/* This will need to be computed no matter what.  We do it
	   ahead of time for CHAR_PER_VEC == 64 because we can't adjust
	   the value of `tzcnt` with a shift.  */
# if CHAR_PER_VEC == 64

	cmpl $(CHAR_PER_VEC), %edx
	/* Separate logic for CHAR_PER_VEC == 64 because we already did
	   the `tzcnt` above.  */
# if CHAR_PER_VEC == 64
	/* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`.  */
	cmpb $CHAR_PER_VEC, %cl
	jnz L(ret_vec_x1_no_bsf)
	VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)

# if CHAR_PER_VEC < 64
	/* This essentially adds CHAR_PER_VEC to the computed result.  */
	shlq $CHAR_PER_VEC, %rcx

	addl $CHAR_PER_VEC, %ecx
	/* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has
	   already been done.  */
# if CHAR_PER_VEC < 64

	jbe L(ret_vec_x1_len_no_zfill)
	/* The expected fall-through case is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x1_len_no_zfill_mov):
# ifdef USE_AS_STPCPY

L(ret_vec_x1_len_no_zfill):
	VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax

	leal (VEC_SIZE)(%rdx), %eax
L(ret_vec_x1_no_bsf):
	VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)

	cmpl $CHAR_PER_VEC, %edx
	jb L(ret_vec_x1_len_no_zfill_mov)
	/* The expected fall-through case is copy len < buffer len.  */
	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax
	/* Separate logic for CHAR_PER_VEC == 64 because we can do `andl
	   $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just
	   using `movzbl`.  */
# if CHAR_PER_VEC == 64

	andl $(CHAR_PER_VEC * 4 - 1), %edx

	VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN %VMM(1), %VMM(1), %k0

	subq $-(VEC_SIZE * 4), %rsi
	subq $-(VEC_SIZE * 4), %rdi
	cmpl $(CHAR_PER_VEC * 2 - 1), %edx

	VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi)
	/* Must fill at least 2x VEC.  */

	VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0

	/* Must fill at least 1x VEC.  */

	VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3)
	VPTESTN %VMM(3), %VMM(3), %k0
	/* Check if len is more than 4x VEC.  -1 because rdx is len - 1.  */
	cmpq $(CHAR_PER_VEC * 4 - 1), %rdx

	subl $(CHAR_PER_VEC * 3), %edx

	VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)

	jbe L(ret_vec_x4_len_no_zfill)
	/* The expected fall-through case is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x4_len_no_zfill):
	VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax

	leal (VEC_SIZE * 4 + 0)(%rdx), %eax
	addl $(CHAR_PER_VEC * 1), %edx

	jbe L(ret_vec_x3_len_no_zfill)
	/* The expected fall-through case is copy len < buffer len.  */
	VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
L(ret_vec_x3_len_no_zfill_mov):
# ifdef USE_AS_STPCPY

L(ret_vec_x3_len_no_zfill):
	VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax

	leal (VEC_SIZE * 3 + 0)(%rdx), %eax
	VMOVU %VZERO, (VEC_SIZE * 4 + (-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE)

	jl L(ret_vec_x3_len_no_zfill_mov)
	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax

	VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi)
	VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi)
	VPTESTN %VMM(4), %VMM(4), %k0

	/* Recheck length before aligning.  */
	cmpq $(CHAR_PER_VEC * 8 - 1), %rdx

	/* Align rsi to VEC_SIZE * 4; we need to readjust rdx / rdi.  */
# ifdef USE_AS_WCSCPY
	leaq (%rsi, %rdx, CHAR_SIZE), %rdx

	subq $-(VEC_SIZE * 5), %rsi
	andq $(VEC_SIZE * -4), %rsi
	/* Load first half of the loop before entry.  */
	VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
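
	/* An unsigned min is zero in a lane iff either source lane is
	   zero, so two VPMINs plus two VPTESTNs scan all four vectors
	   for a null terminator.  Roughly:

		m01 = min (v0, v1);
		m23 = min (v2, v3);
		if (any_zero_lane (m01) || any_zero_lane (m23))
		  goto loop_end;  // a null lies somewhere in v0..v3
	 */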
	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq $-(VEC_SIZE), %rsi

	/* Store loop end in r9.  */
	leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9

	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq $(VEC_SIZE * -4), %rsi

	jbe L(loop_last_4x_vec)
	VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN %VMM(0), %VMM(1), %VMM(4)
	VPMIN %VMM(2), %VMM(3), %VMM(6)
	VPTESTN %VMM(4), %VMM(4), %k2
	VPTESTN %VMM(6), %VMM(6), %k4
	/* Restore rdx (length).  */
# ifdef USE_AS_WCSCPY

	VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	/* Restore rdi (dst).  */

	VPTESTN %VMM(0), %VMM(0), %k0

	VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi)
	VPTESTN %VMM(2), %VMM(2), %k0

	VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi)
	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx

	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -1), %rdx

	/* VRCX must be non-zero.  */

	/* Adjust length / dst for zfill.  */
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi

# ifdef USE_AS_STPCPY

L(zfill_from_page_cross):
	/* From here on out it's just memset (rdi, 0, rdx).  */
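
	/* This is a branch-lean memset: unaligned VEC stores cover the
	   head and the exact tail (they may overlap), and only larger
	   fills reach the aligned 4x-VEC loop further down.  Sketch
	   (illustrative; n = chars left to zero):

		store_vec (dst, zero);                             // head
		store_vec (dst + n * CHAR_SIZE - VEC_SIZE, zero);  // exact tail
		// middle, if any, via the aligned loop
	 */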
	cmpq $CHAR_PER_VEC, %rdx

L(zfill_more_1x_vec):
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2 - 1), %rdx
	ja L(zfill_more_2x_vec)
	/* Coming from vec1/vec2 we must be able to zfill at least
	   2x VEC.  */

	subq $(VEC_SIZE * -2), %rdi
	addq $(CHAR_PER_VEC * -2), %rdx

	/* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here.  */
	leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi

# ifdef USE_AS_STPCPY

	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
	cmpq $(CHAR_PER_VEC * 2), %rdx
L(zfill_more_2x_vec):
	VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU %VZERO, (VEC_SIZE)(%rdi)
	subq $(CHAR_PER_VEC * 4 - 1), %rdx

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rdx

	VMOVU %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVU %VZERO, (VEC_SIZE * 3)(%rdi)

	VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
	VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)

	subq $-(VEC_SIZE * 4), %rdi

	/* Align rdi for the zfill loop.  */
	andq $-(VEC_SIZE), %rdi
L(zfill_loop_4x_vec):
	VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
	subq $-(VEC_SIZE * 4), %rdi

	ja L(zfill_loop_4x_vec)
	/* Less than 1x VEC case if we are not using evex masked
	   stores.  */
# if !USE_EVEX_MASKED_STORE

	/* Special case for copying 1x VEC.  It can be handled quickly
	   and many buffer sizes have convenient alignment.  */
	VMOVU %VMM(0), (%rdi)
	/* If no zeros then we are done.  */

	/* Need to zfill.  Note that we know length <= CHAR_PER_VEC so
	   we only handle the small case here.  */
L(zfill_less_vec_no_bsf):
	/* Adjust length / dst then just zfill less_vec.  */
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi

# ifdef USE_AS_STPCPY

	cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx
	jb L(zfill_less_half)

	VMOVU %VZERO_HALF, (%rdi)
	VMOVU %VZERO_HALF, -((VEC_SIZE / 2) - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
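
	/* Classic overlapping-store trick: for HALF_VEC <= n <= VEC
	   chars, one half-VEC store at the start plus one ending
	   exactly at the end cover every byte without branching on the
	   exact size.  Sketch (illustrative; n = chars to zero):

		store_half_vec (dst, zero);
		store_half_vec (dst + n * CHAR_SIZE - HALF_VEC, zero);
	 */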
# ifdef USE_AS_STPCPY
	leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax

	/* Overfill to avoid branches.  */
	VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU %VMM_256(0), (%rdi)
	VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
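
	/* Same overlapping trick for the copy itself: the first 32-byte
	   chunk is already in VMM_256(0); a second chunk ending exactly
	   at the last char covers the rest, so any size in this bucket
	   is two loads and two stores.  Sketch (byte chars, n = chars
	   written):

		memcpy (dst, src, 32);
		memcpy (dst + n - 32, src + n - 32, 32);
	 */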
	/* We are taking advantage of the fact that to be here we must
	   be writing the null-term at (%rdi, %rcx), so we have a byte
	   of leeway for overwriting.  */
	ja L(zfill_less_vec_no_bsf)

# ifndef USE_AS_STPCPY

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
	/* Overfill to avoid branches.  */
	vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU %VMM_128(0), (%rdi)
	vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
	/* Separate logic depending on VEC_SIZE.  If VEC_SIZE == 64 then
	   we have a larger copy block for 32-63 so this just falls
	   through to zfill 16-31.  If VEC_SIZE == 32 then we check for
	   a full zfill of less than 1x VEC.  */

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi

# ifdef USE_AS_STPCPY

	cmpl $(16 / CHAR_SIZE), %edx

	VMOVU %VZERO_128, (%rdi)
	VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY

# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
	/* VEC_SIZE == 32 begins.  */

	ja L(zfill_less_vec_no_bsf)
# ifndef USE_AS_STPCPY

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
	/* Overfill to avoid branches.  */
	movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
	vmovq %VMM_128(0), (%rdi)
	movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)

# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rcx, CHAR_SIZE), %rdi

# ifdef USE_AS_STPCPY

	cmpl $(8 / CHAR_SIZE), %edx

	movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
# ifndef USE_AS_STPCPY
	/* We will need the `tzcnt` result for all other copy sizes.  */

	cmpl $(32 / CHAR_SIZE), %edx

	cmpl $(16 / CHAR_SIZE), %edx

	cmpl $(8 / CHAR_SIZE), %edx

# ifdef USE_AS_WCSCPY
	jz L(zfill_less_8_set_ret)

	movl (%rsi, %rdx, CHAR_SIZE), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, (%rdi, %rdx, CHAR_SIZE)
# ifdef USE_AS_STPCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax

L(zfill_less_8_set_ret):
# ifdef USE_AS_STPCPY

	movl %ecx, (%rdi, %rdx, CHAR_SIZE)
	/* Overfill to avoid branches.  */
	movl -3(%rsi, %rdx), %esi
	vmovd %VMM_128(0), (%rdi)
	movl %esi, -3(%rdi, %rdx)

# ifdef USE_AS_STPCPY

	movl %ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY

# ifdef USE_AS_STPCPY

	movb %cl, (%rdi, %rdx)
	vmovd %VMM_128(0), %r8d

	movzbl (%rsi, %rdx), %r8d
# ifdef USE_AS_STPCPY

	movb %r8b, (%rdi, %rdx)

# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	vmovd %VMM_128(0), (%rdi)

	movb %r8b, (%rdi, %rdx)

# ifndef USE_AS_WCSCPY

# ifdef USE_AS_STPCPY
	leaq (%rdi, %rcx), %rax

	movw $0, -1(%rdi, %rdx)
	jne L(best_effort_strncpy)

	andq $(VEC_SIZE * -1), %rax
	VPCMPEQ (%rax), %VZERO, %k0

# ifdef USE_AS_WCSCPY

	andl $(CHAR_PER_VEC - 1), %r8d
	shrx %VR8, %VRCX, %VRCX

	shrx %VRSI, %VRCX, %VRCX
	/* Compute the number of bytes we checked.  */
	andl $(VEC_SIZE - 1), %eax
# ifdef USE_AS_WCSCPY

	/* If rax > rdx then we are finishing the copy at the end of
	   the page.  */
	jb L(page_cross_small)
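
	/* Page-cross reads in a nutshell (illustrative, byte chars):
	   load the aligned VEC containing src so the read cannot touch
	   the next page, then shift the null bitmask right by src's
	   misalignment so bit 0 corresponds to src[0] again:

		v     = load_aligned ((uintptr_t) src & -VEC_SIZE);
		zmask = zero_lane_mask (v) >> ((uintptr_t) src % VEC_SIZE);
	 */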
	/* If rcx is non-zero then continue.  */
	jz L(page_cross_continue)

	/* We found a zero-CHAR so we need to copy then zfill (we know
	   we didn't cover all of the length here).  */

# ifdef USE_AS_STPCPY
	leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax

# ifdef USE_AS_WCSCPY

	jmp L(zfill_from_page_cross)
	jbe L(page_cross_copy_only)

	/* Do a zfill of the tail before copying.  */

	leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi

L(page_cross_copy_only):
# ifdef USE_AS_STPCPY
# ifdef USE_AS_WCSCPY
	leaq (%rdi, %rdx, CHAR_SIZE), %rax
L(best_effort_strncpy):
	/* The length is >= 2^63.  We very much expect to segfault at
	   rep stos.  If that doesn't happen then just strcpy to
	   finish.  */
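
	/* Sketch of this best-effort path (illustrative):

		memset (dst, 0, len * CHAR_SIZE);   // rep stos; expected to fault
		return OVERFLOW_STRCPY (dst, src);  // only if it somehow survived
	 */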