2 Copyright (C) 2011-2021 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
/* Build configuration: the routine is assembled under the name
   __strcat_avx2, and USE_AS_STRCAT is defined ahead of the
   strcpy-avx2.S include at the end of this file.  */
25 # define STRCAT __strcat_avx2
28 # define USE_AS_STRCAT
30 /* Number of bytes in a vector register */
33 .section .text.avx,"ax",@progbits
36 # ifdef USE_AS_STRNCAT
/* Keep only the low bits of %ecx, i.e. an offset inside a
   (VEC_SIZE * 4)-byte window.  NOTE(review): %ecx is presumably the
   low half of the destination pointer; the instruction that loads it
   is not visible here -- confirm.  */
42 and $((VEC_SIZE * 4) - 1), %ecx
/* %ymm6 = all-zero bytes (a VEX-encoded xmm write also clears the upper
   half).  Every vpcmpeqb below compares against it to find a NUL.  */
43 vpxor %xmm6, %xmm6, %xmm6
/* Offsets above VEC_SIZE * 3 (unsigned compare) take the
   fourth_vector_boundary path, which reads from (%rax) instead of
   (%rdi).  */
44 cmp $(VEC_SIZE * 3), %ecx
45 ja L(fourth_vector_boundary)
/* Each byte of %ymm0 becomes 0xff where the corresponding byte at
   (%rdi) is zero.  */
46 vpcmpeqb (%rdi), %ymm6, %ymm0
/* NOTE(review): vpcmpeqb does not write EFLAGS, so this branch depends
   on a mask extraction and test that are not visible here -- confirm
   they precede it.  */
49 jnz L(exit_null_on_first_vector)
52 jmp L(align_vec_size_start)
/* Reached when the masked offset in %ecx exceeds VEC_SIZE * 3.  The
   first compare reads from (%rax) rather than (%rdi).
   NOTE(review): %rax is presumably %rdi rounded down to a VEC_SIZE
   boundary, with the match bits for bytes before the string start
   discarded afterwards; neither step is visible here -- confirm.  */
53 L(fourth_vector_boundary):
56 vpcmpeqb (%rax), %ymm6, %ymm0
/* Straight-line NUL search.  Each step compares one VEC_SIZE-byte
   vector at a fixed displacement from %rax with the zero vector in
   %ymm6 and, on a match, leaves through the exit label that belongs to
   that displacement:
     VEC_SIZE     -> exit_null_on_second_vector
     VEC_SIZE * 2 -> exit_null_on_third_vector
     VEC_SIZE * 3 -> exit_null_on_fourth_vector
     VEC_SIZE * 4 -> exit_null_on_fifth_vector
   NOTE(review): several branches below directly follow vpcmpeqb or
   vpmovmskb, neither of which writes EFLAGS; the instruction that tests
   the mask is not visible here -- confirm it precedes each jnz.  */
64 L(align_vec_size_start):
65 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
68 jnz L(exit_null_on_second_vector)
70 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
73 jnz L(exit_null_on_third_vector)
75 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
78 jnz L(exit_null_on_fourth_vector)
80 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
83 jnz L(exit_null_on_fifth_vector)
/* The vector at VEC_SIZE * 5 is compared first and then %rax moves
   forward by VEC_SIZE * 4, so that same vector now sits at
   VEC_SIZE(%rax) and the second-vector exit applies to it.  The same
   pattern starts each of the following rounds.  */
85 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
86 add $(VEC_SIZE * 4), %rax
89 jnz L(exit_null_on_second_vector)
91 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
94 jnz L(exit_null_on_third_vector)
96 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
99 jnz L(exit_null_on_fourth_vector)
101 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
/* vpmovmskb gathers the top bit of every compare-result byte into
   %edx: one bit per byte, set where a NUL byte was found.  */
102 vpmovmskb %ymm3, %edx
104 jnz L(exit_null_on_fifth_vector)
/* Next round of four vectors.  */
106 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
107 add $(VEC_SIZE * 4), %rax
108 vpmovmskb %ymm0, %edx
110 jnz L(exit_null_on_second_vector)
112 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
113 vpmovmskb %ymm1, %edx
115 jnz L(exit_null_on_third_vector)
117 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
118 vpmovmskb %ymm2, %edx
120 jnz L(exit_null_on_fourth_vector)
122 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
123 vpmovmskb %ymm3, %edx
125 jnz L(exit_null_on_fifth_vector)
/* Next round of four vectors.  */
127 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
128 add $(VEC_SIZE * 4), %rax
129 vpmovmskb %ymm0, %edx
131 jnz L(exit_null_on_second_vector)
133 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
134 vpmovmskb %ymm1, %edx
136 jnz L(exit_null_on_third_vector)
138 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
139 vpmovmskb %ymm2, %edx
141 jnz L(exit_null_on_fourth_vector)
143 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
144 vpmovmskb %ymm3, %edx
146 jnz L(exit_null_on_fifth_vector)
/* From here on the code works towards the four-vector loop: whenever
   %rax is a multiple of VEC_SIZE * 4 it jumps straight into the loop,
   otherwise it examines one more vector and checks again (four such
   alignment checks in total).  */
148 test $((VEC_SIZE * 4) - 1), %rax
149 jz L(align_four_vec_loop)
/* Unlike the rounds above, %rax advances by VEC_SIZE * 5 here, so it
   ends up pointing at the vector that was just compared.  */
151 vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
152 add $(VEC_SIZE * 5), %rax
153 vpmovmskb %ymm0, %edx
157 test $((VEC_SIZE * 4) - 1), %rax
158 jz L(align_four_vec_loop)
/* NOTE(review): for this and the two following single-vector steps the
   %rax increment and the exit branch taken on a match are not visible
   here -- confirm both are present after each compare.  */
160 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
162 vpmovmskb %ymm1, %edx
166 test $((VEC_SIZE * 4) - 1), %rax
167 jz L(align_four_vec_loop)
169 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
171 vpmovmskb %ymm2, %edx
175 test $((VEC_SIZE * 4) - 1), %rax
176 jz L(align_four_vec_loop)
178 vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
180 vpmovmskb %ymm3, %edx
/* Main loop: examines VEC_SIZE * 4 bytes per iteration.  The jz entries
   into this label are taken only when %rax is a multiple of
   VEC_SIZE * 4, which satisfies the alignment that vmovaps requires of
   its memory operand.  */
187 L(align_four_vec_loop):
/* Fold the four vectors with a byte-wise unsigned minimum: the folded
   vector contains a zero byte exactly when at least one of the four
   source vectors does, so a single compare covers the whole group.  */
188 vmovaps (%rax), %ymm4
189 vpminub VEC_SIZE(%rax), %ymm4, %ymm4
190 vmovaps (VEC_SIZE * 2)(%rax), %ymm5
191 vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
/* %rax is advanced before the result is known, so on loop exit it
   already points just past the group that contains the NUL.  */
192 add $(VEC_SIZE * 4), %rax
193 vpminub %ymm4, %ymm5, %ymm5
194 vpcmpeqb %ymm5, %ymm6, %ymm5
195 vpmovmskb %ymm5, %edx
/* NOTE(review): vpmovmskb does not write EFLAGS; the test of %edx that
   this branch depends on is not visible here -- confirm.  */
197 jz L(align_four_vec_loop)
/* A NUL lies somewhere in the last group; find which vector holds it.
   -(VEC_SIZE * 4)(%rax) is the first vector of that group.  After
   subtracting VEC_SIZE * 5 the four vectors of the group sit at
   displacements VEC_SIZE .. VEC_SIZE * 4 from %rax, i.e. the
   displacements the exit_null_on_* labels are paired with.  */
199 vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
200 sub $(VEC_SIZE * 5), %rax
201 vpmovmskb %ymm0, %edx
203 jnz L(exit_null_on_second_vector)
205 vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
206 vpmovmskb %ymm1, %edx
208 jnz L(exit_null_on_third_vector)
210 vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
211 vpmovmskb %ymm2, %edx
213 jnz L(exit_null_on_fourth_vector)
/* Last vector of the group.  No branch follows: the folded compare
   found a NUL in the group and the first three vectors were clean, so
   it has to be in this one.  NOTE(review): the code turning %rax and
   the mask in %edx into a byte offset is not visible here -- confirm
   it sits between the vpmovmskb and the add.  */
215 vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
216 vpmovmskb %ymm3, %edx
220 add $(VEC_SIZE * 4), %rax
221 jmp L(StartStrcpyPart)
/* Exit stubs, one per vector displacement used by the search code.  Of
   the instructions visible here, the last three stubs add the
   displacement of the vector that held the NUL (VEC_SIZE * 2,
   VEC_SIZE * 3, VEC_SIZE * 4) to %rax; every stub but the last jumps to
   L(StartStrcpyPart), and the last one runs on into the instructions
   that follow it.
   NOTE(review): presumably each stub leaves the length of the
   destination string in %rax; the pointer-to-offset conversion and the
   bit scan of the mask in %edx are not visible here -- confirm.  */
226 L(exit_null_on_first_vector):
229 jmp L(StartStrcpyPart)
232 L(exit_null_on_second_vector):
237 jmp L(StartStrcpyPart)
240 L(exit_null_on_third_vector):
244 add $(VEC_SIZE * 2), %rax
245 jmp L(StartStrcpyPart)
248 L(exit_null_on_fourth_vector):
252 add $(VEC_SIZE * 3), %rax
253 jmp L(StartStrcpyPart)
256 L(exit_null_on_fifth_vector):
260 add $(VEC_SIZE * 4), %rax
/* %rdi = %r9 + %rax.  NOTE(review): %r9 presumably holds the original
   destination pointer, which would make %rdi the address of the
   destination's terminating NUL, where the copy begins -- confirm.  */
264 lea (%r9, %rax), %rdi
266 mov %r9, %rax /* save result */
/* The code that continues from here is pulled in textually from
   strcpy-avx2.S.  When USE_AS_STRNCAT is defined, USE_AS_STRNCPY is
   defined before the include.  NOTE(review): the #endif closing this
   conditional is not visible here -- confirm it precedes the include.  */
268 # ifdef USE_AS_STRNCAT
271 # define USE_AS_STRNCPY
274 # include "strcpy-avx2.S"