]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/strlen.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / x86_64 / strlen.S
CommitLineData
30cb625a 1/* SSE2 version of strlen/wcslen.
04277e02 2 Copyright (C) 2012-2019 Free Software Foundation, Inc.
78df0fcb
AJ
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
78df0fcb
AJ
18
19#include <sysdep.h>
78df0fcb 20
30cb625a
L
/* Element-width selection.  With AS_WCSLEN defined the dword forms
   (pminud/pcmpeqd) are used and SHIFT_RETURN shifts the result in %rax
   right by 2; otherwise the byte forms (pminub/pcmpeqb) are used and
   SHIFT_RETURN expands to nothing.  */
21#ifdef AS_WCSLEN
22# define PMINU pminud
23# define PCMPEQ pcmpeqd
24# define SHIFT_RETURN shrq $2, %rax
25#else
26# define PMINU pminub
27# define PCMPEQ pcmpeqb
28# define SHIFT_RETURN
29#endif
30
37bb363f 31/* Long-lived registers in strlen (s) and strnlen (s, n) are:
78df0fcb 32
2194737e 33 %xmm3 - zero
37bb363f
OB
34 %rdi - s
35 %r10 - (s+n) & (~(64-1)) (set only when AS_STRNLEN is defined)
36 %r11 - s+n (set only when AS_STRNLEN is defined)
37*/
38
39
40.text
a152f366 41ENTRY(strlen)
37bb363f
OB
42
43/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.
   Bit i of %rdx corresponds to byte i of the block: the four 16-bit
   pmovmskb results are merged as
     %esi (bytes 0-15) | %edx << 16 (bytes 16-31)
     | %r8d << 32 (bytes 32-47) | %ecx << 48 (bytes 48-63).
   PCMPEQ compares memory against %xmm0-%xmm3 in place, so those
   registers must hold zero on entry; on exit they hold the compare
   results.  %rsi, %rcx and %r8 are overwritten as scratch.  */
44#define FIND_ZERO \
30cb625a
L
45 PCMPEQ (%rax), %xmm0; \
46 PCMPEQ 16(%rax), %xmm1; \
47 PCMPEQ 32(%rax), %xmm2; \
48 PCMPEQ 48(%rax), %xmm3; \
2194737e
L
49 pmovmskb %xmm0, %esi; \
50 pmovmskb %xmm1, %edx; \
51 pmovmskb %xmm2, %r8d; \
52 pmovmskb %xmm3, %ecx; \
37bb363f
OB
53 salq $16, %rdx; \
54 salq $16, %rcx; \
55 orq %rsi, %rdx; \
56 orq %r8, %rcx; \
57 salq $32, %rcx; \
58 orq %rcx, %rdx;
59
/* strnlen-only entry code: %rsi holds n on entry.  */
60#ifdef AS_STRNLEN
61/* Do not read anything when n==0. */
62 test %rsi, %rsi
63 jne L(n_nonzero)
623aac7f 64 xor %rax, %rax
37bb363f
OB
65 ret
66L(n_nonzero):
30cb625a
L
67# ifdef AS_WCSLEN
/* Scale n by 4, the inverse of the shrq $2 that SHIFT_RETURN applies
   to the result in this build.
   NOTE(review): shlq discards the top two bits of n, so a very large
   n is silently truncated here -- confirm this is acceptable.  */
68 shlq $2, %rsi
69# endif
623aac7f 70
37bb363f 71/* Initialize long lived registers:
   %rsi = %r11 = s+n and %r10 = (s+n) & -64.  */
623aac7f 72
37bb363f
OB
73 add %rdi, %rsi
74 mov %rsi, %r10
75 and $-64, %r10
76 mov %rsi, %r11
77#endif
b79188d7 78
2194737e
L
/* Clear %xmm0-%xmm3 so they can serve as the zero comparands that
   the PCMPEQ instructions below test memory against.  */
79 pxor %xmm0, %xmm0
80 pxor %xmm1, %xmm1
81 pxor %xmm2, %xmm2
82 pxor %xmm3, %xmm3
37bb363f
OB
/* %rax = s (the scan pointer from here on); %rcx = s & 4095, the
   offset of s within its 4096-byte page.  */
83 movq %rdi, %rax
84 movq %rdi, %rcx
85 andq $4095, %rcx
86/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
87 cmpq $4047, %rcx
88/* We cannot unify this branching as it would be ~6 cycles slower. */
/* Offsets above 4047 take the L(cross_page) path, which scans the
   64-byte aligned block containing s instead.  */
89 ja L(cross_page)
90
/* STRNLEN_PROLOG is the build-specific middle part of PROLOG.  Both
   variants round %rax down to a 64-byte boundary.  */
91#ifdef AS_STRNLEN
92/* Test if end is among first 64 bytes.
   %rsi = (s+n) - %rax, taken before %rax is rounded down.  If no bit
   above bit 5 is set in %rsi (i.e. %rsi < 64), branch to
   L(strnlen_ret).  */
93# define STRNLEN_PROLOG \
94 mov %r11, %rsi; \
95 subq %rax, %rsi; \
96 andq $-64, %rax; \
97 testq $-64, %rsi; \
98 je L(strnlen_ret)
99#else
100# define STRNLEN_PROLOG andq $-64, %rax;
101#endif
102
103/* Ignore bits in mask that come before start of string.
   Expects %rdx to hold the zero mask for the block at %rax, where
   %rax is s or s rounded down.  %rcx = s ^ %rax gives the shift
   count used by sarq (through %cl), which drops the mask bits for
   bytes before s.  If the shifted mask is empty, jump to L(lab);
   otherwise return the index of the lowest set bit (after
   SHIFT_RETURN) in %rax.  */
104#define PROLOG(lab) \
105 movq %rdi, %rcx; \
106 xorq %rax, %rcx; \
107 STRNLEN_PROLOG; \
108 sarq %cl, %rdx; \
109 test %rdx, %rdx; \
110 je L(lab); \
111 bsfq %rdx, %rax; \
30cb625a 112 SHIFT_RETURN; \
37bb363f
OB
113 ret
114
/* Scan of the first 64 bytes when s is at page offset 4047 or below.
   Both variants leave the zero mask in %rdx for PROLOG.  */
115#ifdef AS_STRNLEN
/* Round %rax down to 16 bytes and build the full 64-byte mask.  */
116 andq $-16, %rax
117 FIND_ZERO
118#else
119 /* Test first 16 bytes unaligned. */
2194737e 120 movdqu (%rax), %xmm4
30cb625a 121 PCMPEQ %xmm0, %xmm4
2194737e 122 pmovmskb %xmm4, %edx
87bd9bc4 123 test %edx, %edx
37bb363f
OB
124 je L(next48_bytes)
/* The load was from s itself, so the bit index is the result.  */
125 bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
30cb625a 126 SHIFT_RETURN
37bb363f
OB
127 ret
128
129L(next48_bytes):
130/* Same as FIND_ZERO except we do not check first 16 bytes.
   %rax is rounded down to 16 bytes; the mask bits for bytes 0-15 of
   that block are left clear because no result is merged in for
   them.  */
131 andq $-16, %rax
30cb625a
L
132 PCMPEQ 16(%rax), %xmm1
133 PCMPEQ 32(%rax), %xmm2
134 PCMPEQ 48(%rax), %xmm3
2194737e
L
135 pmovmskb %xmm1, %edx
136 pmovmskb %xmm2, %r8d
137 pmovmskb %xmm3, %ecx
37bb363f
OB
138 salq $16, %rdx
139 salq $16, %rcx
140 orq %r8, %rcx
141 salq $32, %rcx
142 orq %rcx, %rdx
143#endif
144
2194737e 145 /* When no zero byte is found xmm1-3 are zero so we do not have to
37bb363f
OB
146 zero them. */
147 PROLOG(loop)
148
/* Reached when s is at page offset above 4047.  Scan the 64-byte
   aligned block containing s; PROLOG then discards the mask bits for
   bytes before s.  L(loop_init) is used as the continuation because
   FIND_ZERO has overwritten %xmm1-%xmm3.  */
149 .p2align 4
150L(cross_page):
151 andq $-64, %rax
152 FIND_ZERO
153 PROLOG(loop_init)
154
155#ifdef AS_STRNLEN
156/* We must do this check to correctly handle strnlen (s, -1). */
/* Entered from STRNLEN_PROLOG with %rsi = (s+n) - %rax below 64.
   bts sets bit %rsi of the mask so that the end position acts like a
   terminator; the remaining steps mirror the tail of PROLOG.  If the
   shifted mask is empty, control continues at L(loop_init).  */
157L(strnlen_ret):
158 bts %rsi, %rdx
159 sarq %cl, %rdx
160 test %rdx, %rdx
161 je L(loop_init)
162 bsfq %rdx, %rax
30cb625a 163 SHIFT_RETURN
623aac7f 164 ret
37bb363f
OB
165#endif
/* Re-zero %xmm1-%xmm3 before entering the main loop.  Paths that ran
   FIND_ZERO without finding a zero mask bit they could use arrive
   here, since FIND_ZERO leaves compare results in those registers.  */
166 .p2align 4
167L(loop_init):
2194737e
L
168 pxor %xmm1, %xmm1
169 pxor %xmm2, %xmm2
170 pxor %xmm3, %xmm3
37bb363f
OB
171#ifdef AS_STRNLEN
/* strnlen main loop: advance %rax by 64 per iteration and stop at
   %r10, the 64-byte aligned block that contains the end s+n.  */
172 .p2align 4
173L(loop):
174
175 addq $64, %rax
176 cmpq %rax, %r10
177 je L(exit_end)
178
/* Fold the four 16-byte vectors with an unsigned minimum: the result
   has a zero element iff one of the inputs has.  Compare it with the
   zero in %xmm3 and extract the mask into %edx.  */
2194737e 179 movdqa (%rax), %xmm0
30cb625a
L
180 PMINU 16(%rax), %xmm0
181 PMINU 32(%rax), %xmm0
182 PMINU 48(%rax), %xmm0
183 PCMPEQ %xmm3, %xmm0
2194737e 184 pmovmskb %xmm0, %edx
37bb363f
OB
185 testl %edx, %edx
186 jne L(exit)
187 jmp L(loop)
188
/* %rax has reached the block containing the end (%rax == %r10).  */
623aac7f 189 .p2align 4
37bb363f
OB
190L(exit_end):
191 cmp %rax, %r11
192 je L(first) /* Do not read when the end equals %rax, e.g. end at a page boundary. */
2194737e 193 pxor %xmm0, %xmm0
37bb363f
OB
194 FIND_ZERO
195
/* Set the mask bit at the end position (bit %r11 mod 64) so that the
   lowest set bit is either a zero found before the end or the end
   itself, then return %rax + index - s.  */
196L(first):
197 bts %r11, %rdx
198 bsfq %rdx, %rdx
199 addq %rdx, %rax
200 subq %rdi, %rax
30cb625a 201 SHIFT_RETURN
623aac7f 202 ret
37bb363f 203
/* A zero was seen in the block at %rax.  %xmm1-%xmm3 are still zero
   (the loop only writes %xmm0), so only %xmm0 is cleared before
   FIND_ZERO locates the exact position.  */
623aac7f 204 .p2align 4
37bb363f 205L(exit):
2194737e 206 pxor %xmm0, %xmm0
37bb363f
OB
207 FIND_ZERO
208
209 bsfq %rdx, %rdx
210 addq %rdx, %rax
211 subq %rdi, %rax
30cb625a 212 SHIFT_RETURN
87bd9bc4 213 ret
37bb363f
OB
214
215#else
216
217 /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
/* Each half folds four 16-byte vectors with an unsigned minimum and
   compares the result with the zero in %xmm3.  The first half tests
   the block at %rax+64, the second the block at %rax+128.  */
218 .p2align 4
219L(loop):
220
2194737e 221 movdqa 64(%rax), %xmm0
30cb625a
L
222 PMINU 80(%rax), %xmm0
223 PMINU 96(%rax), %xmm0
224 PMINU 112(%rax), %xmm0
225 PCMPEQ %xmm3, %xmm0
2194737e 226 pmovmskb %xmm0, %edx
37bb363f
OB
227 testl %edx, %edx
228 jne L(exit64)
229
/* Advance %rax by 128.  Presumably written as subtracting -128
   because -128 fits an 8-bit immediate while +128 does not.  */
230 subq $-128, %rax
231
2194737e 232 movdqa (%rax), %xmm0
30cb625a
L
233 PMINU 16(%rax), %xmm0
234 PMINU 32(%rax), %xmm0
235 PMINU 48(%rax), %xmm0
236 PCMPEQ %xmm3, %xmm0
2194737e 237 pmovmskb %xmm0, %edx
37bb363f
OB
238 testl %edx, %edx
239 jne L(exit0)
240 jmp L(loop)
241
/* L(exit64): the zero is in the block at %rax+64, so point %rax at
   it.  L(exit0): the zero is in the block at %rax.  Only %xmm0 needs
   clearing before FIND_ZERO because the loop never writes
   %xmm1-%xmm3.  The result is %rax + index - s.  */
b79188d7 242 .p2align 4
37bb363f
OB
243L(exit64):
244 addq $64, %rax
245L(exit0):
2194737e 246 pxor %xmm0, %xmm0
37bb363f
OB
247 FIND_ZERO
248
249 bsfq %rdx, %rdx
250 addq %rdx, %rax
251 subq %rdi, %rax
30cb625a 252 SHIFT_RETURN
78df0fcb 253 ret
37bb363f
OB
254
255#endif
256
a152f366 257END(strlen)
85dd1003 258libc_hidden_builtin_def (strlen)