]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/memcmp.S
Update copyright notices with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / x86_64 / memcmp.S
CommitLineData
/* memcmp with SSE2
   Copyright (C) 2009-2014 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int memcmp (const void *s1, const void *s2, size_t n)

   ABI: System V AMD64.
   In:    %rdi = s1, %rsi = s2, %rdx = n.
   Out:   %eax <0 / ==0 / >0 as the first n bytes of s1 compare
	  below / equal / above those of s2.
   Clobbers: %rcx, %rdx, %r8, %r10, %r11, %xmm0, %xmm1, flags.

   Register roles after the prologue:
     %rsi is rebased to (s2 - s1), so the s2 byte paired with *(%rdi)
	  is always addressed as (%rdi,%rsi) while only %rdi advances.
     %r10 holds the remaining byte count on the small/tail path; its
	  low bits select which 1/2/4/8/16-byte chunks to compare.
     %r11 holds s1 + n (one past the end) on the >= 32-byte paths.  */

	.text
ENTRY (memcmp)
	test	%rdx, %rdx
	jz	L(finz)			/* n == 0: equal.  */
	cmpq	$1, %rdx
	jle	L(finr1b)		/* n == 1: single-byte compare.  */
	subq	%rdi, %rsi		/* %rsi = s2 - s1 from here on.  */
	movq	%rdx, %r10
	cmpq	$32, %r10
	jge	L(gt32)
	/* Handle small chunks and last block of less than 32 bytes.  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
L(s16b):
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx		/* Bit i set iff byte i equal.  */
	xorl	%eax, %eax
	subl	$0xffff, %edx		/* Zero iff all 16 bytes equal.  */
	jz	L(finz)
	/* mask - 0xffff is the negated complement mask, so its lowest
	   set bit is the first mismatching byte.  */
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(finr1b):
	/* n == 1 fast path: %rsi still holds the raw s2 pointer here.  */
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

	.p2align 4,, 4
L(fin2_7):
	/* %rax/%rdx hold zero-extended 2/4/8-byte chunks from s1/s2.
	   a - b and a ^ b share the same lowest set bit, so bsf of the
	   difference locates the lowest differing bit; round it down to
	   a byte boundary and shift that byte (the first differing byte
	   in memory, since x86-64 is little-endian) into bits 0-7.  */
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx		/* Lowest differing bit.  */
	sarq	$3, %rcx		/* Round down to byte index...  */
	salq	$3, %rcx		/* ...then back to a bit shift.  */
	sarq	%cl, %rax
	movzbl	%al, %eax		/* Keep only the differing byte.  */
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

	/* For blocks bigger than 32 bytes
	   1. Advance one of the addr pointer to be 16B aligned.
	   2. Treat the case of both addr pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one addr pointer is 16B aligned, use memory version
	      of pcmpeqb.
	*/
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11		/* %r11 = s1 + n (end pointer).  */
	movq	%rdi, %r8

	andq	$15, %r8		/* s1 misalignment within 16B.  */
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi	/* Advance %rdi to a 16B boundary.  */
L(16am):
	/* Handle two 16B aligned pointers separately.  %rsi = s2 - s1,
	   so (s2 - s1) % 16 == 0 together with %rdi aligned means both
	   operands are 16B aligned and movdqa is safe.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10		/* End rounded down to 32B.  */
	cmpq	%r10, %rdi
	jge	L(mt16)
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10		/* End rounded down to 64B.  */
	cmpq	%r10, %rdi
	jge	L(mt32)

L(A64main):
	/* 64B per iteration, 4 x 16B unaligned-s2 compares.  */
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10		/* Mop up a remaining 32B chunk.  */
	cmpq	%r10, %rdi
	jge	L(mt16)

L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
	/* Tail of < 32 bytes: fall back to the small-block code.  */
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
	/* A 16B compare failed; %edx = mask - 0xffff, whose lowest set
	   bit indexes the first differing byte of the chunk.  */
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi		/* Rebuild the s2 pointer.  */
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)

	.p2align 4,, 4
L(ATR):
	/* Both pointers 16B aligned: same structure as above but with
	   aligned (movdqa) loads.  */
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
	/* 64B per iteration, 4 x 16B aligned compares.  */
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jge	L(mt16)

L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16byte to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)