/* memcmp with SSE2
   Copyright (C) 2009-2014 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
20 | #include <sysdep.h> | |
21 | ||
22 | .text | |
23 | ENTRY (memcmp) | |
24 | test %rdx, %rdx | |
25 | jz L(finz) | |
26 | cmpq $1, %rdx | |
27 | jle L(finr1b) | |
24a12a5a | 28 | subq %rdi, %rsi |
e26c9b84 L |
29 | movq %rdx, %r10 |
30 | cmpq $32, %r10 | |
31 | jge L(gt32) | |
32 | /* Handle small chunks and last block of less than 32 bytes. */ | |
33 | L(small): | |
34 | testq $1, %r10 | |
35 | jz L(s2b) | |
36 | movzbl (%rdi), %eax | |
37 | movzbl (%rdi, %rsi), %edx | |
38 | subq $1, %r10 | |
24a12a5a | 39 | je L(finz1) |
e26c9b84 L |
40 | addq $1, %rdi |
41 | subl %edx, %eax | |
42 | jnz L(exit) | |
43 | L(s2b): | |
44 | testq $2, %r10 | |
45 | jz L(s4b) | |
46 | movzwl (%rdi), %eax | |
47 | movzwl (%rdi, %rsi), %edx | |
48 | subq $2, %r10 | |
24a12a5a | 49 | je L(fin2_7) |
e26c9b84 L |
50 | addq $2, %rdi |
51 | cmpl %edx, %eax | |
52 | jnz L(fin2_7) | |
53 | L(s4b): | |
54 | testq $4, %r10 | |
55 | jz L(s8b) | |
56 | movl (%rdi), %eax | |
57 | movl (%rdi, %rsi), %edx | |
58 | subq $4, %r10 | |
24a12a5a | 59 | je L(fin2_7) |
e26c9b84 L |
60 | addq $4, %rdi |
61 | cmpl %edx, %eax | |
62 | jnz L(fin2_7) | |
63 | L(s8b): | |
64 | testq $8, %r10 | |
65 | jz L(s16b) | |
66 | movq (%rdi), %rax | |
67 | movq (%rdi, %rsi), %rdx | |
68 | subq $8, %r10 | |
24a12a5a | 69 | je L(fin2_7) |
e26c9b84 L |
70 | addq $8, %rdi |
71 | cmpq %rdx, %rax | |
72 | jnz L(fin2_7) | |
73 | L(s16b): | |
74 | movdqu (%rdi), %xmm1 | |
75 | movdqu (%rdi, %rsi), %xmm0 | |
76 | pcmpeqb %xmm0, %xmm1 | |
77 | pmovmskb %xmm1, %edx | |
24a12a5a | 78 | xorl %eax, %eax |
e26c9b84 L |
79 | subl $0xffff, %edx |
80 | jz L(finz) | |
24a12a5a UD |
81 | bsfl %edx, %ecx |
82 | leaq (%rdi, %rcx), %rcx | |
e26c9b84 L |
83 | movzbl (%rcx), %eax |
84 | movzbl (%rsi, %rcx), %edx | |
85 | jmp L(finz1) | |
86 | ||
87 | .p2align 4,, 4 | |
88 | L(finr1b): | |
89 | movzbl (%rdi), %eax | |
24a12a5a | 90 | movzbl (%rsi), %edx |
e26c9b84 L |
91 | L(finz1): |
92 | subl %edx, %eax | |
93 | L(exit): | |
94 | ret | |
95 | ||
96 | .p2align 4,, 4 | |
97 | L(fin2_7): | |
98 | cmpq %rdx, %rax | |
99 | jz L(finz) | |
24a12a5a UD |
100 | movq %rax, %r11 |
101 | subq %rdx, %r11 | |
e26c9b84 | 102 | bsfq %r11, %rcx |
24a12a5a | 103 | sarq $3, %rcx |
e26c9b84 | 104 | salq $3, %rcx |
24a12a5a | 105 | sarq %cl, %rax |
e26c9b84 | 106 | movzbl %al, %eax |
24a12a5a | 107 | sarq %cl, %rdx |
e26c9b84 L |
108 | movzbl %dl, %edx |
109 | subl %edx, %eax | |
24a12a5a | 110 | ret |
e26c9b84 L |
111 | |
112 | .p2align 4,, 4 | |
113 | L(finz): | |
114 | xorl %eax, %eax | |
115 | ret | |
116 | ||
24a12a5a | 117 | /* For blocks bigger than 32 bytes |
e26c9b84 L |
118 | 1. Advance one of the addr pointer to be 16B aligned. |
119 | 2. Treat the case of both addr pointers aligned to 16B | |
120 | separately to avoid movdqu. | |
121 | 3. Handle any blocks of greater than 64 consecutive bytes with | |
122 | unrolling to reduce branches. | |
123 | 4. At least one addr pointer is 16B aligned, use memory version | |
124 | of pcmbeqb. | |
125 | */ | |
126 | .p2align 4,, 4 | |
127 | L(gt32): | |
128 | movq %rdx, %r11 | |
129 | addq %rdi, %r11 | |
24a12a5a | 130 | movq %rdi, %r8 |
e26c9b84 L |
131 | |
132 | andq $15, %r8 | |
24a12a5a | 133 | jz L(16am) |
e26c9b84 L |
134 | /* Both pointers may be misaligned. */ |
135 | movdqu (%rdi), %xmm1 | |
136 | movdqu (%rdi, %rsi), %xmm0 | |
137 | pcmpeqb %xmm0, %xmm1 | |
138 | pmovmskb %xmm1, %edx | |
139 | subl $0xffff, %edx | |
140 | jnz L(neq) | |
141 | neg %r8 | |
142 | leaq 16(%rdi, %r8), %rdi | |
143 | L(16am): | |
144 | /* Handle two 16B aligned pointers separately. */ | |
145 | testq $15, %rsi | |
146 | jz L(ATR) | |
147 | testq $16, %rdi | |
148 | jz L(A32) | |
149 | movdqu (%rdi, %rsi), %xmm0 | |
150 | pcmpeqb (%rdi), %xmm0 | |
151 | pmovmskb %xmm0, %edx | |
152 | subl $0xffff, %edx | |
153 | jnz L(neq) | |
154 | addq $16, %rdi | |
155 | L(A32): | |
156 | movq %r11, %r10 | |
157 | andq $-32, %r10 | |
24a12a5a UD |
158 | cmpq %r10, %rdi |
159 | jge L(mt16) | |
e26c9b84 L |
160 | /* Pre-unroll to be ready for unrolled 64B loop. */ |
161 | testq $32, %rdi | |
162 | jz L(A64) | |
163 | movdqu (%rdi,%rsi), %xmm0 | |
164 | pcmpeqb (%rdi), %xmm0 | |
165 | pmovmskb %xmm0, %edx | |
166 | subl $0xffff, %edx | |
167 | jnz L(neq) | |
168 | addq $16, %rdi | |
24a12a5a | 169 | |
e26c9b84 L |
170 | movdqu (%rdi,%rsi), %xmm0 |
171 | pcmpeqb (%rdi), %xmm0 | |
172 | pmovmskb %xmm0, %edx | |
173 | subl $0xffff, %edx | |
174 | jnz L(neq) | |
175 | addq $16, %rdi | |
176 | ||
177 | L(A64): | |
178 | movq %r11, %r10 | |
179 | andq $-64, %r10 | |
24a12a5a UD |
180 | cmpq %r10, %rdi |
181 | jge L(mt32) | |
182 | ||
e26c9b84 L |
183 | L(A64main): |
184 | movdqu (%rdi,%rsi), %xmm0 | |
185 | pcmpeqb (%rdi), %xmm0 | |
186 | pmovmskb %xmm0, %edx | |
187 | subl $0xffff, %edx | |
188 | jnz L(neq) | |
189 | addq $16, %rdi | |
24a12a5a | 190 | |
e26c9b84 L |
191 | movdqu (%rdi,%rsi), %xmm0 |
192 | pcmpeqb (%rdi), %xmm0 | |
193 | pmovmskb %xmm0, %edx | |
194 | subl $0xffff, %edx | |
195 | jnz L(neq) | |
196 | addq $16, %rdi | |
197 | ||
198 | movdqu (%rdi,%rsi), %xmm0 | |
199 | pcmpeqb (%rdi), %xmm0 | |
200 | pmovmskb %xmm0, %edx | |
201 | subl $0xffff, %edx | |
202 | jnz L(neq) | |
203 | addq $16, %rdi | |
204 | ||
205 | movdqu (%rdi,%rsi), %xmm0 | |
206 | pcmpeqb (%rdi), %xmm0 | |
207 | pmovmskb %xmm0, %edx | |
208 | subl $0xffff, %edx | |
209 | jnz L(neq) | |
210 | addq $16, %rdi | |
211 | ||
212 | cmpq %rdi, %r10 | |
213 | jne L(A64main) | |
214 | ||
215 | L(mt32): | |
216 | movq %r11, %r10 | |
217 | andq $-32, %r10 | |
24a12a5a UD |
218 | cmpq %r10, %rdi |
219 | jge L(mt16) | |
e26c9b84 L |
220 | |
221 | L(A32main): | |
222 | movdqu (%rdi,%rsi), %xmm0 | |
223 | pcmpeqb (%rdi), %xmm0 | |
224 | pmovmskb %xmm0, %edx | |
225 | subl $0xffff, %edx | |
226 | jnz L(neq) | |
227 | addq $16, %rdi | |
24a12a5a | 228 | |
e26c9b84 L |
229 | movdqu (%rdi,%rsi), %xmm0 |
230 | pcmpeqb (%rdi), %xmm0 | |
231 | pmovmskb %xmm0, %edx | |
232 | subl $0xffff, %edx | |
233 | jnz L(neq) | |
234 | addq $16, %rdi | |
235 | ||
236 | cmpq %rdi, %r10 | |
237 | jne L(A32main) | |
238 | L(mt16): | |
239 | subq %rdi, %r11 | |
240 | je L(finz) | |
24a12a5a UD |
241 | movq %r11, %r10 |
242 | jmp L(small) | |
e26c9b84 L |
243 | |
244 | .p2align 4,, 4 | |
245 | L(neq): | |
24a12a5a | 246 | bsfl %edx, %ecx |
e26c9b84 | 247 | movzbl (%rdi, %rcx), %eax |
24a12a5a | 248 | addq %rdi, %rsi |
e26c9b84 L |
249 | movzbl (%rsi,%rcx), %edx |
250 | jmp L(finz1) | |
251 | ||
252 | .p2align 4,, 4 | |
253 | L(ATR): | |
254 | movq %r11, %r10 | |
24a12a5a UD |
255 | andq $-32, %r10 |
256 | cmpq %r10, %rdi | |
257 | jge L(mt16) | |
e26c9b84 L |
258 | testq $16, %rdi |
259 | jz L(ATR32) | |
260 | ||
261 | movdqa (%rdi,%rsi), %xmm0 | |
262 | pcmpeqb (%rdi), %xmm0 | |
263 | pmovmskb %xmm0, %edx | |
264 | subl $0xffff, %edx | |
265 | jnz L(neq) | |
266 | addq $16, %rdi | |
267 | cmpq %rdi, %r10 | |
268 | je L(mt16) | |
269 | ||
270 | L(ATR32): | |
271 | movq %r11, %r10 | |
272 | andq $-64, %r10 | |
273 | testq $32, %rdi | |
274 | jz L(ATR64) | |
275 | ||
276 | movdqa (%rdi,%rsi), %xmm0 | |
277 | pcmpeqb (%rdi), %xmm0 | |
278 | pmovmskb %xmm0, %edx | |
279 | subl $0xffff, %edx | |
280 | jnz L(neq) | |
281 | addq $16, %rdi | |
282 | ||
283 | movdqa (%rdi,%rsi), %xmm0 | |
284 | pcmpeqb (%rdi), %xmm0 | |
285 | pmovmskb %xmm0, %edx | |
286 | subl $0xffff, %edx | |
287 | jnz L(neq) | |
288 | addq $16, %rdi | |
289 | ||
290 | L(ATR64): | |
291 | cmpq %rdi, %r10 | |
24a12a5a | 292 | je L(mt32) |
e26c9b84 L |
293 | |
294 | L(ATR64main): | |
295 | movdqa (%rdi,%rsi), %xmm0 | |
296 | pcmpeqb (%rdi), %xmm0 | |
297 | pmovmskb %xmm0, %edx | |
298 | subl $0xffff, %edx | |
299 | jnz L(neq) | |
300 | addq $16, %rdi | |
301 | ||
302 | movdqa (%rdi,%rsi), %xmm0 | |
303 | pcmpeqb (%rdi), %xmm0 | |
304 | pmovmskb %xmm0, %edx | |
305 | subl $0xffff, %edx | |
306 | jnz L(neq) | |
307 | addq $16, %rdi | |
308 | ||
309 | movdqa (%rdi,%rsi), %xmm0 | |
310 | pcmpeqb (%rdi), %xmm0 | |
311 | pmovmskb %xmm0, %edx | |
312 | subl $0xffff, %edx | |
313 | jnz L(neq) | |
314 | addq $16, %rdi | |
315 | ||
316 | movdqa (%rdi,%rsi), %xmm0 | |
317 | pcmpeqb (%rdi), %xmm0 | |
318 | pmovmskb %xmm0, %edx | |
319 | subl $0xffff, %edx | |
320 | jnz L(neq) | |
321 | addq $16, %rdi | |
322 | cmpq %rdi, %r10 | |
323 | jne L(ATR64main) | |
324 | ||
325 | movq %r11, %r10 | |
24a12a5a UD |
326 | andq $-32, %r10 |
327 | cmpq %r10, %rdi | |
328 | jge L(mt16) | |
e26c9b84 L |
329 | |
330 | L(ATR32res): | |
331 | movdqa (%rdi,%rsi), %xmm0 | |
332 | pcmpeqb (%rdi), %xmm0 | |
333 | pmovmskb %xmm0, %edx | |
334 | subl $0xffff, %edx | |
335 | jnz L(neq) | |
336 | addq $16, %rdi | |
337 | ||
338 | movdqa (%rdi,%rsi), %xmm0 | |
339 | pcmpeqb (%rdi), %xmm0 | |
340 | pmovmskb %xmm0, %edx | |
341 | subl $0xffff, %edx | |
342 | jnz L(neq) | |
343 | addq $16, %rdi | |
344 | ||
24a12a5a | 345 | cmpq %r10, %rdi |
e26c9b84 L |
346 | jne L(ATR32res) |
347 | ||
348 | subq %rdi, %r11 | |
349 | je L(finz) | |
24a12a5a UD |
350 | movq %r11, %r10 |
351 | jmp L(small) | |
e26c9b84 L |
352 | /* Align to 16byte to improve instruction fetch. */ |
353 | .p2align 4,, 4 | |
354 | END(memcmp) | |
355 | ||
356 | #undef bcmp | |
357 | weak_alias (memcmp, bcmp) | |
358 | libc_hidden_builtin_def (memcmp) |