/* strcmp with unaligned loads
   Copyright (C) 2013-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include "sysdep.h"

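/* Compare 64 bytes at a time with SSE2.  %xmm7 is kept zero for the
   NUL tests throughout, and %rdx is the byte index used by the
   byte-wise fallback.  If either pointer is within 64 bytes of the
   end of a 4 KiB page ((s1 | s2) & 4095 > 4032), a 64-byte read
   could fault on the following page, so start with the byte-wise
   L(cross_page) path instead.  */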
ENTRY (__strcmp_sse2_unaligned)
	movl	%edi, %eax
	xorl	%edx, %edx
	pxor	%xmm7, %xmm7
	orl	%esi, %eax
	andl	$4095, %eax
	cmpl	$4032, %eax
	jg	L(cross_page)
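	/* Check the first 16 bytes.  PCMPEQB marks equal bytes, and
	   taking PMINUB with the s1 bytes leaves a zero wherever the
	   strings differ or s1 has a NUL, so after the second PCMPEQB
	   against zero the PMOVMSKB mask is nonzero iff the result is
	   decided within these bytes.  */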
	movdqu	(%rdi), %xmm1
	movdqu	(%rsi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm1, %xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	testq	%rax, %rax
	je	L(next_48_bytes)
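	/* %rax has a bit set for each byte that decides the result.
	   BSF finds the first such byte; return the difference of the
	   two characters at that offset.  */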
L(return):
	bsfq	%rax, %rdx
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
	ret

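	/* The first 16 bytes matched with no NUL; check bytes 16..63
	   the same way, shifting the three 16-bit masks into place and
	   merging them into a single 64-bit mask in %rax.  */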
	.p2align 4
L(next_48_bytes):
	movdqu	16(%rdi), %xmm6
	movdqu	16(%rsi), %xmm3
	movdqu	32(%rdi), %xmm5
	pcmpeqb	%xmm6, %xmm3
	movdqu	32(%rsi), %xmm2
	pminub	%xmm6, %xmm3
	pcmpeqb	%xmm1, %xmm3
	movdqu	48(%rdi), %xmm4
	pcmpeqb	%xmm5, %xmm2
	pmovmskb	%xmm3, %edx
	movdqu	48(%rsi), %xmm0
	pminub	%xmm5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm4, %xmm0
	pmovmskb	%xmm2, %eax
	salq	$16, %rdx
	pminub	%xmm4, %xmm0
	pcmpeqb	%xmm1, %xmm0
	salq	$32, %rax
	orq	%rdx, %rax
	pmovmskb	%xmm0, %ecx
	movq	%rcx, %rdx
	salq	$48, %rdx
	orq	%rdx, %rax
	jne	L(return)
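	/* The first 64 bytes are equal.  Round s1 up to a 64-byte
	   boundary and advance s2 by the same amount, so the main loop
	   can use aligned loads on s1 (%rax); %rsi counts the 64-byte
	   blocks of s2 (%rdx) left before the next 4 KiB page
	   boundary.  */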
L(main_loop_header):
	leaq	64(%rdi), %rdx
	movl	$4096, %ecx
	pxor	%xmm9, %xmm9
	andq	$-64, %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$4095, %esi
	subq	%rsi, %rcx
	shrq	$6, %rcx
	movq	%rcx, %rsi
	jmp	L(loop_start)

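	/* Main loop: compare 64 bytes per iteration.  When the block
	   counter in %rsi reaches zero, s2 is about to cross a page
	   and the iteration detours through L(loop_cross_page).  */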
	.p2align 4
L(loop):
	addq	$64, %rax
	addq	$64, %rdx
L(loop_start):
	testq	%rsi, %rsi
	leaq	-1(%rsi), %rsi
	je	L(loop_cross_page)
L(back_to_loop):
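	/* Each PCMPEQB/PMINUB pair leaves a zero byte where the inputs
	   differ or s1 has a NUL; folding the four 16-byte results with
	   PMINUB lets a single PCMPEQB against %xmm7 test all 64 bytes
	   at once.  */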
	movdqu	(%rdx), %xmm0
	movdqu	16(%rdx), %xmm1
	movdqa	(%rax), %xmm2
	movdqa	16(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqu	32(%rdx), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqu	48(%rdx), %xmm6
	pminub	%xmm3, %xmm1
	movdqa	32(%rax), %xmm2
	pminub	%xmm1, %xmm0
	movdqa	48(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	pminub	%xmm5, %xmm0
	pminub	%xmm6, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
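	/* The result is decided within this block.  %xmm1, %xmm5 and
	   %xmm6 still hold the folded results for bytes 16..63; bytes
	   0..15 are redone because %xmm0 was clobbered.  Merge the
	   four 16-bit masks into %rcx and locate the byte with BSF.  */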
	pcmpeqb	%xmm7, %xmm5
	movdqu	(%rdx), %xmm0
	pcmpeqb	%xmm7, %xmm1
	movdqa	(%rax), %xmm2
	pcmpeqb	%xmm2, %xmm0
	pminub	%xmm2, %xmm0
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rcx
	orq	%rdi, %rcx
	salq	$48, %rsi
	orq	%rsi, %rcx
	bsfq	%rcx, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

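	/* s2 (%rdx) is about to cross a page.  %r10 = -(s2 % 64) backs
	   both pointers up to the previous 64-byte boundary of s2, so
	   the aligned loads from s2 cannot touch the next page.  After
	   merging the masks, shift off the low %r9 bits, which cover
	   bytes already compared earlier.  %rsi is reset to 63 so the
	   block counter expires again at the next page boundary.  */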
	.p2align 4
L(loop_cross_page):
	xor	%r10, %r10
	movq	%rdx, %r9
	and	$63, %r9
	subq	%r9, %r10

	movdqa	(%rdx, %r10), %xmm0
	movdqa	16(%rdx, %r10), %xmm1
	movdqu	(%rax, %r10), %xmm2
	movdqu	16(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqa	32(%rdx, %r10), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqa	48(%rdx, %r10), %xmm6
	pminub	%xmm3, %xmm1
	movdqu	32(%rax, %r10), %xmm2
	movdqu	48(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6

	pcmpeqb	%xmm7, %xmm0
	pcmpeqb	%xmm7, %xmm1
	pcmpeqb	%xmm7, %xmm5
	pcmpeqb	%xmm7, %xmm6

	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rdi
	orq	%rcx, %rdi
	salq	$48, %rsi
	orq	%rsi, %rdi
	movq	%r9, %rcx
	movq	$63, %rsi
	shrq	%cl, %rdi
	test	%rdi, %rdi
	je	L(back_to_loop)
	bsfq	%rdi, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

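	/* Byte-wise compare of the first 64 bytes, entered with
	   %rdx == 0 when one of the inputs starts too close to a page
	   end for the vector loads above.  Once 64 bytes compare
	   equal, switch to the vectorized main loop.  */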
	.p2align 4
L(cross_page_loop):
	cmpb	%cl, %al
	jne	L(different)
	addq	$1, %rdx
	cmpq	$64, %rdx
	je	L(main_loop_header)
L(cross_page):
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
	testb	%al, %al
	jne	L(cross_page_loop)
	xorl	%eax, %eax
L(different):
	subl	%ecx, %eax
	ret
END (__strcmp_sse2_unaligned)