]>
Commit | Line | Data |
---|---|---|
ce7dd29f | 1 | /* Optimized wcslen for x86-64 with SSE2. |
bfff8b1b | 2 | Copyright (C) 2011-2017 Free Software Foundation, Inc. |
ce7dd29f LD |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
ce7dd29f LD |
19 | |
20 | #include <sysdep.h> | |
21 | ||
/* size_t __wcslen (const wchar_t *s)

   Count the 32-bit elements of S that precede the first all-zero
   element.

   In:	 %rdi = s.
   Out:	 %rax = number of non-zero 32-bit elements before the terminator.
   Uses: %rcx, %rdx, %xmm0-%xmm3, %xmm6 and the flags as scratch.

   NOTE(review): the lane-based decoding in L(exit) assumes S is 4-byte
   aligned, so that every element sits in exactly one dword lane of an
   aligned 16-byte block -- confirm against callers.

   Strategy:
     1. Test elements 0..7 one at a time with scalar compares.
     2. Probe twelve consecutive 16-byte-aligned blocks with PCMPEQD.
     3. Scan 64-byte-aligned chunks, folding four vectors with PMINUB
	so that one compare covers the whole chunk.

   Every vector load uses an address rounded down with AND, so blocks may
   overlap elements that were already found to be non-zero; re-testing
   them is harmless.

   Bookkeeping invariant at every jump to L(exit):
	%rax - %rcx == (address of the block that matched) - s
   and %edx holds the non-zero PMOVMSKB mask for that block.  */

/* Compare the four elements at (%rax) with VEC, which must be all-zero on
   entry, then advance %rax by 16.  Leaves through L(exit) if any element
   is zero.  When nothing matches, the compare result is all-zero, so VEC
   is ready to be used by a later probe without being cleared again.  */
	.macro	PROBE16 vec
	pcmpeqd	(%rax), \vec
	pmovmskb \vec, %edx
	test	%edx, %edx
	lea	16(%rax), %rax
	jnz	L(exit)
	.endm

	.text
ENTRY (__wcslen)
	/* Stage 1: elements 0..7, one scalar compare each.  */
	cmpl	$0, (%rdi)
	jz	L(exit_tail0)
	cmpl	$0, 4(%rdi)
	jz	L(exit_tail1)
	cmpl	$0, 8(%rdi)
	jz	L(exit_tail2)
	cmpl	$0, 12(%rdi)
	jz	L(exit_tail3)
	cmpl	$0, 16(%rdi)
	jz	L(exit_tail4)
	cmpl	$0, 20(%rdi)
	jz	L(exit_tail5)
	cmpl	$0, 24(%rdi)
	jz	L(exit_tail6)
	cmpl	$0, 28(%rdi)
	jz	L(exit_tail7)

	/* Stage 2: twelve single-block probes.  Four zeroed registers are
	   rotated so that consecutive probes do not wait on each other.  */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

	lea	32(%rdi), %rax		/* First byte not yet tested...  */
	lea	16(%rdi), %rcx		/* Bias: PROBE16 bumps %rax by 16
					   before it branches to L(exit).  */
	and	$-16, %rax		/* ...rounded down to a 16-byte
					   boundary.  */

	PROBE16	%xmm0
	PROBE16	%xmm1
	PROBE16	%xmm2
	PROBE16	%xmm3

	PROBE16	%xmm0
	PROBE16	%xmm1
	PROBE16	%xmm2
	PROBE16	%xmm3

	PROBE16	%xmm0
	PROBE16	%xmm1
	PROBE16	%xmm2
	PROBE16	%xmm3

	/* Stage 3: round down to a 64-byte boundary for the main loop.
	   %xmm3 is all-zero here because the last probe found nothing.  */
	and	$-0x40, %rax

	.p2align 4
L(aligned_64_loop):
	movaps	(%rax), %xmm0
	movaps	16(%rax), %xmm1
	movaps	32(%rax), %xmm2
	movaps	48(%rax), %xmm6

	/* Byte-wise minimum of all four vectors ends up in %xmm2; %xmm1 and
	   %xmm6 keep their loaded values for the re-check below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%rax), %rax
	jz	L(aligned_64_loop)

	/* Some dword of the folded vector is zero.  Re-test the four source
	   vectors in address order to find which one (if any) really holds
	   a zero element.  %rax already points 64 bytes past the chunk, so
	   %rcx is moved to keep the L(exit) invariant for each candidate:
	   +48 for the first vector, then -16 for each following one.  */
	pcmpeqd	-64(%rax), %xmm3	/* Vector 0 (reloaded from memory).  */
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3		/* Vector 1.  */
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	-32(%rax), %xmm3	/* Vector 2 (reloaded from memory).  */
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3		/* Vector 3.  */
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%rcx), %rcx
	jnz	L(exit)

	/* False positive: PMINUB works on bytes, so the folded dword can be
	   zero although no single element is (e.g. 0x00000100 and
	   0x00000001 in the same lane).  %rcx is back at its original value
	   (+48 -16 -16 -16) and %xmm3 is still all-zero, so just carry on
	   with the next chunk.  */
	jmp	L(aligned_64_loop)

	/* In:	%rax - %rcx = byte offset of the matching block from s,
		%edx = non-zero PMOVMSKB mask of that block.
	   PCMPEQD sets all four mask bits of a matching dword lane, so the
	   lowest set bit is 0, 4, 8 or 12 and bit / 4 is the lane number.  */
	.p2align 4
L(exit):
	sub	%rcx, %rax
	shr	$2, %rax		/* Element index of lane 0.  */
	bsf	%edx, %edx
	shr	$2, %edx		/* Lane number 0..3; the 32-bit write
					   clears the upper half of %rdx.  */
	add	%rdx, %rax
	ret

	/* Returns for a terminator found by the scalar checks.  Writing
	   %eax zero-extends into %rax.  */
	.p2align 4
L(exit_tail0):
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_tail1):
	mov	$1, %eax
	ret

	.p2align 4
L(exit_tail2):
	mov	$2, %eax
	ret

	.p2align 4
L(exit_tail3):
	mov	$3, %eax
	ret

	.p2align 4
L(exit_tail4):
	mov	$4, %eax
	ret

	.p2align 4
L(exit_tail5):
	mov	$5, %eax
	ret

	.p2align 4
L(exit_tail6):
	mov	$6, %eax
	ret

	.p2align 4
L(exit_tail7):
	mov	$7, %eax
	ret

END (__wcslen)

/* Export the implementation under the public name as a weak symbol.  */
weak_alias(__wcslen, wcslen)