/* wcslen with SSE2
   Copyright (C) 2011-2018 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Byte offset from %esp, at function entry, of the string pointer that
   the routine loads first.
   NOTE(review): assumes ENTRY leaves %esp untouched -- confirm against
   sysdep.h.  */
# define STR	4

/* __wcslen_sse2 -- count the 32-bit elements that precede the first
   all-zero 32-bit element of a string.

   Syntax: GNU as, AT&T operand order (source, destination), i386.
   Presumed C signature (inferred from the name, not visible here):
	size_t __wcslen_sse2 (const wchar_t *s);

   In:	STR(%esp) = s.
	NOTE(review): the vector phases use 16-byte aligned blocks and
	compare whole dwords, so s is assumed to be 4-byte aligned --
	confirm with callers.
   Out:	%eax = number of elements before the terminating zero element.
   Clobbers: %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6, EFLAGS.
   Stack: none used.

   Phases:
     1. The first eight elements (32 bytes) are tested one by one.
     2. Four 16-byte aligned blocks are probed individually, starting
	at (s + 32) & -16.
     3. The pointer is rounded down to a 64-byte boundary and the main
	loop examines 64 bytes per iteration.

   Register roles from phase 2 onwards:
     %eax  address just past the block (or 64-byte group) last loaded
     %ecx  bias chosen so that, at L(exit), %eax - %ecx is the byte
	   offset from s of the block that holds the terminator
     %edx  pmovmskb result: four mask bits per 32-bit element  */

	.text
ENTRY (__wcslen_sse2)
	mov	STR(%esp), %edx		/* %edx = s  */

	/* Phase 1: scalar test of elements 0..7.  The explicit `l' suffix
	   is required: with an immediate source and a memory destination
	   nothing else fixes the operand size, and these must be 32-bit
	   (whole element) comparisons.  */
	cmpl	$0, (%edx)
	jz	L(exit_tail0)
	cmpl	$0, 4(%edx)
	jz	L(exit_tail1)
	cmpl	$0, 8(%edx)
	jz	L(exit_tail2)
	cmpl	$0, 12(%edx)
	jz	L(exit_tail3)
	cmpl	$0, 16(%edx)
	jz	L(exit_tail4)
	cmpl	$0, 20(%edx)
	jz	L(exit_tail5)
	cmpl	$0, 24(%edx)
	jz	L(exit_tail6)
	cmpl	$0, 28(%edx)
	jz	L(exit_tail7)

	/* Phase 2: probe four aligned 16-byte blocks, one at a time.
	   Rounding s + 32 down to a 16-byte boundary can only re-read
	   elements that phase 1 already found to be non-zero.  */
	pxor	%xmm0, %xmm0		/* zero reference for the compare  */

	lea	32(%edx), %eax
	lea	16(%edx), %ecx		/* bias: %eax is bumped by 16
					   before every jump to L(exit)  */
	and	$-16, %eax

	pcmpeqd	(%eax), %xmm0		/* 0xffffffff in each lane that
					   holds a zero element  */
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%eax), %eax		/* lea leaves the flags intact  */
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqd	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	/* Phase 3.  No lane matched above, so %xmm3 is all zero here; the
	   loop and the re-check below use it as the zero reference and
	   keep it zero whenever they fall through.  Rounding down to a
	   64-byte boundary may re-scan blocks already probed in phase 2,
	   which are known to contain no zero element.  */
	and	$-0x40, %eax

	.p2align 4
L(aligned_64_loop):
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6

	/* Fold the four blocks with a byte-wise unsigned minimum.  A zero
	   element in any block yields a zero dword in the result.  The
	   converse does not hold: zero bytes taken from different blocks
	   (e.g. 0x0000ffff and 0xffff0000) also fold to a zero dword, so
	   a hit here must be confirmed per block below.  */
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqd	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%eax), %eax
	jz	L(aligned_64_loop)

	/* Confirm the hit block by block.  %xmm0 and %xmm2 were overwritten
	   by the fold, so blocks 0 and 2 are re-read from memory; blocks 1
	   and 3 are still intact in %xmm1 and %xmm6.  %ecx is raised by 48
	   for block 0 and lowered by 16 for each later block so that
	   %eax - %ecx stays the byte offset of the block under test; the
	   adjustments cancel out if all four checks fall through.  */
	pcmpeqd	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqd	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	/* The fold reported a zero dword that no single block contains
	   (see above); %xmm3 is still zero and %ecx is back to its loop
	   value, so simply continue with the next 64 bytes.  */
	jmp	L(aligned_64_loop)

	/* Turn (%eax, %ecx, %edx) into the result.  After the shift %eax is
	   the index of the first element of the block holding the
	   terminator; the mask in %edx selects which of its four elements
	   it is: %dl low nibble = element 0, %dl high nibble = element 1,
	   %dh low nibble = element 2, otherwise element 3.  */
	.p2align 4
L(exit):
	sub	%ecx, %eax
	shr	$2, %eax		/* bytes -> elements  */
	test	%dl, %dl
	jz	L(exit_high)		/* terminator is element 2 or 3  */

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_1)
	ret				/* element 0  */

	.p2align 4
L(exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_3)
	add	$2, %eax		/* element 2  */
	ret

	.p2align 4
L(exit_1):
	add	$1, %eax
	ret

	.p2align 4
L(exit_3):
	add	$3, %eax
	ret

	/* Phase 1 exits: the terminator is one of the first eight
	   elements, so the length is a constant.  */
	.p2align 4
L(exit_tail0):
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_tail1):
	mov	$1, %eax
	ret

	.p2align 4
L(exit_tail2):
	mov	$2, %eax
	ret

	.p2align 4
L(exit_tail3):
	mov	$3, %eax
	ret

	.p2align 4
L(exit_tail4):
	mov	$4, %eax
	ret

	.p2align 4
L(exit_tail5):
	mov	$5, %eax
	ret

	.p2align 4
L(exit_tail6):
	mov	$6, %eax
	ret

	.p2align 4
L(exit_tail7):
	mov	$7, %eax
	ret

END (__wcslen_sse2)
#endif