]>
Commit | Line | Data |
---|---|---|
093ecf92 LD |
1 | /* Fast SSE2 rawmemchr with a 64-byte loop using the pmaxub instruction. |
2 | ||
d4697bc9 | 3 | Copyright (C) 2011-2014 Free Software Foundation, Inc. |
093ecf92 | 4 | Contributed by Intel Corporation. |
f140a0d5 UD |
5 | This file is part of the GNU C Library. |
6 | ||
7 | The GNU C Library is free software; you can redistribute it and/or | |
8 | modify it under the terms of the GNU Lesser General Public | |
9 | License as published by the Free Software Foundation; either | |
10 | version 2.1 of the License, or (at your option) any later version. | |
11 | ||
12 | The GNU C Library is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | Lesser General Public License for more details. | |
16 | ||
17 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
18 | License along with the GNU C Library; if not, see |
19 | <http://www.gnu.org/licenses/>. */ | |
f140a0d5 UD |
20 | |
21 | #include <sysdep.h> | |
22 | ||
/* void *__rawmemchr (const void *s, int c)

   Scan forward from S for the first byte equal to the low byte of C and
   return its address.  There is no length limit and no "not found"
   result: the scan only stops on a match, so the caller must guarantee
   that the byte occurs.

   ABI:      System V AMD64, AT&T syntax.
   In:       %rdi = s, %esi = c (only the low byte is used).
   Out:      %rax = address of the first matching byte.
   Clobbers: %rax, %rcx, %rdi, %xmm0-%xmm4, flags.

   Every 16-byte load is either aligned or is known not to leave the
   64-byte block that contains S, so no load straddles a 64-byte (and
   hence page) boundary.  */

	.text
ENTRY (__rawmemchr)
	/* Broadcast the search byte into all 16 lanes of %xmm1.  The two
	   unpacks plus the shuffle replicate byte 0 only, so the 32-bit
	   source register is sufficient.  */
	movd	%esi, %xmm1
	mov	%rdi, %rcx

	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	/* %rcx = offset of S inside its 64-byte block.  */
	and	$63, %rcx
	pshufd	$0, %xmm1, %xmm1

	/* With an offset above 48 an unaligned 16-byte load would run into
	   the next 64-byte block; take the aligned path instead.  */
	cmp	$48, %rcx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches)
	/* No match in the first 16 bytes: continue from the next 16-byte
	   boundary (this may re-check a few bytes already seen).  */
	add	$16, %rdi
	and	$-16, %rdi
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* %rcx = misalignment of S within its 16-byte chunk; %rdi is
	   rounded down so the load is aligned.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes, i.e. those that lie before S.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax

	/* Result = aligned base + misalignment + bit index.  */
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	add	$16, %rdi

	.p2align 4
L(loop_prolog):
	/* %rdi is 16-byte aligned here.  Check up to two groups of four
	   16-byte chunks one at a time until %rdi can be rounded to a
	   64-byte boundary for the main loop.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	/* %rdi is advanced before the branch; L(matches0) compensates.  */
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Already 64-byte aligned?  Then enter the main loop directly.  */
	test	$0x3f, %rdi
	jz	L(align64_loop)

	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round down to a 64-byte boundary.  The bytes between the new
	   %rdi and the old one were just checked and hold no match.  */
	and	$-64, %rdi

	.p2align 4
L(align64_loop):
	/* Main loop: compare 64 aligned bytes per iteration.  */
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Compare results are 0x00 or 0xff per byte, so the unsigned
	   maximum acts as a bytewise OR: %xmm4 ends up non-zero iff any of
	   the four chunks matched.  %xmm3 and %xmm4 are overwritten;
	   %xmm0 and %xmm2 keep their individual results.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies in the 64 bytes just examined; step back to them
	   and find the first chunk that contains it.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* %xmm3 was merged with %xmm0 above, so recompute chunk 2.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* Recompute chunk 3 into %xmm1; the pattern is not needed again
	   because every remaining path returns.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Chunks 0-2 are clean, so the match must be in chunk 3 and the
	   mask is known to be non-zero.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches0):
	/* Match in the fourth chunk of a prologue group; %rdi has already
	   been advanced by 64, hence the -16 displacement.  */
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	/* Match in the chunk at (%rdi).  */
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	/* Match in the chunk at 16(%rdi).  */
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	/* Match in the chunk at 32(%rdi).  */
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

END (__rawmemchr)

weak_alias (__rawmemchr, rawmemchr)
libc_hidden_builtin_def (__rawmemchr)