]>
Commit | Line | Data |
---|---|---|
093ecf92 LD |
1 | /* Fast SSE2 rawmemchr using a 64-byte loop and the pmaxub instruction. |
2 | ||
2b778ceb | 3 | Copyright (C) 2011-2021 Free Software Foundation, Inc. |
f140a0d5 UD |
4 | This file is part of the GNU C Library. |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 17 | License along with the GNU C Library; if not, see |
5a82c748 | 18 | <https://www.gnu.org/licenses/>. */ |
f140a0d5 UD |
19 | |
20 | #include <sysdep.h> | |
21 | ||
f140a0d5 | 22 | .text |
/* __rawmemchr: unbounded search for a byte value.

   In:   %rdi = address at which to start searching
         %rsi = value to look for (only its low 8 bits are used)
   Out:  %rax = address of the first byte at or after the start address
                that equals the value
   Uses: %rax, %rcx, %rdi, %xmm0-%xmm4 and the flags; nothing is saved
         or restored and the stack is not touched.

   There is no length argument and no terminator check: every loop
   below exits only when a matching byte has been found.

   Outline:
     1. Broadcast the search byte to all 16 lanes of %xmm1.
     2. Examine the first 16 bytes (unaligned load when that load stays
        inside the current 64-byte block, otherwise an aligned load
        with the leading bytes masked off).
     3. L(loop_prolog): step in aligned 16-byte chunks until %rdi is on
        a 64-byte boundary.
     4. L(align64_loop): test 64 aligned bytes per iteration, merging
        the four compare results with pmaxub.
     5. L(matches*): turn the bit index of the match into an address.  */
293d9a41 | 23 | ENTRY (__rawmemchr) |
/* Build the 16-byte search pattern in %xmm1: the two punpcklbw steps
   replicate the low byte into the low dword and pshufd $0 copies that
   dword to all four dword lanes.  Interleaved with that, %rcx receives
   the start address's offset within its 64-byte block.  */
093ecf92 LD |
24 | movd %rsi, %xmm1 |
25 | mov %rdi, %rcx | |
26 | ||
f140a0d5 | 27 | punpcklbw %xmm1, %xmm1 |
f140a0d5 | 28 | punpcklbw %xmm1, %xmm1 |
093ecf92 LD |
29 | |
30 | and $63, %rcx | |
f140a0d5 | 31 | pshufd $0, %xmm1, %xmm1 |
/* With an offset of at most 48, the 16 bytes starting at %rdi end at
   offset 63 or earlier, i.e. inside the same 64-byte block, so the
   unaligned load below is used.  Larger offsets take the aligned-load
   path at L(crosscache).  */
093ecf92 LD |
32 | |
33 | cmp $48, %rcx | |
34 | ja L(crosscache) | |
35 | ||
36 | movdqu (%rdi), %xmm0 | |
f140a0d5 | 37 | pcmpeqb %xmm1, %xmm0 |
093ecf92 LD |
38 | /* Check if there is a match. */ |
39 | pmovmskb %xmm0, %eax | |
40 | test %eax, %eax | |
41 | ||
42 | jnz L(matches) | |
/* No match in the first 16 bytes.  Move %rdi to the next 16-byte
   boundary above the start address; every byte skipped by this was
   covered by the load above.  */
43 | add $16, %rdi | |
44 | and $-16, %rdi | |
45 | jmp L(loop_prolog) | |
46 | ||
47 | .p2align 4 | |
48 | L(crosscache): | |
/* %rcx becomes the start address's offset within its 16-byte chunk and
   %rdi is rounded down to that chunk, which makes the aligned load
   valid.  The chunk's first %rcx bytes lie before the start address.  */
49 | and $15, %rcx | |
50 | and $-16, %rdi | |
51 | movdqa (%rdi), %xmm0 | |
f140a0d5 | 52 | 
f140a0d5 | 53 | pcmpeqb %xmm1, %xmm0 |
093ecf92 LD |
54 | /* Check if there is a match. */ |
55 | pmovmskb %xmm0, %eax | |
56 | /* Remove the leading bytes. */ | |
/* The mask occupies only the low 16 bits of %eax, so this shift brings
   in zeros; afterwards bit 0 corresponds to the start address.  */
57 | sar %cl, %eax | |
58 | test %eax, %eax | |
59 | je L(unaligned_no_match) | |
60 | /* Check which byte is a match. */ | |
61 | bsf %eax, %eax | |
f140a0d5 | 62 | 
/* Result = aligned chunk base + offset shifted out above + bit index.  */
093ecf92 LD |
63 | add %rdi, %rax |
64 | add %rcx, %rax | |
f140a0d5 | 65 | ret |
093ecf92 LD |
66 | |
67 | .p2align 4 | |
68 | L(unaligned_no_match): | |
/* Nothing found in the partial first chunk; continue with the next
   16-byte chunk.  %rdi is already 16-byte aligned here.  */
69 | add $16, %rdi | |
70 | ||
71 | .p2align 4 | |
72 | L(loop_prolog): | |
/* %rdi is 16-byte aligned on every path that reaches this label.
   Check 64 bytes as four 16-byte chunks, leaving as soon as one of
   them contains a match.  */
73 | movdqa (%rdi), %xmm0 | |
74 | pcmpeqb %xmm1, %xmm0 | |
75 | pmovmskb %xmm0, %eax | |
76 | test %eax, %eax | |
77 | jnz L(matches) | |
78 | ||
79 | movdqa 16(%rdi), %xmm2 | |
80 | pcmpeqb %xmm1, %xmm2 | |
81 | pmovmskb %xmm2, %eax | |
82 | test %eax, %eax | |
83 | jnz L(matches16) | |
84 | ||
85 | movdqa 32(%rdi), %xmm3 | |
86 | pcmpeqb %xmm1, %xmm3 | |
87 | pmovmskb %xmm3, %eax | |
88 | test %eax, %eax | |
89 | jnz L(matches32) | |
90 | ||
/* For the fourth chunk %rdi is advanced by 64 before the branch, so
   L(matches0) uses a displacement of -16 (= 48 - 64).  */
91 | movdqa 48(%rdi), %xmm4 | |
92 | pcmpeqb %xmm1, %xmm4 | |
93 | add $64, %rdi | |
94 | pmovmskb %xmm4, %eax | |
95 | test %eax, %eax | |
96 | jnz L(matches0) | |
97 | ||
/* If %rdi is now on a 64-byte boundary, go straight to the main loop.  */
98 | test $0x3f, %rdi | |
99 | jz L(align64_loop) | |
100 | ||
/* Otherwise check a second group of 64 bytes in the same way.  */
101 | movdqa (%rdi), %xmm0 | |
102 | pcmpeqb %xmm1, %xmm0 | |
103 | pmovmskb %xmm0, %eax | |
104 | test %eax, %eax | |
105 | jnz L(matches) | |
106 | ||
107 | movdqa 16(%rdi), %xmm2 | |
108 | pcmpeqb %xmm1, %xmm2 | |
109 | pmovmskb %xmm2, %eax | |
110 | test %eax, %eax | |
111 | jnz L(matches16) | |
112 | ||
113 | movdqa 32(%rdi), %xmm3 | |
114 | pcmpeqb %xmm1, %xmm3 | |
115 | pmovmskb %xmm3, %eax | |
116 | test %eax, %eax | |
117 | jnz L(matches32) | |
118 | ||
119 | movdqa 48(%rdi), %xmm3 | |
120 | pcmpeqb %xmm1, %xmm3 | |
121 | pmovmskb %xmm3, %eax | |
122 | ||
123 | add $64, %rdi | |
124 | test %eax, %eax | |
125 | jnz L(matches0) | |
126 | ||
/* Round %rdi down to a 64-byte boundary.  %rdi is 16-byte but not
   64-byte aligned here, so this steps back 16, 32 or 48 bytes into the
   group that was just checked; re-scanning them is harmless because
   they are known to contain no match.  */
127 | and $-64, %rdi | |
128 | ||
129 | .p2align 4 | |
130 | L(align64_loop): | |
/* Main loop: %rdi is 64-byte aligned.  Load four 16-byte chunks and
   compare each with the pattern; pcmpeqb sets a byte lane to 0xff on
   equality and to 0 otherwise.  */
131 | movdqa (%rdi), %xmm0 | |
132 | movdqa 16(%rdi), %xmm2 | |
133 | movdqa 32(%rdi), %xmm3 | |
134 | movdqa 48(%rdi), %xmm4 | |
135 | ||
136 | pcmpeqb %xmm1, %xmm0 | |
137 | pcmpeqb %xmm1, %xmm2 | |
138 | pcmpeqb %xmm1, %xmm3 | |
139 | pcmpeqb %xmm1, %xmm4 | |
140 | ||
/* Merge the four results with a byte-wise unsigned maximum so that one
   pmovmskb/test pair covers all 64 bytes.  This overwrites %xmm3 and
   %xmm4; %xmm0 and %xmm2 keep their individual compare results.  */
141 | pmaxub %xmm0, %xmm3 | |
142 | pmaxub %xmm2, %xmm4 | |
143 | pmaxub %xmm3, %xmm4 | |
144 | pmovmskb %xmm4, %eax | |
145 | ||
146 | add $64, %rdi | |
147 | ||
148 | test %eax, %eax | |
149 | jz L(align64_loop) | |
150 | ||
/* At least one of the 64 bytes matched.  Point %rdi back at the start
   of that group and find the first matching chunk.  */
151 | sub $64, %rdi | |
152 | ||
153 | pmovmskb %xmm0, %eax | |
154 | test %eax, %eax | |
155 | jnz L(matches) | |
156 | ||
157 | pmovmskb %xmm2, %eax | |
158 | test %eax, %eax | |
159 | jnz L(matches16) | |
160 | ||
/* %xmm3 no longer holds the result for bytes 32-47 (pmaxub overwrote
   it), so that compare is redone.  */
161 | movdqa 32(%rdi), %xmm3 | |
162 | pcmpeqb %xmm1, %xmm3 | |
163 | ||
/* %xmm4 was overwritten as well.  Bytes 48-63 are compared directly
   into %xmm1, which destroys the search pattern; no later instruction
   on this path reads the pattern again.  */
164 | pcmpeqb 48(%rdi), %xmm1 | |
165 | pmovmskb %xmm3, %eax | |
166 | test %eax, %eax | |
167 | jnz L(matches32) | |
168 | ||
/* The first three chunks had no match, so the match is in bytes 48-63
   and this mask is nonzero.  */
169 | pmovmskb %xmm1, %eax | |
170 | bsf %eax, %eax | |
171 | lea 48(%rdi, %rax), %rax | |
172 | ret | |
173 | ||
174 | .p2align 4 | |
175 | L(matches0): | |
/* Match in the fourth chunk of a prolog group.  %rdi was already
   advanced by 64, so the address is %rdi - 64 + 48 + bit index.  */
176 | bsf %eax, %eax | |
177 | lea -16(%rax, %rdi), %rax | |
178 | ret | |
179 | ||
180 | .p2align 4 | |
181 | L(matches): | |
/* Match in the chunk at %rdi: address = %rdi + bit index.  */
182 | bsf %eax, %eax | |
183 | add %rdi, %rax | |
184 | ret | |
185 | ||
186 | .p2align 4 | |
187 | L(matches16): | |
/* Match in the chunk at %rdi + 16.  */
188 | bsf %eax, %eax | |
189 | lea 16(%rax, %rdi), %rax | |
190 | ret | |
191 | ||
192 | .p2align 4 | |
193 | L(matches32): | |
/* Match in the chunk at %rdi + 32.  */
194 | bsf %eax, %eax | |
195 | lea 32(%rax, %rdi), %rax | |
196 | ret | |
197 | ||
293d9a41 | 198 | END (__rawmemchr) |
f140a0d5 | 199 | |
/* NOTE(review): weak_alias and libc_hidden_builtin_def are macros that
   are not defined in this file.  Judging by their names and arguments,
   the first presumably makes rawmemchr a weak alias of __rawmemchr and
   the second presumably sets up a library-internal hidden definition
   of __rawmemchr -- confirm against the macro definitions.  */
293d9a41 | 200 | weak_alias (__rawmemchr, rawmemchr) |
f140a0d5 | 201 | libc_hidden_builtin_def (__rawmemchr) |