]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/i386/i586/strchr.S
2.5-18.1
[thirdparty/glibc.git] / sysdeps / i386 / i586 / strchr.S
CommitLineData
5929563f 1/* Find character CH in a NUL terminated string.
6d52618b 2 Highly optimized version for ix85, x>=5.
0ecb606c 3 Copyright (C) 1995,1996,1997,2000,2003,2005 Free Software Foundation, Inc.
6d52618b
UD
4 This file is part of the GNU C Library.
5 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
6
7 The GNU C Library is free software; you can redistribute it and/or
41bdb6e2
AJ
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
6d52618b
UD
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
41bdb6e2 15 Lesser General Public License for more details.
6d52618b 16
41bdb6e2
AJ
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, write to the Free
19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20 02111-1307 USA. */
8f5ca04b
RM
21
22#include <sysdep.h>
5929563f 23#include "asm-syntax.h"
2fc08826 24#include "bp-sym.h"
3f02f778 25#include "bp-asm.h"
8f5ca04b
RM
26
27/* This version is especially optimized for the i586 (and following?)
28 processors. This is mainly done by using the two pipelines. The
29 version optimized for i486 is weak in this aspect because to get
6d52618b 30 as much parallelism we have to execute some *more* instructions.
8f5ca04b
RM
31
32 The code below is structured to reflect the pairing of the instructions
33 as *I think* it is. I have no processor data book to verify this.
34 If you find something you think is incorrect let me know. */
35
36
37/* The magic value which is used throughout in the whole code. */
38#define magic 0xfefefeff
39
3f02f778
GM
40#define PARMS LINKAGE+16 /* space for 4 saved regs */
41#define RTN PARMS
42#define STR RTN+RTN_SIZE
43#define CHR STR+PTR_SIZE
8f5ca04b
RM
44
45 .text
2fc08826 46ENTRY (BP_SYM (strchr))
3f02f778
GM
47 ENTER
48
8f5ca04b 49 pushl %edi /* Save callee-safe registers. */
0ecb606c 50 cfi_adjust_cfa_offset (-4)
8f5ca04b 51 pushl %esi
0ecb606c 52 cfi_adjust_cfa_offset (-4)
8f5ca04b
RM
53
54 pushl %ebx
0ecb606c 55 cfi_adjust_cfa_offset (-4)
8f5ca04b 56 pushl %ebp
0ecb606c 57 cfi_adjust_cfa_offset (-4)
8f5ca04b 58
3f02f778
GM
59 movl STR(%esp), %eax
60 movl CHR(%esp), %edx
2fc08826 61 CHECK_BOUNDS_LOW (%eax, STR(%esp))
8f5ca04b
RM
62
63 movl %eax, %edi /* duplicate string pointer for later */
0ecb606c 64 cfi_rel_offset (edi, 12)
8f5ca04b
RM
65 xorl %ecx, %ecx /* clear %ecx */
66
67 /* At the moment %edx contains C. What we need for the
68 algorithm is C in all bytes of the dword. Avoid
69 operations on 16 bit words because these require an
70 prefix byte (and one more cycle). */
71 movb %dl, %dh /* now it is 0|0|c|c */
72 movb %dl, %cl /* we construct the lower half in %ecx */
73
74 shll $16, %edx /* now %edx is c|c|0|0 */
75 movb %cl, %ch /* now %ecx is 0|0|c|c */
76
77 orl %ecx, %edx /* and finally c|c|c|c */
78 andl $3, %edi /* mask alignment bits */
79
5929563f 80 jz L(11) /* alignment is 0 => start loop */
8f5ca04b 81
11336c16 82 movb %dl, %cl /* 0 is needed below */
5929563f 83 jp L(0) /* exactly two bits set */
8f5ca04b 84
11336c16 85 xorb (%eax), %cl /* is byte the one we are looking for? */
5929563f 86 jz L(2) /* yes => return pointer */
8f5ca04b 87
11336c16 88 xorb %dl, %cl /* load single byte and test for NUL */
5929563f 89 je L(3) /* yes => return NULL */
8f5ca04b 90
11336c16
UD
91 movb 1(%eax), %cl /* load single byte */
92 incl %eax
8f5ca04b 93
8f5ca04b 94 cmpb %cl, %dl /* is byte == C? */
5929563f 95 je L(2) /* aligned => return pointer */
8f5ca04b 96
3e2ee727 97 cmpb $0, %cl /* is byte NUL? */
5929563f 98 je L(3) /* yes => return NULL */
8f5ca04b 99
11336c16
UD
100 incl %eax
101 decl %edi
102
5929563f 103 jne L(11)
11336c16 104
5929563f 105L(0): movb (%eax), %cl /* load single byte */
8f5ca04b 106
8f5ca04b 107 cmpb %cl, %dl /* is byte == C? */
5929563f 108 je L(2) /* aligned => return pointer */
8f5ca04b 109
3e2ee727 110 cmpb $0, %cl /* is byte NUL? */
5929563f 111 je L(3) /* yes => return NULL */
8f5ca04b
RM
112
113 incl %eax /* increment pointer */
114
0ecb606c
JJ
115 cfi_rel_offset (esi, 8)
116 cfi_rel_offset (ebx, 4)
117 cfi_rel_offset (ebp, 0)
118
8f5ca04b
RM
119 /* The following code is the preparation for the loop. The
120 four instruction up to `L1' will not be executed in the loop
121 because the same code is found at the end of the loop, but
122 there it is executed in parallel with other instructions. */
5929563f 123L(11): movl (%eax), %ecx
8f5ca04b
RM
124 movl $magic, %ebp
125
126 movl $magic, %edi
127 addl %ecx, %ebp
128
129 /* The main loop: it looks complex and indeed it is. I would
130 love to say `it was hard to write, so it should he hard to
131 read' but I will give some more hints. To fully understand
132 this code you should first take a look at the i486 version.
133 The basic algorithm is the same, but here the code organized
134 in a way which permits to use both pipelines all the time.
135
136 I tried to make it a bit more understandable by indenting
137 the code according to stage in the algorithm. It goes as
138 follows:
139 check for 0 in 1st word
140 check for C in 1st word
141 check for 0 in 2nd word
142 check for C in 2nd word
143 check for 0 in 3rd word
144 check for C in 3rd word
145 check for 0 in 4th word
146 check for C in 4th word
147
148 Please note that doing the test for NUL before the test for
149 C allows us to overlap the test for 0 in the next word with
150 the test for C. */
151
5929563f 152L(1): xorl %ecx, %ebp /* (word^magic) */
8f5ca04b
RM
153 addl %ecx, %edi /* add magic word */
154
155 leal 4(%eax), %eax /* increment pointer */
5929563f 156 jnc L(4) /* previous addl caused overflow? */
8f5ca04b
RM
157
158 movl %ecx, %ebx /* duplicate original word */
159 orl $magic, %ebp /* (word^magic)|magic */
160
161 addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */
5929563f 162 jne L(4) /* yes => we found word with NUL */
8f5ca04b
RM
163
164 movl $magic, %esi /* load magic value */
165 xorl %edx, %ebx /* clear words which are C */
166
167 movl (%eax), %ecx
168 addl %ebx, %esi /* (word+magic) */
169
170 movl $magic, %edi
5929563f 171 jnc L(5) /* previous addl caused overflow? */
8f5ca04b
RM
172
173 movl %edi, %ebp
174 xorl %ebx, %esi /* (word+magic)^word */
175
176 addl %ecx, %ebp
177 orl $magic, %esi /* ((word+magic)^word)|magic */
178
179 addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/
5929563f 180 jne L(5) /* yes => we found word with C */
8f5ca04b
RM
181
182 xorl %ecx, %ebp
183 addl %ecx, %edi
184
185 leal 4(%eax), %eax
5929563f 186 jnc L(4)
8f5ca04b
RM
187
188 movl %ecx, %ebx
189 orl $magic, %ebp
190
191 addl $1, %ebp
5929563f 192 jne L(4)
8f5ca04b
RM
193
194 movl $magic, %esi
195 xorl %edx, %ebx
196
197 movl (%eax), %ecx
198 addl %ebx, %esi
199
200 movl $magic, %edi
5929563f 201 jnc L(5)
8f5ca04b
RM
202
203 movl %edi, %ebp
204 xorl %ebx, %esi
205
206 addl %ecx, %ebp
207 orl $magic, %esi
208
209 addl $1, %esi
5929563f 210 jne L(5)
8f5ca04b
RM
211
212 xorl %ecx, %ebp
213 addl %ecx, %edi
214
215 leal 4(%eax), %eax
5929563f 216 jnc L(4)
8f5ca04b
RM
217
218 movl %ecx, %ebx
219 orl $magic, %ebp
220
221 addl $1, %ebp
5929563f 222 jne L(4)
8f5ca04b
RM
223
224 movl $magic, %esi
225 xorl %edx, %ebx
226
227 movl (%eax), %ecx
228 addl %ebx, %esi
229
230 movl $magic, %edi
5929563f 231 jnc L(5)
8f5ca04b
RM
232
233 movl %edi, %ebp
234 xorl %ebx, %esi
235
236 addl %ecx, %ebp
237 orl $magic, %esi
238
239 addl $1, %esi
5929563f 240 jne L(5)
8f5ca04b
RM
241
242 xorl %ecx, %ebp
243 addl %ecx, %edi
244
245 leal 4(%eax), %eax
5929563f 246 jnc L(4)
8f5ca04b
RM
247
248 movl %ecx, %ebx
249 orl $magic, %ebp
250
251 addl $1, %ebp
5929563f 252 jne L(4)
8f5ca04b
RM
253
254 movl $magic, %esi
255 xorl %edx, %ebx
256
257 movl (%eax), %ecx
258 addl %ebx, %esi
259
260 movl $magic, %edi
5929563f 261 jnc L(5)
8f5ca04b
RM
262
263 movl %edi, %ebp
264 xorl %ebx, %esi
265
266 addl %ecx, %ebp
267 orl $magic, %esi
268
269 addl $1, %esi
270
5929563f 271 je L(1)
8f5ca04b
RM
272
273 /* We know there is no NUL byte but a C byte in the word.
274 %ebx contains NUL in this particular byte. */
5929563f 275L(5): subl $4, %eax /* adjust pointer */
8f5ca04b
RM
276 testb %bl, %bl /* first byte == C? */
277
5929563f 278 jz L(2) /* yes => return pointer */
8f5ca04b
RM
279
280 incl %eax /* increment pointer */
281 testb %bh, %bh /* second byte == C? */
282
5929563f 283 jz L(2) /* yes => return pointer */
8f5ca04b
RM
284
285 shrl $16, %ebx /* make upper bytes accessible */
286 incl %eax /* increment pointer */
287
288 cmp $0, %bl /* third byte == C */
5929563f 289 je L(2) /* yes => return pointer */
8f5ca04b
RM
290
291 incl %eax /* increment pointer */
292
2fc08826
GM
293L(2): CHECK_BOUNDS_HIGH (%eax, STR(%esp), jb)
294 RETURN_BOUNDED_POINTER (STR(%esp))
0ecb606c
JJ
295L(out): popl %ebp /* restore saved registers */
296 cfi_adjust_cfa_offset (-4)
297 cfi_restore (ebp)
8f5ca04b 298 popl %ebx
0ecb606c
JJ
299 cfi_adjust_cfa_offset (-4)
300 cfi_restore (ebx)
8f5ca04b
RM
301
302 popl %esi
0ecb606c
JJ
303 cfi_adjust_cfa_offset (-4)
304 cfi_restore (esi)
8f5ca04b 305 popl %edi
0ecb606c
JJ
306 cfi_adjust_cfa_offset (-4)
307 cfi_restore (edi)
8f5ca04b 308
3f02f778
GM
309 LEAVE
310 RET_PTR
8f5ca04b 311
0ecb606c
JJ
312 cfi_adjust_cfa_offset (16)
313 cfi_rel_offset (edi, 12)
314 cfi_rel_offset (esi, 8)
315 cfi_rel_offset (ebx, 4)
316 cfi_rel_offset (ebp, 0)
8f5ca04b
RM
317 /* We know there is a NUL byte in the word. But we have to test
318 whether there is an C byte before it in the word. */
5929563f 319L(4): subl $4, %eax /* adjust pointer */
8f5ca04b
RM
320 cmpb %dl, %cl /* first byte == C? */
321
5929563f 322 je L(2) /* yes => return pointer */
8f5ca04b
RM
323
324 cmpb $0, %cl /* first byte == NUL? */
5929563f 325 je L(3) /* yes => return NULL */
8f5ca04b
RM
326
327 incl %eax /* increment pointer */
328
329 cmpb %dl, %ch /* second byte == C? */
5929563f 330 je L(2) /* yes => return pointer */
8f5ca04b
RM
331
332 cmpb $0, %ch /* second byte == NUL? */
5929563f 333 je L(3) /* yes => return NULL */
8f5ca04b
RM
334
335 shrl $16, %ecx /* make upper bytes accessible */
336 incl %eax /* increment pointer */
337
338 cmpb %dl, %cl /* third byte == C? */
5929563f 339 je L(2) /* yes => return pointer */
8f5ca04b
RM
340
341 cmpb $0, %cl /* third byte == NUL? */
5929563f 342 je L(3) /* yes => return NULL */
8f5ca04b
RM
343
344 incl %eax /* increment pointer */
345
346 /* The test four the fourth byte is necessary! */
347 cmpb %dl, %ch /* fourth byte == C? */
5929563f 348 je L(2) /* yes => return pointer */
8f5ca04b 349
2fc08826
GM
350L(3): xorl %eax, %eax
351 RETURN_NULL_BOUNDED_POINTER
0ecb606c 352 jmp L(out)
2fc08826 353END (BP_SYM (strchr))
8f5ca04b
RM
354
355#undef index
2fc08826 356weak_alias (BP_SYM (strchr), BP_SYM (index))
85dd1003 357libc_hidden_builtin_def (strchr)