1 /* Optimized memrchr with sse2
2 Copyright (C) 2011-2014 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
/* Record a 4-byte push in the CFI unwind info: the CFA moves down by
   4 bytes and REG is saved at the new stack top (offset 0 from the
   CFA).  Pair with CFI_POP.  */
24 # define CFI_PUSH(REG) \
25 cfi_adjust_cfa_offset (4); \
26 cfi_rel_offset (REG, 0)
/* Record a 4-byte pop in the CFI unwind info: the CFA moves back up by
   4 bytes and REG reverts to its entry-time rule.  The cfi_restore
   line was missing here, leaving a dangling '\' continuation that
   would splice the following #define into this macro; restored to
   mirror CFI_PUSH.  */
28 # define CFI_POP(REG) \
29 cfi_adjust_cfa_offset (-4); \
30 cfi_restore (REG)
/* pushl/popl wrappers that keep the CFI unwind description in sync
   with the actual stack adjustment -- use these instead of bare
   pushl/popl inside the function body.  */
32 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
33 # define POP(REG) popl REG; CFI_POP (REG)
/* Assemble the shared body below under the SSE2/BSF variant name,
   presumably selected at runtime by the multiarch/ifunc machinery --
   confirm against the surrounding #ifdef block (not visible here).
   NOTE(review): the macro is spelled MEMCHR although the routine is
   memrchr; this matches the upstream naming convention, so do not
   rename without checking the includer.  */
40 # define MEMCHR __memrchr_sse2_bsf
/* NOTE(review): non-contiguous excerpt of the memrchr body -- the
   ENTRY/END markers, the pmovmskb/bsr mask handling, and most branch
   tests between the lines below are outside this view.  Comments
   state only what the visible instructions themselves establish.  */

/* Load the search character (second stack argument) and broadcast it
   to all 16 byte lanes of xmm1: two punpcklbw's widen the byte to a
   replicated dword, then pshufd $0 copies that dword register-wide.  */
45 movd STR2(%esp), %xmm1
51 punpcklbw %xmm1, %xmm1
53 punpcklbw %xmm1, %xmm1
56 pshufd $0, %xmm1, %xmm1
59 /* Check if there is a match. */
75 /* Loop start on aligned string. */
/* Aligned 16-byte loads covering a 64-byte chunk based at %ecx.
   Offset 48 is read first -- consistent with memrchr examining each
   chunk from its high end toward its low end.  */
80 movdqa 48(%ecx), %xmm0
86 movdqa 32(%ecx), %xmm2
92 movdqa 16(%ecx), %xmm3
/* Second copy of the same chunk-load pattern; presumably an unrolled
   or alignment-specialised path -- the distinguishing control flow is
   outside this excerpt.  */
108 movdqa 48(%ecx), %xmm0
114 movdqa 32(%ecx), %xmm2
120 movdqa 16(%ecx), %xmm3
/* Here the loads use distinct destinations (xmm2..xmm4), so several
   16-byte pieces of the chunk stay live simultaneously.  */
149 movdqa 16(%ecx), %xmm2
150 movdqa 32(%ecx), %xmm3
151 movdqa 48(%ecx), %xmm4
174 movdqa 16(%ecx), %xmm2
/* Byte-equality compare of the chunk at (%ecx) against the broadcast
   pattern.  NOTE(review): the destination is xmm1, i.e. the pattern
   register itself is clobbered -- it must be re-broadcast before any
   later compare (a re-broadcast sequence is visible further down).  */
177 pcmpeqb (%ecx), %xmm1
195 movdqa 48(%ecx), %xmm0
201 movdqa 32(%ecx), %xmm2
207 movdqa 16(%ecx), %xmm3
215 pcmpeqb (%ecx), %xmm1
224 movdqa 48(%ecx), %xmm0
232 pcmpeqb 32(%ecx), %xmm1
/* Match-address reconstruction: %eax presumably holds the bit index
   produced by bsf/bsr on the compare mask (set outside this excerpt);
   add the chunk base %ecx plus the 16/32/48 offset of the 16-byte
   piece that matched -- confirm in the full file.  */
248 lea 16(%eax, %ecx), %eax
254 lea 32(%eax, %ecx), %eax
260 lea 48(%eax, %ecx), %eax
/* Same effective-address computation with base and index swapped in
   the lea operand -- identical result, different encoding.  */
278 lea 16(%ecx, %eax), %eax
287 lea 32(%ecx, %eax), %eax
296 lea 48(%ecx, %eax), %eax
/* Tail handling for remainders shorter than 16 bytes; %eax is used as
   the compare base pointer here.  */
305 L(length_less16_offset0):
307 pcmpeqb (%eax), %xmm1
/* Re-broadcast the search byte into xmm1 (the pcmpeqb above destroyed
   it), then branch on a test whose setup lies outside this excerpt.  */
326 punpcklbw %xmm1, %xmm1
328 punpcklbw %xmm1, %xmm1
332 pshufd $0, %xmm1, %xmm1
334 jz L(length_less16_offset0)
342 ja L(length_less16_part2)
344 pcmpeqb (%eax), %xmm1
/* 16..31-byte tail: additionally examine the second 16-byte piece at
   %eax + 16.  */
367 L(length_less16_part2):
368 movdqa 16(%eax), %xmm2
382 jnz L(length_less16_part2_return)
384 pcmpeqb (%eax), %xmm1
/* Return path: final pointer = %eax + 16 + %edi; %edi presumably
   carries the bsf/bsr bit index within the matching piece -- TODO
   confirm against the full file.  */
402 L(length_less16_part2_return):
404 lea 16(%eax, %edi), %eax