git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/rawmemchr.S
Improve 64 bit memchr, memrchr, rawmemchr with SSE2
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / rawmemchr.S
1 /* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
2 Contributed by Ulrich Drepper <drepper@redhat.com>.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
19
20 #include <sysdep.h>
21 #include <init-arch.h>
22
23
24 /* Define multiple versions only for the definition in lib. */
25 #ifndef NOT_IN_libc
/* IFUNC resolver for rawmemchr.  The symbol is typed as a GNU indirect
   function, and every path below returns in %rax the address of the
   implementation to use:
     - __rawmemchr_sse2 when the Prefer_PMINUB_for_stringop feature bit
       is set or the SSE4.2 CPUID bit is clear,
     - __rawmemchr_sse42 otherwise.  */
26 .text
27 ENTRY(rawmemchr)
28 .type rawmemchr, @gnu_indirect_function
/* A zero word at __cpu_features+KIND_OFFSET triggers a call to
   __init_cpu_features.  Presumably zero means the feature table has not
   been filled in yet -- confirm against init-arch.h.  */
29 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
30 jne 1f
31 call __init_cpu_features
/* Prefer_PMINUB_for_stringop set: take the SSE2 version without even
   looking at the SSE4.2 bit.  */
32 1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
33 jnz 2f
/* SSE4.2 bit clear: fall back to the SSE2 version.  */
34 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
35 jz 2f
/* RIP-relative address of the SSE4.2 version is the resolver result.  */
36 leaq __rawmemchr_sse42(%rip), %rax
37 ret
/* Fallback result: RIP-relative address of the SSE2 version.  */
38 2: leaq __rawmemchr_sse2(%rip), %rax
39 ret
40
41 END(rawmemchr)
/* strong_alias is defined elsewhere.  It is expected to make __rawmemchr
   a second name for this resolver -- confirm against the macro.  */
42 strong_alias (rawmemchr, __rawmemchr)
43
44
/* __rawmemchr_sse42: SSE4.2 search for a byte with no length limit.

   In:   %rdi = address to start searching from
         %esi = byte to look for (only the low 8 bits are used)
   Out:  %rax = address of the first matching byte at or after %rdi
   Uses: %rcx, %rdx, %rsi, %rdi, %xmm0, %xmm1, flags
         (plus whatever CALL_MCOUNT expands to, which is not visible here)

   The first 16-byte block is handled with an aligned load and a mask that
   discards matches located before the start address.  All later blocks
   are scanned with pcmpestri.  The loop exits only when a match is found,
   so the caller must guarantee that the byte is present.  */
45 .section .text.sse4.2,"ax",@progbits
46 .align 16
47 .type __rawmemchr_sse42, @function
48 __rawmemchr_sse42:
49 cfi_startproc
50 CALL_MCOUNT
51 movd %esi, %xmm1 /* low dword of xmm1 = search value */
52 movq %rdi, %rcx /* keep the unaligned start address */
53 punpcklbw %xmm1, %xmm1 /* search byte now in bytes 0-1 */
54 andq $~15, %rdi /* round the pointer down to a 16-byte boundary */
55 punpcklbw %xmm1, %xmm1 /* search byte now in bytes 0-3 */
56 orl $0xffffffff, %esi /* esi = all ones, becomes the validity mask */
57 movdqa (%rdi), %xmm0 /* aligned load of the block containing the start */
58 pshufd $0, %xmm1, %xmm1 /* broadcast dword 0: all 16 bytes = search byte */
59 subq %rdi, %rcx /* rcx = start offset within the block, 0..15 */
60 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte position that matches */
61 shl %cl, %esi /* clear the mask bits for bytes before the start */
62 pmovmskb %xmm0, %ecx /* one bit per byte: match bitmap of the block */
/* Explicit operand lengths for pcmpestri: %eax is the length of the
   register operand and %edx the length of the memory operand, both a
   full 16 bytes.  They stay unchanged for the whole loop.  */
63 movl $16, %eax
64 movl $16, %edx
65 andl %esi, %ecx /* drop matches that lie before the start address */
66 jnz 1f /* match inside the first block */
67
/* Main loop.  Immediate 0x08 selects unsigned bytes, the equal-each
   comparison, positive polarity and least-significant index.  pcmpestri
   leaves the index of the first matching byte in %ecx and sets CF when
   any byte matched.  leaq advances the pointer without touching the
   flags, so jnc still tests the CF produced by pcmpestri.  */
68 2: pcmpestri $0x08, 16(%rdi), %xmm1
69 leaq 16(%rdi), %rdi
70 jnc 2b
71
/* %rdi already points at the block that matched, %rcx is the index
   within it.  */
72 leaq (%rdi,%rcx), %rax
73 ret
74
/* Match in the first block: the lowest set bit of the masked bitmap is
   the offset from the aligned base in %rdi.  */
75 1: bsfl %ecx, %eax
76 addq %rdi, %rax
77 ret
78 cfi_endproc
79 .size __rawmemchr_sse42, .-__rawmemchr_sse42
80
81
/* Replace ENTRY so that it ignores its argument and always opens a
   16-byte aligned function named __rawmemchr_sse2, followed by
   cfi_startproc and CALL_MCOUNT.  The intent appears to be that the
   generic ../rawmemchr.S included at the end of this file is assembled
   under that name -- this assumes the included file brackets its code
   with ENTRY and END, which is not visible from here.  */
82 # undef ENTRY
83 # define ENTRY(name) \
84 .type __rawmemchr_sse2, @function; \
85 .align 16; \
86 __rawmemchr_sse2: cfi_startproc; \
87 CALL_MCOUNT
/* Matching END: closes the CFI region and records the size of
   __rawmemchr_sse2, again ignoring the macro argument.  */
88 # undef END
89 # define END(name) \
90 cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
91 # undef libc_hidden_builtin_def
92 /* It doesn't make sense to send libc-internal rawmemchr calls through a PLT.
93 The speedup we get from using SSE4.2 instructions is likely eaten away
94 by the indirect call in the PLT.  For that reason the definition below
   makes __GI___rawmemchr a global symbol equal to __rawmemchr_sse2,
   whatever name is passed to the macro.  */
95 # define libc_hidden_builtin_def(name) \
96 .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
97 #endif
98
99 #include "../rawmemchr.S"