x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S

author Noah Goldstein <goldstein.w.n@gmail.com>

Tue, 12 Jul 2022 19:29:05 +0000 (12:29 -0700)

committer Noah Goldstein <goldstein.w.n@gmail.com>

Wed, 13 Jul 2022 21:55:31 +0000 (14:55 -0700)
author Noah Goldstein <goldstein.w.n@gmail.com>
Tue, 12 Jul 2022 19:29:05 +0000 (12:29 -0700)
committer Noah Goldstein <goldstein.w.n@gmail.com>
Wed, 13 Jul 2022 21:55:31 +0000 (14:55 -0700)
diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S

new file mode 100644 (file)

index 0000000..2b7b879
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S

new file mode 100644 (file)

index 0000000..0cc5bec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S

index 992f7000777dd28f8cb2e3877d8b0763a1c34334..f7767ca543684d5da89f5b44d01da0b3e5cc7379 100644 (file)
--- a/sysdeps/x86_64/multiarch/strchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -16,13 +16,172 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+#  define STRCHR __strchr_sse2
+# endif
  
-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+       .text
+ENTRY (STRCHR)        /* Search the string at %rdi for the byte in %sil; with AS_STRCHRNUL the first hit (byte or NUL) is returned unchecked.  */
+       movd    %esi, %xmm1        /* Low dword of %xmm1 = search byte argument.  */
+       movl    %edi, %eax
+       andl    $4095, %eax        /* %eax = low 12 bits of the pointer.  */
+       punpcklbw %xmm1, %xmm1
+       cmpl    $4032, %eax        /* 4032 = 4096 - 64.  */
+       punpcklwd %xmm1, %xmm1
+       pshufd  $0, %xmm1, %xmm1   /* %xmm1 = search byte replicated into all 16 lanes.  */
+       jg      L(cross_page)      /* Offset > 4032: 64 bytes starting at %rdi would run past the 4096-byte boundary.  */
+       movdqu  (%rdi), %xmm0      /* Unaligned load of bytes 0..15.  */
+       pxor    %xmm3, %xmm3       /* %xmm3 = 0, used to detect NUL bytes.  */
+       movdqa  %xmm0, %xmm4
+       pcmpeqb %xmm1, %xmm0       /* 0xff where byte == search byte.  */
+       pcmpeqb %xmm3, %xmm4       /* 0xff where byte == NUL.  */
+       por     %xmm4, %xmm0
+       pmovmskb %xmm0, %eax       /* Bit i set if byte i is the search byte or NUL.  */
+       test    %eax, %eax
+       je      L(next_48_bytes)   /* Nothing in the first 16 bytes.  */
+       bsf     %eax, %eax         /* Index of the first hit.  */
+# ifdef AS_STRCHRNUL
+       leaq    (%rdi,%rax), %rax
+# else
+       movl    $0, %edx
+       leaq    (%rdi,%rax), %rax
+       cmpb    %sil, (%rax)
+       cmovne  %rdx, %rax         /* First hit is not the search byte (so it was NUL): return 0.  */
+# endif
+       ret
+
+       .p2align 3
+L(next_48_bytes):
+       movdqu  16(%rdi), %xmm0    /* Bytes 16..31.  */
+       movdqa  %xmm0, %xmm4
+       pcmpeqb %xmm1, %xmm0
+       pcmpeqb %xmm3, %xmm4
+       por     %xmm4, %xmm0
+       pmovmskb %xmm0, %ecx
+       movdqu  32(%rdi), %xmm0    /* Bytes 32..47.  */
+       movdqa  %xmm0, %xmm4
+       pcmpeqb %xmm1, %xmm0
+       salq    $16, %rcx          /* Mask for bytes 16..31 moves to bits 16..31.  */
+       pcmpeqb %xmm3, %xmm4
+       por     %xmm4, %xmm0
+       pmovmskb %xmm0, %eax
+       movdqu  48(%rdi), %xmm0    /* Bytes 48..63.  */
+       pcmpeqb %xmm0, %xmm3       /* %xmm3 (zero) becomes the NUL mask for bytes 48..63.  */
+       salq    $32, %rax          /* Mask for bytes 32..47 moves to bits 32..47.  */
+       pcmpeqb %xmm1, %xmm0
+       orq     %rcx, %rax
+       por     %xmm3, %xmm0
+       pmovmskb %xmm0, %ecx
+       salq    $48, %rcx          /* Mask for bytes 48..63 moves to bits 48..63.  */
+       orq     %rcx, %rax         /* %rax = hit mask for bytes 16..63 relative to %rdi.  */
+       testq   %rax, %rax
+       jne     L(return)
+L(loop_start):
+       /* We use this alignment to force the loop to be aligned to 8 but
+          not 16 bytes.  This gives better scheduling on AMD processors.  */
+       .p2align 4
+       pxor    %xmm6, %xmm6       /* %xmm6 = 0 for the zero tests below.  */
+       andq    $-64, %rdi         /* Round %rdi down to a 64-byte boundary.  */
+       .p2align 3
+L(loop64):
+       addq    $64, %rdi          /* Advance to the next 64-byte-aligned block.  */
+       movdqa  (%rdi), %xmm5
+       movdqa  16(%rdi), %xmm2
+       movdqa  32(%rdi), %xmm3
+       pxor    %xmm1, %xmm5       /* Byte becomes 0 where it equals the search byte.  */
+       movdqa  48(%rdi), %xmm4
+       pxor    %xmm1, %xmm2
+       pxor    %xmm1, %xmm3
+       pminub  (%rdi), %xmm5      /* min (x ^ c, x) is 0 iff x == c or x == NUL.  */
+       pxor    %xmm1, %xmm4
+       pminub  16(%rdi), %xmm2
+       pminub  32(%rdi), %xmm3
+       pminub  %xmm2, %xmm5       /* Fold the four vectors into %xmm5.  */
+       pminub  48(%rdi), %xmm4
+       pminub  %xmm3, %xmm5
+       pminub  %xmm4, %xmm5
+       pcmpeqb %xmm6, %xmm5       /* 0xff wherever the folded minimum is 0.  */
+       pmovmskb %xmm5, %eax
+
+       testl   %eax, %eax
+       je      L(loop64)          /* No search byte or NUL in this 64-byte block.  */
  
-#include "../strchr.S"
+       movdqa  (%rdi), %xmm5      /* %xmm5 was overwritten by the fold; redo bytes 0..15 from memory.  */
+       movdqa  %xmm5, %xmm0
+       pcmpeqb %xmm1, %xmm5
+       pcmpeqb %xmm6, %xmm0
+       por     %xmm0, %xmm5
+       pcmpeqb %xmm6, %xmm2       /* %xmm2..%xmm4 still hold min (x ^ c, x); turn them into hit masks.  */
+       pcmpeqb %xmm6, %xmm3
+       pcmpeqb %xmm6, %xmm4
+
+       pmovmskb %xmm5, %ecx       /* Bytes 0..15.  */
+       pmovmskb %xmm2, %eax       /* Bytes 16..31.  */
+       salq    $16, %rax
+       pmovmskb %xmm3, %r8d       /* Bytes 32..47.  */
+       pmovmskb %xmm4, %edx       /* Bytes 48..63.  */
+       salq    $32, %r8
+       orq     %r8, %rax
+       orq     %rcx, %rax
+       salq    $48, %rdx
+       orq     %rdx, %rax         /* %rax = 64-bit hit mask for the block at %rdi.  */
+       .p2align 3
+L(return):
+       bsfq    %rax, %rax         /* Bit index of the first hit = byte offset from %rdi.  */
+# ifdef AS_STRCHRNUL
+       leaq    (%rdi,%rax), %rax
+# else
+       movl    $0, %edx
+       leaq    (%rdi,%rax), %rax
+       cmpb    %sil, (%rax)
+       cmovne  %rdx, %rax         /* First hit is not the search byte (so it was NUL): return 0.  */
+# endif
+       ret
+       .p2align 4
+
+L(cross_page):
+       movq    %rdi, %rdx
+       pxor    %xmm2, %xmm2       /* %xmm2 = 0 for NUL detection.  */
+       andq    $-64, %rdx         /* %rdx = %rdi rounded down to a 64-byte boundary.  */
+       movdqa  %xmm1, %xmm0       /* Spare copy of the broadcast search byte for the last compare.  */
+       movdqa  (%rdx), %xmm3      /* Aligned loads from the 64-byte block containing %rdi.  */
+       movdqa  %xmm3, %xmm4
+       pcmpeqb %xmm1, %xmm3
+       pcmpeqb %xmm2, %xmm4
+       por     %xmm4, %xmm3
+       pmovmskb %xmm3, %r8d       /* Block bytes 0..15.  */
+       movdqa  16(%rdx), %xmm3
+       movdqa  %xmm3, %xmm4
+       pcmpeqb %xmm1, %xmm3
+       pcmpeqb %xmm2, %xmm4
+       por     %xmm4, %xmm3
+       pmovmskb %xmm3, %eax       /* Block bytes 16..31.  */
+       movdqa  32(%rdx), %xmm3
+       movdqa  %xmm3, %xmm4
+       pcmpeqb %xmm1, %xmm3
+       salq    $16, %rax
+       pcmpeqb %xmm2, %xmm4
+       por     %xmm4, %xmm3
+       pmovmskb %xmm3, %r9d       /* Block bytes 32..47.  */
+       movdqa  48(%rdx), %xmm3
+       pcmpeqb %xmm3, %xmm2       /* %xmm2 (zero) becomes the NUL mask for block bytes 48..63.  */
+       salq    $32, %r9
+       pcmpeqb %xmm3, %xmm0       /* %xmm0 becomes the search-byte mask for block bytes 48..63.  */
+       orq     %r9, %rax
+       orq     %r8, %rax
+       por     %xmm2, %xmm0
+       pmovmskb %xmm0, %ecx
+       salq    $48, %rcx
+       orq     %rcx, %rax         /* %rax = 64-bit hit mask for the aligned block at %rdx.  */
+       movl    %edi, %ecx
+       subb    %dl, %cl           /* %cl = %rdi - %rdx, the start offset inside the block (0..63).  */
+       shrq    %cl, %rax          /* Discard hits before the string start; bit 0 now maps to %rdi.  */
+       testq   %rax, %rax
+       jne     L(return)
+       jmp     L(loop_start)      /* Nothing from %rdi to the end of this block: enter the aligned loop.  */
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S

index f91c67036943d1a497db5282899afacdf90cf13a..7238977a21b46f655ba73933a639aa72ab6730d2 100644 (file)
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@
     <https://www.gnu.org/licenses/>.  */
  
  #if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+#  define STRCHR       __strchrnul_sse2
+# endif
  #endif
  
-#include "../strchrnul.S"
+#define AS_STRCHRNUL
+
+#include "strchr-sse2.S"
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S

index dda7c0431d16aa2e06ac472b163d3ac84b67ec12..77c956c92ce5ade4b255a9221cc91378a2fe9fdf 100644 (file)
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -17,171 +17,8 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-#include <sysdep.h>
  
-       .text
-ENTRY (strchr)
-       movd    %esi, %xmm1
-       movl    %edi, %eax
-       andl    $4095, %eax
-       punpcklbw %xmm1, %xmm1
-       cmpl    $4032, %eax
-       punpcklwd %xmm1, %xmm1
-       pshufd  $0, %xmm1, %xmm1
-       jg      L(cross_page)
-       movdqu  (%rdi), %xmm0
-       pxor    %xmm3, %xmm3
-       movdqa  %xmm0, %xmm4
-       pcmpeqb %xmm1, %xmm0
-       pcmpeqb %xmm3, %xmm4
-       por     %xmm4, %xmm0
-       pmovmskb %xmm0, %eax
-       test    %eax, %eax
-       je      L(next_48_bytes)
-       bsf     %eax, %eax
-#ifdef AS_STRCHRNUL
-       leaq    (%rdi,%rax), %rax
-#else
-       movl    $0, %edx
-       leaq    (%rdi,%rax), %rax
-       cmpb    %sil, (%rax)
-       cmovne  %rdx, %rax
-#endif
-       ret
-
-       .p2align 3
-       L(next_48_bytes):
-       movdqu  16(%rdi), %xmm0
-       movdqa  %xmm0, %xmm4
-       pcmpeqb %xmm1, %xmm0
-       pcmpeqb %xmm3, %xmm4
-       por     %xmm4, %xmm0
-       pmovmskb %xmm0, %ecx
-       movdqu  32(%rdi), %xmm0
-       movdqa  %xmm0, %xmm4
-       pcmpeqb %xmm1, %xmm0
-       salq    $16, %rcx
-       pcmpeqb %xmm3, %xmm4
-       por     %xmm4, %xmm0
-       pmovmskb %xmm0, %eax
-       movdqu  48(%rdi), %xmm0
-       pcmpeqb %xmm0, %xmm3
-       salq    $32, %rax
-       pcmpeqb %xmm1, %xmm0
-       orq     %rcx, %rax
-       por     %xmm3, %xmm0
-       pmovmskb %xmm0, %ecx
-       salq    $48, %rcx
-       orq     %rcx, %rax
-       testq   %rax, %rax
-       jne     L(return)
-L(loop_start):
-       /* We use this alignment to force loop be aligned to 8 but not
-          16 bytes.  This gives better sheduling on AMD processors.  */
-       .p2align 4
-       pxor    %xmm6, %xmm6
-       andq    $-64, %rdi
-       .p2align 3
-L(loop64):
-       addq    $64, %rdi
-       movdqa  (%rdi), %xmm5
-       movdqa  16(%rdi), %xmm2
-       movdqa  32(%rdi), %xmm3
-       pxor    %xmm1, %xmm5
-       movdqa  48(%rdi), %xmm4
-       pxor    %xmm1, %xmm2
-       pxor    %xmm1, %xmm3
-       pminub  (%rdi), %xmm5
-       pxor    %xmm1, %xmm4
-       pminub  16(%rdi), %xmm2
-       pminub  32(%rdi), %xmm3
-       pminub  %xmm2, %xmm5
-       pminub  48(%rdi), %xmm4
-       pminub  %xmm3, %xmm5
-       pminub  %xmm4, %xmm5
-       pcmpeqb %xmm6, %xmm5
-       pmovmskb %xmm5, %eax
-
-       testl   %eax, %eax
-       je      L(loop64)
-
-       movdqa  (%rdi), %xmm5
-       movdqa  %xmm5, %xmm0
-       pcmpeqb %xmm1, %xmm5
-       pcmpeqb %xmm6, %xmm0
-       por     %xmm0, %xmm5
-       pcmpeqb %xmm6, %xmm2
-       pcmpeqb %xmm6, %xmm3
-       pcmpeqb %xmm6, %xmm4
-
-       pmovmskb %xmm5, %ecx
-       pmovmskb %xmm2, %eax
-       salq    $16, %rax
-       pmovmskb %xmm3, %r8d
-       pmovmskb %xmm4, %edx
-       salq    $32, %r8
-       orq     %r8, %rax
-       orq     %rcx, %rax
-       salq    $48, %rdx
-       orq     %rdx, %rax
-       .p2align 3
-L(return):
-       bsfq    %rax, %rax
-#ifdef AS_STRCHRNUL
-       leaq    (%rdi,%rax), %rax
-#else
-       movl    $0, %edx
-       leaq    (%rdi,%rax), %rax
-       cmpb    %sil, (%rax)
-       cmovne  %rdx, %rax
-#endif
-       ret
-       .p2align 4
-
-L(cross_page):
-       movq    %rdi, %rdx
-       pxor    %xmm2, %xmm2
-       andq    $-64, %rdx
-       movdqa  %xmm1, %xmm0
-       movdqa  (%rdx), %xmm3
-       movdqa  %xmm3, %xmm4
-       pcmpeqb %xmm1, %xmm3
-       pcmpeqb %xmm2, %xmm4
-       por     %xmm4, %xmm3
-       pmovmskb %xmm3, %r8d
-       movdqa  16(%rdx), %xmm3
-       movdqa  %xmm3, %xmm4
-       pcmpeqb %xmm1, %xmm3
-       pcmpeqb %xmm2, %xmm4
-       por     %xmm4, %xmm3
-       pmovmskb %xmm3, %eax
-       movdqa  32(%rdx), %xmm3
-       movdqa  %xmm3, %xmm4
-       pcmpeqb %xmm1, %xmm3
-       salq    $16, %rax
-       pcmpeqb %xmm2, %xmm4
-       por     %xmm4, %xmm3
-       pmovmskb %xmm3, %r9d
-       movdqa  48(%rdx), %xmm3
-       pcmpeqb %xmm3, %xmm2
-       salq    $32, %r9
-       pcmpeqb %xmm3, %xmm0
-       orq     %r9, %rax
-       orq     %r8, %rax
-       por     %xmm2, %xmm0
-       pmovmskb %xmm0, %ecx
-       salq    $48, %rcx
-       orq     %rcx, %rax
-       movl    %edi, %ecx
-       subb    %dl, %cl
-       shrq    %cl, %rax
-       testq   %rax, %rax
-       jne     L(return)
-       jmp     L(loop_start)
-
-END (strchr)
-
-#ifndef AS_STRCHRNUL
+#define STRCHR strchr
+#include "multiarch/strchr-sse2.S"
  weak_alias (strchr, index)
  libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S

index ec2e652e2510cc0c0af7e0bd14a0f62853aec32c..508e42db26a9d10d9c11e7b842455a673d466935 100644 (file)
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -18,10 +18,7 @@
     License along with the GNU C Library; if not, see
     <https://www.gnu.org/licenses/>.  */
  
-#include <sysdep.h>
-
-#define strchr __strchrnul
-#define AS_STRCHRNUL
-#include "strchr.S"
+#define STRCHR __strchrnul
+#include "multiarch/strchrnul-sse2.S"
  
  weak_alias (__strchrnul, strchrnul)
author	Noah Goldstein <goldstein.w.n@gmail.com>
	Tue, 12 Jul 2022 19:29:05 +0000 (12:29 -0700)
committer	Noah Goldstein <goldstein.w.n@gmail.com>
	Wed, 13 Jul 2022 21:55:31 +0000 (14:55 -0700)
sysdeps/x86_64/multiarch/rtld-strchr.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/rtld-strchrnul.S	[new file with mode: 0644]	patch \| blob
sysdeps/x86_64/multiarch/strchr-sse2.S		patch \| blob \| blame \| history
sysdeps/x86_64/multiarch/strchrnul-sse2.S		patch \| blob \| blame \| history
sysdeps/x86_64/strchr.S		patch \| blob \| blame \| history
sysdeps/x86_64/strchrnul.S		patch \| blob \| blame \| history