]> git.ipfire.org Git - thirdparty/glibc.git/commitdiff
powerpc: POWER8 memcpy optimization for cached memory
authorAdhemerval Zanella <azanella@linux.vnet.ibm.com>
Mon, 11 Dec 2017 19:39:42 +0000 (17:39 -0200)
committerTulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Mon, 11 Dec 2017 19:39:42 +0000 (17:39 -0200)
On POWER8, unaligned memory accesses to cached memory have little impact
on performance, unlike on its predecessors.

This optimization is disabled by default and is only available when the
tunable glibc.tune.cached_memopt is set to 1.

                 __memcpy_power8_cached      __memcpy_power7
============================================================
    max-size=4096:     33325.70 ( 12.65%)        38153.00
    max-size=8192:     32878.20 ( 11.17%)        37012.30
   max-size=16384:     33782.20 ( 11.61%)        38219.20
   max-size=32768:     33296.20 ( 11.30%)        37538.30
   max-size=65536:     33765.60 ( 10.53%)        37738.40

* manual/tunables.texi (Hardware Capability Tunables): Document
glibc.tune.cached_memopt.
* sysdeps/powerpc/cpu-features.c: New file.
* sysdeps/powerpc/cpu-features.h: New file.
* sysdeps/powerpc/dl-procinfo.c [!IS_IN(ldconfig)]: Add
_dl_powerpc_cpu_features.
* sysdeps/powerpc/dl-tunables.list: New file.
* sysdeps/powerpc/ldsodefs.h: Include cpu-features.h.
* sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
(INIT_ARCH): Initialize use_cached_memopt.
* sysdeps/powerpc/powerpc64/dl-machine.h [defined(SHARED) &&
IS_IN(rtld)]: Restrict dl_platform_init availability and
initialize CPU features used by tunables.
* sysdeps/powerpc/powerpc64/multiarch/Makefile (sysdep_routines):
Add memcpy-power8-cached.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Add
__memcpy_power8_cached.
* sysdeps/powerpc/powerpc64/multiarch/memcpy.c: Likewise.
* sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S:
New file.

Reviewed-by: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
13 files changed:
ChangeLog
manual/tunables.texi
sysdeps/powerpc/cpu-features.c [new file with mode: 0644]
sysdeps/powerpc/cpu-features.h [new file with mode: 0644]
sysdeps/powerpc/dl-procinfo.c
sysdeps/powerpc/dl-tunables.list [new file with mode: 0644]
sysdeps/powerpc/ldsodefs.h
sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
sysdeps/powerpc/powerpc64/dl-machine.h
sysdeps/powerpc/powerpc64/multiarch/Makefile
sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S [new file with mode: 0644]
sysdeps/powerpc/powerpc64/multiarch/memcpy.c

index 50da0310bfdcc154949a7f0052afb52430421684..d5f7256fcf73c63ff5b0bfe26374922d3dfa33fc 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2017-12-11  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+           Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>
+
+       * manual/tunables.texi (Hardware Capability Tunables): Document
+       glibc.tune.cached_memopt.
+       * sysdeps/powerpc/cpu-features.c: New file.
+       * sysdeps/powerpc/cpu-features.h: New file.
+       * sysdeps/powerpc/dl-procinfo.c [!IS_IN(ldconfig)]: Add
+       _dl_powerpc_cpu_features.
+       * sysdeps/powerpc/dl-tunables.list: New file.
+       * sysdeps/powerpc/ldsodefs.h: Include cpu-features.h.
+       * sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
+       (INIT_ARCH): Initialize use_cached_memopt.
+       * sysdeps/powerpc/powerpc64/dl-machine.h [defined(SHARED) &&
+       IS_IN(rtld)]: Restrict dl_platform_init availability and
+       initialize CPU features used by tunables.
+       * sysdeps/powerpc/powerpc64/multiarch/Makefile (sysdep_routines):
+       Add memcpy-power8-cached.
+       * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Add
+       __memcpy_power8_cached.
+       * sysdeps/powerpc/powerpc64/multiarch/memcpy.c: Likewise.
+       * sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S:
+       New file.
+
 2017-12-11  H.J. Lu  <hongjiu.lu@intel.com>
 
        * string/Makefile (CFLAGS-inl-tester.c): Replace = with +=.
index e851b95c598cdb381af5aabf71c36fa5a761497c..6e0ee2898036fe91a5b96368f629b78abd982400 100644 (file)
@@ -319,6 +319,16 @@ the ones in @code{sysdeps/x86/cpu-features.h}.
 This tunable is specific to i386 and x86-64.
 @end deftp
 
+@deftp Tunable glibc.tune.cached_memopt
+The @code{glibc.tune.cached_memopt=[0|1]} tunable allows the user to
+enable optimizations recommended for cacheable memory.  If set to
+@code{1}, @theglibc{} assumes that the process memory image consists
+of cacheable (non-device) memory only.  The default, @code{0},
+indicates that the process may use device memory.
+
+This tunable is specific to powerpc, powerpc64 and powerpc64le.
+@end deftp
+
 @deftp Tunable glibc.tune.cpu
 The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
diff --git a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c
new file mode 100644 (file)
index 0000000..6870582
--- /dev/null
@@ -0,0 +1,39 @@
+/* Initialize cpu feature data.  PowerPC version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <cpu-features.h>
+
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+#endif
+
+static inline void
+init_cpu_features (struct cpu_features *cpu_features)
+{
+  /* Default is to use aligned memory accesses in the optimized functions
+     unless tunables are enabled, since in that case the user can explicitly
+     control the unaligned optimizations.  */
+#if HAVE_TUNABLES
+  int32_t cached_memfunc = TUNABLE_GET (glibc, tune, cached_memopt, int32_t,
+                                       NULL);
+  cpu_features->use_cached_memopt = (cached_memfunc > 0);
+#else
+  cpu_features->use_cached_memopt = false;
+#endif
+}
diff --git a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h
new file mode 100644 (file)
index 0000000..36a8bb4
--- /dev/null
@@ -0,0 +1,28 @@
+/* Initialize cpu feature data.  PowerPC version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef __CPU_FEATURES_POWERPC_H
+# define __CPU_FEATURES_POWERPC_H
+
+#include <stdbool.h>
+
+struct cpu_features
+{
+  bool use_cached_memopt;  /* True if glibc.tune.cached_memopt > 0.  */
+};
+
+#endif /* __CPU_FEATURES_POWERPC_H  */
index 55a6e78aa8e24eb2ba84e123561b6b76b5f80686..c8b14454dc3068fa89ab90507488cd5d2bd55556 100644 (file)
 # define PROCINFO_CLASS
 #endif
 
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_powerpc_cpu_features
+# else
+PROCINFO_CLASS struct cpu_features _dl_powerpc_cpu_features
+# endif
+# ifndef PROCINFO_DECL
+= { }
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_powerpc_cap_flags
 #else
diff --git a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list
new file mode 100644 (file)
index 0000000..9e14b9a
--- /dev/null
@@ -0,0 +1,28 @@
+# powerpc specific tunables.
+# Copyright (C) 2017 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+glibc {
+  tune {
+    cached_memopt {
+      type: INT_32
+      minval: 0
+      maxval: 1
+      default: 0
+    }
+  }
+}
index 466de797fc733e65ff7538e2bb686b5a77700e7d..6f8b3a2054ac44be57bc75ec360200ad122c455a 100644 (file)
@@ -20,6 +20,7 @@
 #define        _POWERPC_LDSODEFS_H     1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_ppc32_regs;
 struct La_ppc32_retval;
index f2e6a4b705c8953c658e3e42b7a84b9dd2c80880..6038941a70a6c2bd22fa756f23ebcc0e2e1dcd2d 100644 (file)
@@ -37,6 +37,8 @@
 #define INIT_ARCH() \
   unsigned long int hwcap = __GLRO(dl_hwcap);                  \
   unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \
+  bool __attribute__((unused)) use_cached_memopt =             \
+    GLRO(dl_powerpc_cpu_features).use_cached_memopt;           \
   if (hwcap & PPC_FEATURE_ARCH_2_06)                           \
     hwcap |= PPC_FEATURE_ARCH_2_05 |                           \
             PPC_FEATURE_POWER5_PLUS |                          \
index aeb91b8f692aee23261b39f8360458aa8e3f7580..76dceee80bdc979eb7232919dceb7d6a0fb85279 100644 (file)
@@ -27,6 +27,7 @@
 #include <dl-tls.h>
 #include <sysdep.h>
 #include <hwcapinfo.h>
+#include <cpu-features.c>
 
 /* Translate a processor specific dynamic tag to the index
    in l_info array.  */
@@ -300,13 +301,14 @@ BODY_PREFIX "_dl_start_user:\n"                                           \
 /* We define an initialization function to initialize HWCAP/HWCAP2 and
    platform data so it can be copied into the TCB later.  This is called
    very early in _dl_sysdep_start for dynamically linked binaries.  */
-#ifdef SHARED
+#if defined(SHARED) && IS_IN (rtld)
 # define DL_PLATFORM_INIT dl_platform_init ()
 
 static inline void __attribute__ ((unused))
 dl_platform_init (void)
 {
   __tcb_parse_hwcap_and_convert_at_platform ();
+  init_cpu_features (&GLRO(dl_powerpc_cpu_features));  /* Reads tunables.  */
 }
 #endif
 
index dea49acff5ded439312415ad8c68c05ac677985b..4df6b45c4c1c495a83bc049c422487c0dac98223 100644 (file)
@@ -1,6 +1,6 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
-                  memcpy-power4 memcpy-ppc64 \
+sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
+                  memcpy-cell memcpy-power4 memcpy-ppc64 \
                   memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
                   memset-power7 memset-power6 memset-power4 \
                   memset-ppc64 memset-power8 \
index 6a88536c9812c5e77e0f837d911fa2e50cbc39b2..77a60eaf273a6e6b2b43ba2c66416fbb3904c4b0 100644 (file)
@@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
   IFUNC_IMPL (i, name, memcpy,
+             IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __memcpy_power8_cached)
              IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
                              __memcpy_power7)
              IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S
new file mode 100644 (file)
index 0000000..e8bea91
--- /dev/null
@@ -0,0 +1,176 @@
+/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+   Returns 'dst'.  */
+
+       .machine power8
+ENTRY_TOCLESS (__memcpy_power8_cached, 5)
+       CALL_MCOUNT 3
+
+       cmpldi  cr7,r5,15
+       bgt     cr7,L(ge_16)    /* len >= 16: use 16-byte vector copies.  */
+       andi.   r9,r5,0x1       /* len < 16: copy one piece per set bit.  */
+       mr      r9,r3           /* r9 = dst cursor; r3 is never modified.  */
+       beq     cr0,1f
+       lbz     r10,0(r4)
+       addi    r9,r3,1
+       addi    r4,r4,1
+       stb     r10,0(r3)
+1:
+       andi.   r10,r5,0x2
+       beq     cr0,2f
+       lhz     r10,0(r4)
+       addi    r9,r9,2
+       addi    r4,r4,2
+       sth     r10,-2(r9)
+2:
+       andi.   r10,r5,0x4
+       beq     cr0,3f
+       lwz     r10,0(r4)
+       addi    r9,9,4          /* Second operand is register 9 (r9).  */
+       addi    r4,4,4          /* Second operand is register 4 (r4).  */
+       stw     r10,-4(r9)
+3:
+       andi.   r10,r5,0x8
+       beqlr   cr0
+       ld      r10,0(r4)
+       std     r10,0(r9)
+       blr
+
+       .align 4
+L(ge_16):
+       cmpldi  cr7,r5,32
+       ble     cr7,L(ge_16_le_32)
+       cmpldi  cr7,r5,64
+       ble     cr7,L(gt_32_le_64)
+
+       /* Align dst to 16 bytes.  */
+       andi.   r9,r3,0xf
+       mr      r12,r3          /* r12 = dst cursor for the len > 64 path.  */
+       beq     cr0,L(dst_is_align_16)
+       lxvd2x  v0,0,r4
+       subfic  r12,r9,16       /* r12 = 16 - (dst & 0xf).  */
+       subf    r5,r12,r5       /* len -= r12.  */
+       add     r4,r4,r12       /* src += r12.  */
+       add     r12,r3,r12      /* r12 = dst rounded up to 16 bytes.  */
+       stxvd2x v0,0,r3         /* Store 16 bytes at the original dst.  */
+L(dst_is_align_16):
+       cmpldi  cr7,r5,127
+       ble     cr7,L(tail_copy)
+       mr      r9,r12
+       srdi    r10,r5,7        /* Loop count = len / 128.  */
+       li      r11,16
+       li      r6,32
+       li      r7,48
+       mtctr   r10
+       clrrdi  r0,r5,7         /* r0 = bytes copied by the main loop.  */
+
+       /* Main loop, copy 128 bytes each time.  */
+       .align 4
+L(copy_128):
+       lxvd2x  v10,0,r4
+       lxvd2x  v11,r4,r11
+       addi    r8,r4,64
+       addi    r10,r9,64
+       lxvd2x  v12,r4,r6
+       lxvd2x  v0,r4,r7
+       addi    r4,r4,128
+       stxvd2x v10,0,r9
+       stxvd2x v11,r9,r11
+       stxvd2x v12,r9,r6
+       stxvd2x v0,r9,r7
+       addi    r9,r9,128
+       lxvd2x  v10,0,r8
+       lxvd2x  v11,r8,r11
+       lxvd2x  v12,r8,r6
+       lxvd2x  v0,r8,r7
+       stxvd2x v10,0,r10
+       stxvd2x v11,r10,r11
+       stxvd2x v12,r10,r6
+       stxvd2x v0,r10,r7
+       bdnz    L(copy_128)
+
+       add     r12,r12,r0      /* Advance the dst cursor past the loop.  */
+       rldicl  r5,r5,0,57      /* Keep the low 7 bits: len %= 128.  */
+L(tail_copy):
+       cmpldi  cr7,r5,63
+       ble     cr7,L(tail_le_64)
+       li      r8,16           /* 64 or more bytes left: copy 64 bytes.  */
+       li      r10,32
+       lxvd2x  v10,0,r4
+       li      r9,48
+       addi    r5,r5,-64
+       lxvd2x  v11,r4,r8
+       lxvd2x  v12,r4,r10
+       lxvd2x  v0,r4,r9
+       addi    r4,r4,64
+       stxvd2x v10,0,r12
+       stxvd2x v11,r12,r8
+       stxvd2x v12,r12,r10
+       stxvd2x v0,r12,9        /* Index operand is register 9 (r9 = 48).  */
+       addi    r12,r12,64
+
+L(tail_le_64):
+       cmpldi  cr7,r5,32
+       bgt     cr7,L(tail_gt_32_le_64)
+       cmpdi   cr7,r5,0
+       beqlr   cr7             /* Nothing left to copy.  */
+       addi    r5,r5,-32       /* Copy the 32 bytes ending at the end.  */
+       li      r9,16
+       add     r8,r4,r5
+       add     r10,r12,r5
+       lxvd2x  v12,r4,r5
+       lxvd2x  v0,r8,r9
+       stxvd2x v12,r12,r5
+       stxvd2x v0,r10,r9
+       blr
+
+       .align 4
+L(ge_16_le_32):
+       addi    r5,r5,-16       /* r5 = offset of the last 16 bytes.  */
+       lxvd2x  v0,0,r4
+       lxvd2x  v1,r4,r5
+       stxvd2x v0,0,r3
+       stxvd2x v1,r3,r5
+       blr
+
+       .align 4
+L(gt_32_le_64):
+       mr      r12,r3          /* The shared tail code uses r12 as dst.  */
+
+       .align 4
+L(tail_gt_32_le_64):
+       li      r9,16
+       lxvd2x  v0,0,r4
+       addi    r5,r5,-32       /* r5 = offset of the last 32 bytes.  */
+       lxvd2x  v1,r4,r9
+       add     r8,r4,r5
+       lxvd2x  v2,r4,r5
+       add     r10,r12,r5
+       lxvd2x  v3,r8,r9
+       stxvd2x v0,0,r12
+       stxvd2x v1,r12,r9
+       stxvd2x v2,r12,r5
+       stxvd2x v3,r10,r9
+       blr
+
+END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)
index 9f4286c4fee302b246adc10e7ecb1a968c947670..fb49fe161f86e0a0c11b1452ab71dd4594b6e960 100644 (file)
@@ -35,18 +35,21 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __memcpy_power7 :
-             (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __memcpy_a2 :
-               (hwcap & PPC_FEATURE_ARCH_2_05)
-               ? __memcpy_power6 :
-                 (hwcap & PPC_FEATURE_CELL_BE)
-                 ? __memcpy_cell :
-                   (hwcap & PPC_FEATURE_POWER4)
-                   ? __memcpy_power4
+           ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
+           ? __memcpy_power8_cached :
+             (hwcap & PPC_FEATURE_HAS_VSX)
+             ? __memcpy_power7 :
+               (hwcap & PPC_FEATURE_ARCH_2_06)
+               ? __memcpy_a2 :
+                 (hwcap & PPC_FEATURE_ARCH_2_05)
+                 ? __memcpy_power6 :
+                   (hwcap & PPC_FEATURE_CELL_BE)
+                   ? __memcpy_cell :
+                     (hwcap & PPC_FEATURE_POWER4)
+                     ? __memcpy_power4
             : __memcpy_ppc);
 
 #undef memcpy