From 1fa5773eb1c38fe23bfe678bd510a792fa257b16 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 15 Sep 2025 18:52:18 -0700 Subject: [PATCH] x86: Don't use asm statement for trunc/truncf Compiler inlines trunc and truncf with SSE4.1. But older versions of GCC don't inline them with -Os: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121861 Don't use asm statement for trunc and truncf if compiler can inline them with -Os. It removes one register move with GCC 16: __modff_sse41: __modff_sse41: .LFB23: .LFB23: .cfi_startproc .cfi_startproc endbr64 endbr64 subq $24, %rsp subq $24, %rsp .cfi_def_cfa_offset 32 .cfi_def_cfa_offset 32 movq %fs:40, %rax movq %fs:40, %rax movq %rax, 8(%rsp) movq %rax, 8(%rsp) xorl %eax, %eax xorl %eax, %eax movd %xmm0, %eax movd %xmm0, %eax addl %eax, %eax addl %eax, %eax cmpl $-16777216, %eax cmpl $-16777216, %eax je .L7 je .L7 > movaps %xmm0, %xmm3 movaps %xmm0, %xmm4 movaps %xmm0, %xmm4 movss .LC0(%rip), %xmm2 | movss .LC0(%rip), %xmm1 movaps %xmm2, %xmm3 | movaps %xmm1, %xmm2 andps %xmm0, %xmm2 | roundss $11, %xmm3, %xmm3 roundss $11, %xmm0, %xmm1 | subss %xmm3, %xmm4 subss %xmm1, %xmm4 | andps %xmm0, %xmm1 andnps %xmm4, %xmm3 | andnps %xmm4, %xmm2 orps %xmm3, %xmm2 | orps %xmm2, %xmm1 .L3: .L3: movss %xmm1, (%rdi) | movss %xmm3, (%rdi) movq 8(%rsp), %rax movq 8(%rsp), %rax subq %fs:40, %rax subq %fs:40, %rax jne .L8 jne .L8 movaps %xmm2, %xmm0 | movaps %xmm1, %xmm0 addq $24, %rsp addq $24, %rsp .cfi_remember_state .cfi_remember_state .cfi_def_cfa_offset 8 .cfi_def_cfa_offset 8 ret ret Signed-off-by: H.J. 
Lu Reviewed-by: Uros Bizjak --- config.h.in | 3 ++ sysdeps/x86/configure | 52 ++++++++++++++++++++++++++++++++++ sysdeps/x86/configure.ac | 31 ++++++++++++++++++++ sysdeps/x86/fpu/math_private.h | 20 ++++++------- 4 files changed, 94 insertions(+), 12 deletions(-) diff --git a/config.h.in b/config.h.in index 8b4077f578..9fb369c640 100644 --- a/config.h.in +++ b/config.h.in @@ -308,4 +308,7 @@ /* Define if -mapxf is enabled by default on x86. */ #undef HAVE_X86_APX +/* Define if trunc is inlined on x86. */ +#undef HAVE_X86_INLINE_TRUNC + #endif diff --git a/sysdeps/x86/configure b/sysdeps/x86/configure index a021cdbcf5..2e95277f29 100644 --- a/sysdeps/x86/configure +++ b/sysdeps/x86/configure @@ -340,6 +340,58 @@ fi config_vars="$config_vars test-cc-cflags-no-direct-extern-access = $libc_cv_test_cc_cflags_no_direct_extern_access" +conftest_code=" +extern float truncf (float __x) __attribute__ ((__nothrow__,__const__)); + +float +tf (float x) +{ + return truncf (x); +} +" + +cat > conftest.c <<EOF +$conftest_code +EOF +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if -Os inlines trunc" >&5 +printf %s "checking if -Os inlines trunc... " >&6; } +if test ${libc_cv_cc_x86_inline_trunc+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -S -Os -msse4.1 conftest.c -o conftest 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } + then + +libc_cv_cc_x86_inline_trunc=no +if grep -E -q "roundss" conftest; then + libc_cv_cc_x86_inline_trunc=yes +fi + + else + +echo "failed to check if -Os inlines trunc." 
+rm -f conftest* +exit 1 + + fi ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_x86_inline_trunc" >&5 +printf "%s\n" "$libc_cv_cc_x86_inline_trunc" >&6; } +rm -f conftest* +if test "$libc_cv_cc_x86_inline_trunc" = yes; then + printf "%s\n" "#define HAVE_X86_INLINE_TRUNC 1" >>confdefs.h + +else + printf "%s\n" "#define HAVE_X86_INLINE_TRUNC 0" >>confdefs.h + +fi + if test "${libc_cv_cc_no_direct_extern_access}${libc_cv_test_cc_cflags_no_direct_extern_access}" = yes; then libc_cv_protected_data=no fi diff --git a/sysdeps/x86/configure.ac b/sysdeps/x86/configure.ac index a87e2f6c41..f3888e6618 100644 --- a/sysdeps/x86/configure.ac +++ b/sysdeps/x86/configure.ac @@ -192,6 +192,37 @@ fi LIBC_CONFIG_VAR(test-cc-cflags-no-direct-extern-access, $libc_cv_test_cc_cflags_no_direct_extern_access) +conftest_code=" +extern float truncf (float __x) __attribute__ ((__nothrow__,__const__)); + +float +tf (float x) +{ + return truncf (x); +} +" +dnl Check if CC inlines trunc with -Os. +LIBC_TRY_CC_COMMAND([if -Os inlines trunc], + [$conftest_code], + [-S -Os -msse4.1], + libc_cv_cc_x86_inline_trunc, + [ +libc_cv_cc_x86_inline_trunc=no +if grep -E -q "roundss" conftest; then + libc_cv_cc_x86_inline_trunc=yes +fi +], +[ +echo "failed to check if -Os inlines trunc." +rm -f conftest* +exit 1 +]) +if test "$libc_cv_cc_x86_inline_trunc" = yes; then + AC_DEFINE(HAVE_X86_INLINE_TRUNC, 1) +else + AC_DEFINE(HAVE_X86_INLINE_TRUNC, 0) +fi + dnl If the building compiler enables no direct external data access by dnl default, access to protected data in shared libraries from executables dnl must be compiled with no direct external data access. 
If the testing diff --git a/sysdeps/x86/fpu/math_private.h b/sysdeps/x86/fpu/math_private.h index d30d580cea..bba085a578 100644 --- a/sysdeps/x86/fpu/math_private.h +++ b/sysdeps/x86/fpu/math_private.h @@ -33,27 +33,23 @@ __NTH (__ieee754_atan2l (long double y, long double x)) __extern_always_inline double __trunc (double x) { -#ifdef __AVX__ - asm ("vroundsd $11, %1, %1, %0" : "=v" (x) : "v" (x)); -#elif defined __SSE4_1__ - asm ("roundsd $11, %1, %0" : "=x" (x) : "x" (x)); +#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__ + return trunc (x); #else - x = trunc (x); -#endif + asm ("%vroundsd $11, %d1, %0" : "=v" (x) : "v" (x)); return x; +#endif } __extern_always_inline float __truncf (float x) { -#ifdef __AVX__ - asm ("vroundss $11, %1, %1, %0" : "=v" (x) : "v" (x)); -#elif defined __SSE4_1__ - asm ("roundss $11, %1, %0" : "=x" (x) : "x" (x)); +#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__ + return truncf (x); #else - x = truncf (x); -#endif + asm ("%vroundss $11, %d1, %0" : "=v" (x) : "v" (x)); return x; +#endif } #endif -- 2.47.3