Compilers inline trunc and truncf with SSE4.1, but older versions of GCC
don't inline them with -Os:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121861
Don't use an asm statement for trunc and truncf if the compiler can inline
them with -Os. Doing so removes one register move with GCC 16:
__modff_sse41: __modff_sse41:
.LFB23: .LFB23:
.cfi_startproc .cfi_startproc
endbr64 endbr64
subq $24, %rsp subq $24, %rsp
.cfi_def_cfa_offset 32 .cfi_def_cfa_offset 32
movq %fs:40, %rax movq %fs:40, %rax
movq %rax, 8(%rsp) movq %rax, 8(%rsp)
xorl %eax, %eax xorl %eax, %eax
movd %xmm0, %eax movd %xmm0, %eax
addl %eax, %eax addl %eax, %eax
cmpl $-16777216, %eax cmpl $-16777216, %eax
je .L7 je .L7
> movaps %xmm0, %xmm3
movaps %xmm0, %xmm4 movaps %xmm0, %xmm4
movss .LC0(%rip), %xmm2 | movss .LC0(%rip), %xmm1
movaps %xmm2, %xmm3 | movaps %xmm1, %xmm2
andps %xmm0, %xmm2 | roundss $11, %xmm3, %xmm3
roundss $11, %xmm0, %xmm1 | subss %xmm3, %xmm4
subss %xmm1, %xmm4 | andps %xmm0, %xmm1
andnps %xmm4, %xmm3 | andnps %xmm4, %xmm2
orps %xmm3, %xmm2 | orps %xmm2, %xmm1
.L3: .L3:
movss %xmm1, (%rdi) | movss %xmm3, (%rdi)
movq 8(%rsp), %rax movq 8(%rsp), %rax
subq %fs:40, %rax subq %fs:40, %rax
jne .L8 jne .L8
movaps %xmm2, %xmm0 | movaps %xmm1, %xmm0
addq $24, %rsp addq $24, %rsp
.cfi_remember_state .cfi_remember_state
.cfi_def_cfa_offset 8 .cfi_def_cfa_offset 8
ret ret
Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Uros Bizjak <ubizjak@gmail.com>
/* Define if -mapxf is enabled by default on x86. */
#undef HAVE_X86_APX
+/* Define if trunc is inlined on x86. */
+#undef HAVE_X86_INLINE_TRUNC
+
#endif
config_vars="$config_vars
test-cc-cflags-no-direct-extern-access = $libc_cv_test_cc_cflags_no_direct_extern_access"
+conftest_code="
+extern float truncf (float __x) __attribute__ ((__nothrow__,__const__));
+
+float
+tf (float x)
+{
+ return truncf (x);
+}
+"
+
+cat > conftest.c <<EOF
+$conftest_code
+EOF
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if -Os inlines trunc" >&5
+printf %s "checking if -Os inlines trunc... " >&6; }
+if test ${libc_cv_cc_x86_inline_trunc+y}
+then :
+ printf %s "(cached) " >&6
+else case e in #(
+ e) if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -S -Os -msse4.1 conftest.c -o conftest 1>&5'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }
+ then
+
+libc_cv_cc_x86_inline_trunc=no
+if grep -E -q "roundss" conftest; then
+ libc_cv_cc_x86_inline_trunc=yes
+fi
+
+ else
+
+echo "failed to check if -Os inlines trunc."
+rm -f conftest*
+exit 1
+
+ fi ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_x86_inline_trunc" >&5
+printf "%s\n" "$libc_cv_cc_x86_inline_trunc" >&6; }
+rm -f conftest*
+if test "$libc_cv_cc_x86_inline_trunc" = yes; then
+ printf "%s\n" "#define HAVE_X86_INLINE_TRUNC 1" >>confdefs.h
+
+else
+ printf "%s\n" "#define HAVE_X86_INLINE_TRUNC 0" >>confdefs.h
+
+fi
+
if test "${libc_cv_cc_no_direct_extern_access}${libc_cv_test_cc_cflags_no_direct_extern_access}" = yes; then
libc_cv_protected_data=no
fi
LIBC_CONFIG_VAR(test-cc-cflags-no-direct-extern-access,
$libc_cv_test_cc_cflags_no_direct_extern_access)
+conftest_code="
+extern float truncf (float __x) __attribute__ ((__nothrow__,__const__));
+
+float
+tf (float x)
+{
+ return truncf (x);
+}
+"
+dnl Check if CC inlines trunc with -Os.
+LIBC_TRY_CC_COMMAND([if -Os inlines trunc],
+ [$conftest_code],
+ [-S -Os -msse4.1],
+ libc_cv_cc_x86_inline_trunc,
+ [
+libc_cv_cc_x86_inline_trunc=no
+if grep -E -q "roundss" conftest; then
+ libc_cv_cc_x86_inline_trunc=yes
+fi
+],
+[
+echo "failed to check if -Os inlines trunc."
+rm -f conftest*
+exit 1
+])
+if test "$libc_cv_cc_x86_inline_trunc" = yes; then
+ AC_DEFINE(HAVE_X86_INLINE_TRUNC, 1)
+else
+ AC_DEFINE(HAVE_X86_INLINE_TRUNC, 0)
+fi
+
dnl If the building compiler enables no direct external data access by
dnl default, access to protected data in shared libraries from executables
dnl must be compiled with no direct external data access. If the testing
__extern_always_inline double
__trunc (double x)
{
-#ifdef __AVX__
- asm ("vroundsd $11, %1, %1, %0" : "=v" (x) : "v" (x));
-#elif defined __SSE4_1__
- asm ("roundsd $11, %1, %0" : "=x" (x) : "x" (x));
+#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__
+ return trunc (x);
#else
- x = trunc (x);
-#endif
+ asm ("%vroundsd $11, %d1, %0" : "=v" (x) : "v" (x));
return x;
+#endif
}
__extern_always_inline float
__truncf (float x)
{
-#ifdef __AVX__
- asm ("vroundss $11, %1, %1, %0" : "=v" (x) : "v" (x));
-#elif defined __SSE4_1__
- asm ("roundss $11, %1, %0" : "=x" (x) : "x" (x));
+#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__
+ return truncf (x);
#else
- x = truncf (x);
-#endif
+ asm ("%vroundss $11, %d1, %0" : "=v" (x) : "v" (x));
return x;
+#endif
}
#endif